]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
log: Add key/value support to the log context
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
ee617a4e 42#include "argv-util.h"
8dd4c05b
LP
43#include "async.h"
44#include "barrier.h"
b1994387 45#include "bpf-lsm.h"
8dd4c05b 46#include "cap-list.h"
430f0182 47#include "capability-util.h"
fdb3deca 48#include "cgroup-setup.h"
f4351959 49#include "chase-symlinks.h"
bb0c0d6f 50#include "chown-recursive.h"
28db6fbf 51#include "constants.h"
da681e1b 52#include "cpu-set-util.h"
43144be4 53#include "creds-util.h"
6a818c3c 54#include "data-fd-util.h"
686d13b9 55#include "env-file.h"
4d1a6904 56#include "env-util.h"
17df7223 57#include "errno-list.h"
8a62620e 58#include "escape.h"
3ffd4af2 59#include "execute.h"
8dd4c05b 60#include "exit-status.h"
3ffd4af2 61#include "fd-util.h"
bb0c0d6f 62#include "fileio.h"
f97b34a6 63#include "format-util.h"
7d50b32a 64#include "glob-util.h"
0389f4fa 65#include "hexdecoct.h"
c004493c 66#include "io-util.h"
032b3afb 67#include "ioprio-util.h"
a1164ae3 68#include "label.h"
8dd4c05b
LP
69#include "log.h"
70#include "macro.h"
e8a565cb 71#include "manager.h"
2a341bb9 72#include "manager-dump.h"
0a970718 73#include "memory-util.h"
f5947a5e 74#include "missing_fs.h"
5bead76e 75#include "missing_ioprio.h"
7a114ed4 76#include "missing_prctl.h"
35cd0ba5 77#include "mkdir-label.h"
21935150 78#include "mount-util.h"
bb0c0d6f 79#include "mountpoint-util.h"
8dd4c05b 80#include "namespace.h"
6bedfcbb 81#include "parse-util.h"
8dd4c05b 82#include "path-util.h"
0b452006 83#include "process-util.h"
6bb00842 84#include "psi-util.h"
d3dcf4e3 85#include "random-util.h"
3989bdc1 86#include "recurse-dir.h"
78f22b97 87#include "rlimit-util.h"
8dd4c05b 88#include "rm-rf.h"
349cc4a5 89#if HAVE_SECCOMP
3ffd4af2
LP
90#include "seccomp-util.h"
91#endif
07d46372 92#include "securebits-util.h"
8dd4c05b 93#include "selinux-util.h"
24882e06 94#include "signal-util.h"
8dd4c05b 95#include "smack-util.h"
57b7a260 96#include "socket-util.h"
a2ab603c 97#include "sort-util.h"
fd63e712 98#include "special.h"
949befd3 99#include "stat-util.h"
8b43440b 100#include "string-table.h"
07630cea 101#include "string-util.h"
8dd4c05b 102#include "strv.h"
7ccbd1ae 103#include "syslog-util.h"
8dd4c05b 104#include "terminal-util.h"
bb0c0d6f 105#include "tmpfile-util.h"
566b7d23 106#include "umask-util.h"
2d3b784d 107#include "unit-serialize.h"
b1d4f8e1 108#include "user-util.h"
8dd4c05b 109#include "utmp-wtmp.h"
5cb5a6ff 110
e056b01d 111#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 112#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 113
531dca78
LP
114#define SNDBUF_SIZE (8*1024*1024)
115
/* Rearranges the passed descriptors so that fds[i] ends up being descriptor number i+3, i.e. the
 * array occupies the contiguous range starting right after stdin/stdout/stderr.
 *
 * The array is modified in place. Returns 0 on success (also when n_fds is 0), or -errno if
 * duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dupfd;

                        /* Nothing to do if this one already sits where it belongs. */
                        if (fds[idx] == target)
                                continue;

                        /* F_DUPFD hands out the lowest free descriptor >= target, which is not
                         * necessarily target itself. */
                        dupfd = fcntl(fds[idx], F_DUPFD, target);
                        if (dupfd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dupfd;

                        /* The slot we wanted was still taken: remember the first such index so
                         * that the next pass starts over from there. */
                        if (dupfd != target && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
155
cd48e23f
RP
/* Adjusts file descriptor flags on the passed array before handing the fds to a child.
 *
 * The first n_socket_fds entries additionally get O_NONBLOCK set or cleared according to 'nonblock';
 * FD_CLOEXEC is dropped from all n_fds entries. Returns 0 on success or the negative error reported
 * by fd_nonblock()/fd_cloexec(). */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int fd = fds[idx], k;

                /* O_NONBLOCK handling is limited to the leading socket fds. */
                if (idx < n_socket_fds) {
                        k = fd_nonblock(fd, nonblock);
                        if (k < 0)
                                return k;
                }

                /* These fds are meant to be inherited, hence FD_CLOEXEC always goes away. */
                k = fd_cloexec(fd, false);
                if (k < 0)
                        return k;
        }

        return 0;
}
191
1e22b5cd 192static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
193 assert(context);
194
1e22b5cd
LP
195 if (context->stdio_as_fds)
196 return NULL;
197
80876c20
LP
198 if (context->tty_path)
199 return context->tty_path;
200
201 return "/dev/console";
202}
203
1e22b5cd
LP
204static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
205 const char *path;
206
6ea832a2
LP
207 assert(context);
208
1e22b5cd 209 path = exec_context_tty_path(context);
6ea832a2 210
1e22b5cd
LP
211 if (context->tty_vhangup) {
212 if (p && p->stdin_fd >= 0)
213 (void) terminal_vhangup_fd(p->stdin_fd);
214 else if (path)
215 (void) terminal_vhangup(path);
216 }
6ea832a2 217
1e22b5cd
LP
218 if (context->tty_reset) {
219 if (p && p->stdin_fd >= 0)
220 (void) reset_terminal_fd(p->stdin_fd, true);
221 else if (path)
222 (void) reset_terminal(path);
223 }
224
51462135
DDM
225 if (p && p->stdin_fd >= 0)
226 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
227
1e22b5cd
LP
228 if (context->tty_vt_disallocate && path)
229 (void) vt_disallocate(path);
6ea832a2
LP
230}
231
6af760f3
LP
232static bool is_terminal_input(ExecInput i) {
233 return IN_SET(i,
234 EXEC_INPUT_TTY,
235 EXEC_INPUT_TTY_FORCE,
236 EXEC_INPUT_TTY_FAIL);
237}
238
3a1286b6 239static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
240 return IN_SET(o,
241 EXEC_OUTPUT_TTY,
6af760f3
LP
242 EXEC_OUTPUT_KMSG_AND_CONSOLE,
243 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
244}
245
aac8c0c3
LP
246static bool is_kmsg_output(ExecOutput o) {
247 return IN_SET(o,
248 EXEC_OUTPUT_KMSG,
249 EXEC_OUTPUT_KMSG_AND_CONSOLE);
250}
251
6af760f3
LP
252static bool exec_context_needs_term(const ExecContext *c) {
253 assert(c);
254
255 /* Return true if the execution context suggests we should set $TERM to something useful. */
256
257 if (is_terminal_input(c->std_input))
258 return true;
259
260 if (is_terminal_output(c->std_output))
261 return true;
262
263 if (is_terminal_output(c->std_error))
264 return true;
265
266 return !!c->tty_path;
3a1286b6
MS
267}
268
/* Opens /dev/null with the given open(2) flags (O_NOCTTY is always added) and hands the resulting fd
 * to move_fd() with 'nfd' as destination. Returns -errno if the open fails, otherwise whatever
 * move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd >= 0)
                return move_fd(null_fd, nfd, false);

        return -errno;
}
280
91dd5f7c
LP
281static int connect_journal_socket(
282 int fd,
283 const char *log_namespace,
284 uid_t uid,
285 gid_t gid) {
286
524daa8c
ZJS
287 uid_t olduid = UID_INVALID;
288 gid_t oldgid = GID_INVALID;
91dd5f7c 289 const char *j;
524daa8c
ZJS
290 int r;
291
91dd5f7c
LP
292 j = log_namespace ?
293 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
294 "/run/systemd/journal/stdout";
91dd5f7c 295
cad93f29 296 if (gid_is_valid(gid)) {
524daa8c
ZJS
297 oldgid = getgid();
298
92a17af9 299 if (setegid(gid) < 0)
524daa8c
ZJS
300 return -errno;
301 }
302
cad93f29 303 if (uid_is_valid(uid)) {
524daa8c
ZJS
304 olduid = getuid();
305
92a17af9 306 if (seteuid(uid) < 0) {
524daa8c
ZJS
307 r = -errno;
308 goto restore_gid;
309 }
310 }
311
1861986a 312 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 313
1861986a
LP
314 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
315 an LSM interferes. */
524daa8c 316
cad93f29 317 if (uid_is_valid(uid))
524daa8c
ZJS
318 (void) seteuid(olduid);
319
320 restore_gid:
cad93f29 321 if (gid_is_valid(gid))
524daa8c
ZJS
322 (void) setegid(oldgid);
323
324 return r;
325}
326
fd1f9c89 327static int connect_logger_as(
34cf6c43 328 const Unit *unit,
fd1f9c89 329 const ExecContext *context,
af635cf3 330 const ExecParameters *params,
fd1f9c89
LP
331 ExecOutput output,
332 const char *ident,
fd1f9c89
LP
333 int nfd,
334 uid_t uid,
335 gid_t gid) {
336
254d1313 337 _cleanup_close_ int fd = -EBADF;
2ac1ff68 338 int r;
071830ff
LP
339
340 assert(context);
af635cf3 341 assert(params);
80876c20
LP
342 assert(output < _EXEC_OUTPUT_MAX);
343 assert(ident);
344 assert(nfd >= 0);
071830ff 345
54fe0cdb
LP
346 fd = socket(AF_UNIX, SOCK_STREAM, 0);
347 if (fd < 0)
80876c20 348 return -errno;
071830ff 349
91dd5f7c 350 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
351 if (r < 0)
352 return r;
071830ff 353
2ac1ff68 354 if (shutdown(fd, SHUT_RD) < 0)
80876c20 355 return -errno;
071830ff 356
fd1f9c89 357 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 358
2ac1ff68 359 if (dprintf(fd,
62bca2c6 360 "%s\n"
80876c20
LP
361 "%s\n"
362 "%i\n"
54fe0cdb
LP
363 "%i\n"
364 "%i\n"
365 "%i\n"
4f4a1dbf 366 "%i\n",
c867611e 367 context->syslog_identifier ?: ident,
af635cf3 368 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
369 context->syslog_priority,
370 !!context->syslog_level_prefix,
f3dc6af2 371 false,
aac8c0c3 372 is_kmsg_output(output),
2ac1ff68
EV
373 is_terminal_output(output)) < 0)
374 return -errno;
80876c20 375
2ac1ff68 376 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 377}
2ac1ff68 378
/* Opens the terminal at 'path' with the given flags (O_NOCTTY is always added) and hands the
 * resulting fd to move_fd() with 'nfd' as destination. Returns the negative value reported by
 * open_terminal() on failure, otherwise whatever move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 391
2038c3f5 392static int acquire_path(const char *path, int flags, mode_t mode) {
254d1313 393 _cleanup_close_ int fd = -EBADF;
86fca584 394 int r;
071830ff 395
80876c20 396 assert(path);
071830ff 397
2038c3f5
LP
398 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
399 flags |= O_CREAT;
400
401 fd = open(path, flags|O_NOCTTY, mode);
402 if (fd >= 0)
15a3e96f 403 return TAKE_FD(fd);
071830ff 404
2038c3f5
LP
405 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
406 return -errno;
2038c3f5
LP
407
408 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
409
410 fd = socket(AF_UNIX, SOCK_STREAM, 0);
411 if (fd < 0)
412 return -errno;
413
1861986a
LP
414 r = connect_unix_path(fd, AT_FDCWD, path);
415 if (IN_SET(r, -ENOTSOCK, -EINVAL))
416 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
417 * wasn't an AF_UNIX socket after all */
418 return -ENXIO;
419 if (r < 0)
420 return r;
071830ff 421
2038c3f5
LP
422 if ((flags & O_ACCMODE) == O_RDONLY)
423 r = shutdown(fd, SHUT_WR);
424 else if ((flags & O_ACCMODE) == O_WRONLY)
425 r = shutdown(fd, SHUT_RD);
426 else
86fca584 427 r = 0;
15a3e96f 428 if (r < 0)
2038c3f5 429 return -errno;
2038c3f5 430
15a3e96f 431 return TAKE_FD(fd);
80876c20 432}
071830ff 433
08f3be7a
LP
434static int fixup_input(
435 const ExecContext *context,
436 int socket_fd,
437 bool apply_tty_stdin) {
438
439 ExecInput std_input;
440
441 assert(context);
442
443 std_input = context->std_input;
1e3ad081
LP
444
445 if (is_terminal_input(std_input) && !apply_tty_stdin)
446 return EXEC_INPUT_NULL;
071830ff 447
03fd9c49 448 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
449 return EXEC_INPUT_NULL;
450
08f3be7a
LP
451 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
452 return EXEC_INPUT_NULL;
453
03fd9c49 454 return std_input;
4f2d528d
LP
455}
456
7966a916 457static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 458
7966a916 459 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
460 return EXEC_OUTPUT_INHERIT;
461
7966a916 462 return output;
4f2d528d
LP
463}
464
a34ceba6
LP
465static int setup_input(
466 const ExecContext *context,
467 const ExecParameters *params,
52c239d7 468 int socket_fd,
2caa38e9 469 const int named_iofds[static 3]) {
a34ceba6 470
4f2d528d 471 ExecInput i;
51462135 472 int r;
4f2d528d
LP
473
474 assert(context);
a34ceba6 475 assert(params);
2caa38e9 476 assert(named_iofds);
a34ceba6
LP
477
478 if (params->stdin_fd >= 0) {
479 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
480 return -errno;
481
482 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
483 if (isatty(STDIN_FILENO)) {
484 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
485 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 486 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 487 }
a34ceba6
LP
488
489 return STDIN_FILENO;
490 }
4f2d528d 491
08f3be7a 492 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
493
494 switch (i) {
071830ff 495
80876c20
LP
496 case EXEC_INPUT_NULL:
497 return open_null_as(O_RDONLY, STDIN_FILENO);
498
499 case EXEC_INPUT_TTY:
500 case EXEC_INPUT_TTY_FORCE:
501 case EXEC_INPUT_TTY_FAIL: {
046a82c1 502 int fd;
071830ff 503
1e22b5cd 504 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
505 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
506 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
507 ACQUIRE_TERMINAL_WAIT,
3a43da28 508 USEC_INFINITY);
970edce6 509 if (fd < 0)
80876c20
LP
510 return fd;
511
51462135
DDM
512 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
513 if (r < 0)
514 return r;
515
046a82c1 516 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
517 }
518
4f2d528d 519 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
520 assert(socket_fd >= 0);
521
7c248223 522 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 523
52c239d7 524 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
525 assert(named_iofds[STDIN_FILENO] >= 0);
526
52c239d7 527 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 528 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 529
08f3be7a
LP
530 case EXEC_INPUT_DATA: {
531 int fd;
532
533 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
534 if (fd < 0)
535 return fd;
536
537 return move_fd(fd, STDIN_FILENO, false);
538 }
539
2038c3f5
LP
540 case EXEC_INPUT_FILE: {
541 bool rw;
542 int fd;
543
544 assert(context->stdio_file[STDIN_FILENO]);
545
546 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
547 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
548
549 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
550 if (fd < 0)
551 return fd;
552
553 return move_fd(fd, STDIN_FILENO, false);
554 }
555
80876c20 556 default:
04499a70 557 assert_not_reached();
80876c20
LP
558 }
559}
560
41fc585a
LP
561static bool can_inherit_stderr_from_stdout(
562 const ExecContext *context,
563 ExecOutput o,
564 ExecOutput e) {
565
566 assert(context);
567
568 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
569 * stderr fd */
570
571 if (e == EXEC_OUTPUT_INHERIT)
572 return true;
573 if (e != o)
574 return false;
575
576 if (e == EXEC_OUTPUT_NAMED_FD)
577 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
578
8d7dab1f 579 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
580 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
581
582 return true;
583}
584
a34ceba6 585static int setup_output(
34cf6c43 586 const Unit *unit,
a34ceba6
LP
587 const ExecContext *context,
588 const ExecParameters *params,
589 int fileno,
590 int socket_fd,
2caa38e9 591 const int named_iofds[static 3],
a34ceba6 592 const char *ident,
7bce046b
LP
593 uid_t uid,
594 gid_t gid,
595 dev_t *journal_stream_dev,
596 ino_t *journal_stream_ino) {
a34ceba6 597
4f2d528d
LP
598 ExecOutput o;
599 ExecInput i;
47c1d80d 600 int r;
4f2d528d 601
f2341e0a 602 assert(unit);
80876c20 603 assert(context);
a34ceba6 604 assert(params);
80876c20 605 assert(ident);
7bce046b
LP
606 assert(journal_stream_dev);
607 assert(journal_stream_ino);
80876c20 608
a34ceba6
LP
609 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
610
611 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
612 return -errno;
613
614 return STDOUT_FILENO;
615 }
616
617 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
618 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
619 return -errno;
620
621 return STDERR_FILENO;
622 }
623
08f3be7a 624 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 625 o = fixup_output(context->std_output, socket_fd);
4f2d528d 626
eb17e935
MS
627 if (fileno == STDERR_FILENO) {
628 ExecOutput e;
629 e = fixup_output(context->std_error, socket_fd);
80876c20 630
eb17e935
MS
631 /* This expects the input and output are already set up */
632
633 /* Don't change the stderr file descriptor if we inherit all
634 * the way and are not on a tty */
635 if (e == EXEC_OUTPUT_INHERIT &&
636 o == EXEC_OUTPUT_INHERIT &&
637 i == EXEC_INPUT_NULL &&
638 !is_terminal_input(context->std_input) &&
7966a916 639 getppid() != 1)
eb17e935
MS
640 return fileno;
641
642 /* Duplicate from stdout if possible */
41fc585a 643 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 644 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 645
eb17e935 646 o = e;
80876c20 647
eb17e935 648 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
649 /* If input got downgraded, inherit the original value */
650 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 651 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 652
08f3be7a
LP
653 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
654 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 655 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 656
acb591e4
LP
657 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
658 if (getppid() != 1)
eb17e935 659 return fileno;
94f04347 660
eb17e935
MS
661 /* We need to open /dev/null here anew, to get the right access mode. */
662 return open_null_as(O_WRONLY, fileno);
071830ff 663 }
94f04347 664
eb17e935 665 switch (o) {
80876c20
LP
666
667 case EXEC_OUTPUT_NULL:
eb17e935 668 return open_null_as(O_WRONLY, fileno);
80876c20
LP
669
670 case EXEC_OUTPUT_TTY:
4f2d528d 671 if (is_terminal_input(i))
7c248223 672 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
673
674 /* We don't reset the terminal if this is just about output */
1e22b5cd 675 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 676
9a6bca7a 677 case EXEC_OUTPUT_KMSG:
28dbc1e8 678 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
679 case EXEC_OUTPUT_JOURNAL:
680 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 681 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 682 if (r < 0) {
7966a916
ZJS
683 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
684 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 685 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
686 } else {
687 struct stat st;
688
689 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
690 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
691 * services to detect whether they are connected to the journal or not.
692 *
693 * If both stdout and stderr are connected to a stream then let's make sure to store the data
694 * about STDERR as that's usually the best way to do logging. */
7bce046b 695
ab2116b1
LP
696 if (fstat(fileno, &st) >= 0 &&
697 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
698 *journal_stream_dev = st.st_dev;
699 *journal_stream_ino = st.st_ino;
700 }
47c1d80d
MS
701 }
702 return r;
4f2d528d
LP
703
704 case EXEC_OUTPUT_SOCKET:
705 assert(socket_fd >= 0);
e75a9ed1 706
7c248223 707 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 708
52c239d7 709 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
710 assert(named_iofds[fileno] >= 0);
711
52c239d7 712 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 713 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 714
566b7d23 715 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
716 case EXEC_OUTPUT_FILE_APPEND:
717 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 718 bool rw;
566b7d23 719 int fd, flags;
2038c3f5
LP
720
721 assert(context->stdio_file[fileno]);
722
723 rw = context->std_input == EXEC_INPUT_FILE &&
724 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
725
726 if (rw)
7c248223 727 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 728
566b7d23
ZD
729 flags = O_WRONLY;
730 if (o == EXEC_OUTPUT_FILE_APPEND)
731 flags |= O_APPEND;
8d7dab1f
LW
732 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
733 flags |= O_TRUNC;
566b7d23
ZD
734
735 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
736 if (fd < 0)
737 return fd;
738
566b7d23 739 return move_fd(fd, fileno, 0);
2038c3f5
LP
740 }
741
94f04347 742 default:
04499a70 743 assert_not_reached();
94f04347 744 }
071830ff
LP
745}
746
02a51aba 747static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 748 int r;
02a51aba
LP
749
750 assert(fd >= 0);
02a51aba 751
1ff74fb6 752 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
753 if (isatty(fd) < 1) {
754 if (IN_SET(errno, EINVAL, ENOTTY))
755 return 0; /* not a tty */
1ff74fb6 756
02a51aba 757 return -errno;
4b3b5bc7 758 }
02a51aba 759
4b3b5bc7 760 /* This might fail. What matters are the results. */
f2df231f 761 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
762 if (r < 0)
763 return r;
02a51aba 764
4b3b5bc7 765 return 1;
02a51aba
LP
766}
767
aedec452 768static int setup_confirm_stdio(
51462135 769 const ExecContext *context,
aedec452
LP
770 const char *vc,
771 int *ret_saved_stdin,
772 int *ret_saved_stdout) {
773
254d1313 774 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
3d18b167 775 int r;
80876c20 776
aedec452
LP
777 assert(ret_saved_stdin);
778 assert(ret_saved_stdout);
80876c20 779
af6da548
LP
780 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
781 if (saved_stdin < 0)
782 return -errno;
80876c20 783
af6da548 784 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
785 if (saved_stdout < 0)
786 return -errno;
80876c20 787
8854d795 788 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
789 if (fd < 0)
790 return fd;
80876c20 791
af6da548
LP
792 r = chown_terminal(fd, getuid());
793 if (r < 0)
3d18b167 794 return r;
02a51aba 795
3d18b167
LP
796 r = reset_terminal_fd(fd, true);
797 if (r < 0)
798 return r;
80876c20 799
51462135
DDM
800 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
801 if (r < 0)
802 return r;
803
aedec452
LP
804 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
805 TAKE_FD(fd);
2b33ab09
LP
806 if (r < 0)
807 return r;
80876c20 808
aedec452
LP
809 *ret_saved_stdin = TAKE_FD(saved_stdin);
810 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 811 return 0;
80876c20
LP
812}
813
63d77c92 814static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
815 assert(err < 0);
816
817 if (err == -ETIMEDOUT)
63d77c92 818 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
819 else {
820 errno = -err;
63d77c92 821 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
822 }
823}
824
63d77c92 825static void write_confirm_error(int err, const char *vc, const Unit *u) {
254d1313 826 _cleanup_close_ int fd = -EBADF;
80876c20 827
3b20f877 828 assert(vc);
80876c20 829
7d5ceb64 830 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 831 if (fd < 0)
3b20f877 832 return;
80876c20 833
63d77c92 834 write_confirm_error_fd(err, fd, u);
af6da548 835}
80876c20 836
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout copies back in
 * place (where valid) and closes the copies, resetting *saved_stdin and *saved_stdout to the value
 * safe_close() returns.
 *
 * Returns 0 on success, or -errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
858
3b20f877
FB
/* Possible outcomes of ask_for_confirmation(). */
enum {
        /* Do not run the command, and treat it as if it had failed. */
        CONFIRM_PRETEND_FAILURE = -1,

        /* Do not run the command, and treat it as if it had succeeded. */
        CONFIRM_PRETEND_SUCCESS = 0,

        /* Go ahead and run the command. */
        CONFIRM_EXECUTE         = 1,
};
864
51462135 865static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 866 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 867 _cleanup_free_ char *e = NULL;
3b20f877 868 char c;
af6da548 869
3b20f877 870 /* For any internal errors, assume a positive response. */
51462135 871 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 872 if (r < 0) {
63d77c92 873 write_confirm_error(r, vc, u);
3b20f877
FB
874 return CONFIRM_EXECUTE;
875 }
af6da548 876
b0eb2944
FB
877 /* confirm_spawn might have been disabled while we were sleeping. */
878 if (manager_is_confirm_spawn_disabled(u->manager)) {
879 r = 1;
880 goto restore_stdio;
881 }
af6da548 882
2bcd3c26
FB
883 e = ellipsize(cmdline, 60, 100);
884 if (!e) {
885 log_oom();
886 r = CONFIRM_EXECUTE;
887 goto restore_stdio;
888 }
af6da548 889
d172b175 890 for (;;) {
539622bd 891 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 892 if (r < 0) {
63d77c92 893 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
894 r = CONFIRM_EXECUTE;
895 goto restore_stdio;
896 }
af6da548 897
d172b175 898 switch (c) {
b0eb2944
FB
899 case 'c':
900 printf("Resuming normal execution.\n");
901 manager_disable_confirm_spawn();
902 r = 1;
903 break;
dd6f9ac0
FB
904 case 'D':
905 unit_dump(u, stdout, " ");
906 continue; /* ask again */
d172b175
FB
907 case 'f':
908 printf("Failing execution.\n");
909 r = CONFIRM_PRETEND_FAILURE;
910 break;
911 case 'h':
b0eb2944
FB
912 printf(" c - continue, proceed without asking anymore\n"
913 " D - dump, show the state of the unit\n"
dd6f9ac0 914 " f - fail, don't execute the command and pretend it failed\n"
d172b175 915 " h - help\n"
eedf223a 916 " i - info, show a short summary of the unit\n"
56fde33a 917 " j - jobs, show jobs that are in progress\n"
d172b175
FB
918 " s - skip, don't execute the command and pretend it succeeded\n"
919 " y - yes, execute the command\n");
dd6f9ac0 920 continue; /* ask again */
eedf223a
FB
921 case 'i':
922 printf(" Description: %s\n"
923 " Unit: %s\n"
924 " Command: %s\n",
925 u->id, u->description, cmdline);
926 continue; /* ask again */
56fde33a 927 case 'j':
d1d8786c 928 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
56fde33a 929 continue; /* ask again */
539622bd
FB
930 case 'n':
931 /* 'n' was removed in favor of 'f'. */
932 printf("Didn't understand 'n', did you mean 'f'?\n");
933 continue; /* ask again */
d172b175
FB
934 case 's':
935 printf("Skipping execution.\n");
936 r = CONFIRM_PRETEND_SUCCESS;
937 break;
938 case 'y':
939 r = CONFIRM_EXECUTE;
940 break;
941 default:
04499a70 942 assert_not_reached();
d172b175 943 }
3b20f877 944 break;
3b20f877 945 }
af6da548 946
3b20f877 947restore_stdio:
af6da548 948 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 949 return r;
80876c20
LP
950}
951
4d885bd3
DH
952static int get_fixed_user(const ExecContext *c, const char **user,
953 uid_t *uid, gid_t *gid,
954 const char **home, const char **shell) {
81a2b7ce 955 int r;
4d885bd3 956 const char *name;
81a2b7ce 957
4d885bd3 958 assert(c);
81a2b7ce 959
23deef88
LP
960 if (!c->user)
961 return 0;
962
4d885bd3
DH
963 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
964 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 965
23deef88 966 name = c->user;
fafff8f1 967 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
968 if (r < 0)
969 return r;
81a2b7ce 970
4d885bd3
DH
971 *user = name;
972 return 0;
973}
974
975static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
976 int r;
977 const char *name;
978
979 assert(c);
980
981 if (!c->group)
982 return 0;
983
984 name = c->group;
fafff8f1 985 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
986 if (r < 0)
987 return r;
988
989 *group = name;
990 return 0;
991}
992
cdc5d5c5
DH
993static int get_supplementary_groups(const ExecContext *c, const char *user,
994 const char *group, gid_t gid,
995 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
996 int r, k = 0;
997 int ngroups_max;
998 bool keep_groups = false;
999 gid_t *groups = NULL;
1000 _cleanup_free_ gid_t *l_gids = NULL;
1001
1002 assert(c);
1003
bbeea271
DH
1004 /*
1005 * If user is given, then lookup GID and supplementary groups list.
1006 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1007 * here and as early as possible so we keep the list of supplementary
1008 * groups of the caller.
bbeea271
DH
1009 */
1010 if (user && gid_is_valid(gid) && gid != 0) {
1011 /* First step, initialize groups from /etc/groups */
1012 if (initgroups(user, gid) < 0)
1013 return -errno;
1014
1015 keep_groups = true;
1016 }
1017
ac6e8be6 1018 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1019 return 0;
1020
366ddd25
DH
1021 /*
1022 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1023 * be positive, otherwise fail.
1024 */
1025 errno = 0;
1026 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1027 if (ngroups_max <= 0)
1028 return errno_or_else(EOPNOTSUPP);
366ddd25 1029
4d885bd3
DH
1030 l_gids = new(gid_t, ngroups_max);
1031 if (!l_gids)
1032 return -ENOMEM;
81a2b7ce 1033
4d885bd3
DH
1034 if (keep_groups) {
1035 /*
1036 * Lookup the list of groups that the user belongs to, we
1037 * avoid NSS lookups here too for gid=0.
1038 */
1039 k = ngroups_max;
1040 if (getgrouplist(user, gid, l_gids, &k) < 0)
1041 return -EINVAL;
1042 } else
1043 k = 0;
81a2b7ce 1044
4d885bd3
DH
1045 STRV_FOREACH(i, c->supplementary_groups) {
1046 const char *g;
81a2b7ce 1047
4d885bd3
DH
1048 if (k >= ngroups_max)
1049 return -E2BIG;
81a2b7ce 1050
4d885bd3 1051 g = *i;
fafff8f1 1052 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1053 if (r < 0)
1054 return r;
81a2b7ce 1055
4d885bd3
DH
1056 k++;
1057 }
81a2b7ce 1058
4d885bd3
DH
1059 /*
1060 * Sets ngids to zero to drop all supplementary groups, happens
1061 * when we are under root and SupplementaryGroups= is empty.
1062 */
1063 if (k == 0) {
1064 *ngids = 0;
1065 return 0;
1066 }
81a2b7ce 1067
4d885bd3
DH
1068 /* Otherwise get the final list of supplementary groups */
1069 groups = memdup(l_gids, sizeof(gid_t) * k);
1070 if (!groups)
1071 return -ENOMEM;
1072
1073 *supplementary_gids = groups;
1074 *ngids = k;
1075
1076 groups = NULL;
1077
1078 return 0;
1079}
1080
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * ngids > 0), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= is only handled if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then switch all three of our GIDs */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1099
a954b249
LP
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written then), 1 if they
 * were changed, and a negative errno-style error if prctl() failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int have;

        have = prctl(PR_GET_SECUREBITS);
        if (have < 0)
                return -errno;

        /* Drop everything listed in the mask, then add the requested bits */
        wanted = (~mask & (unsigned) have) | bits;

        if (wanted != (unsigned) have) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1118
638fd8cc
LP
1119static int enforce_user(
1120 const ExecContext *context,
1121 uid_t uid,
1122 uint64_t capability_ambient_set) {
81a2b7ce 1123 assert(context);
dbdc4098 1124 int r;
81a2b7ce 1125
4d885bd3
DH
1126 if (!uid_is_valid(uid))
1127 return 0;
1128
a954b249
LP
1129 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1130 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1131 * case. */
81a2b7ce 1132
638fd8cc 1133 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
81a2b7ce 1134
a954b249
LP
1135 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1136 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1137 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1138 if (r < 0)
1139 return r;
81a2b7ce
LP
1140 }
1141
479050b3 1142 /* Second step: actually set the uids */
81a2b7ce
LP
1143 if (setresuid(uid, uid, uid) < 0)
1144 return -errno;
1145
a954b249
LP
1146 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1147 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1148 * outside of this call. */
81a2b7ce
LP
1149 return 0;
1150}
1151
349cc4a5 1152#if HAVE_PAM
5b6319dc
LP
1153
1154static int null_conv(
1155 int num_msg,
1156 const struct pam_message **msg,
1157 struct pam_response **resp,
1158 void *appdata_ptr) {
1159
1160 /* We don't support conversations */
1161
1162 return PAM_CONV_ERR;
1163}
1164
cefc33ae
LP
1165#endif
1166
5b6319dc
LP
/* Opens a PAM session for the specified service name and user, and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process is gone.
 *
 * name   - PAM service name
 * user   - user to open the session for
 * uid    - UID the helper process drops privileges to
 * gid    - GID the helper process drops privileges to
 * tty    - TTY to tell PAM about; if NULL the TTY is derived from stdin, if stdin is one
 * env    - environment block passed to PAM; replaced by PAM's environment list on success
 * fds    - file descriptors the helper process shall close
 * n_fds  - number of entries in fds[]
 *
 * Returns the result of strv_free_and_replace() on success and a negative errno-style error on
 * failure (-EPERM for PAM errors). If built without PAM support this is a NOP that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Unlike most system calls sigwait() reports failure by returning a
                                 * positive error number, it neither returns a negative value nor sets
                                 * errno. Hence check the return value itself, so that 'sig' is only
                                 * looked at if it was actually filled in. */
                                int k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1383
5d6b1584 1384static void rename_process_from_path(const char *path) {
a99626c1 1385 _cleanup_free_ char *buf = NULL;
5d6b1584 1386 const char *p;
5d6b1584 1387
a99626c1
LP
1388 assert(path);
1389
1390 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1391 * /bin/ps */
5d6b1584 1392
a99626c1 1393 if (path_extract_filename(path, &buf) < 0) {
5d6b1584
LP
1394 rename_process("(...)");
1395 return;
1396 }
1397
a99626c1 1398 size_t l = strlen(buf);
5d6b1584 1399 if (l > 8) {
a99626c1 1400 /* The end of the process name is usually more interesting, since the first bit might just be
5d6b1584 1401 * "systemd-" */
a99626c1 1402 p = buf + l - 8;
5d6b1584 1403 l = 8;
a99626c1
LP
1404 } else
1405 p = buf;
5d6b1584 1406
a99626c1 1407 char process_name[11];
5d6b1584
LP
1408 process_name[0] = '(';
1409 memcpy(process_name+1, p, l);
1410 process_name[1+l] = ')';
1411 process_name[1+l+1] = 0;
1412
1413 rename_process(process_name);
1414}
1415
469830d1
LP
1416static bool context_has_address_families(const ExecContext *c) {
1417 assert(c);
1418
6b000af4 1419 return c->address_families_allow_list ||
469830d1
LP
1420 !set_isempty(c->address_families);
1421}
1422
1423static bool context_has_syscall_filters(const ExecContext *c) {
1424 assert(c);
1425
6b000af4 1426 return c->syscall_allow_list ||
8cfa775f 1427 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1428}
1429
9df2cdd8
TM
1430static bool context_has_syscall_logs(const ExecContext *c) {
1431 assert(c);
1432
1433 return c->syscall_log_allow_list ||
1434 !hashmap_isempty(c->syscall_log);
1435}
1436
469830d1
LP
1437static bool context_has_no_new_privileges(const ExecContext *c) {
1438 assert(c);
1439
1440 if (c->no_new_privileges)
1441 return true;
1442
26c45a6c 1443 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
469830d1
LP
1444 return false;
1445
1446 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1447 return c->lock_personality ||
469830d1 1448 c->memory_deny_write_execute ||
0538d2a8 1449 c->private_devices ||
fc64760d 1450 c->protect_clock ||
0538d2a8 1451 c->protect_hostname ||
469830d1
LP
1452 c->protect_kernel_tunables ||
1453 c->protect_kernel_modules ||
84703040 1454 c->protect_kernel_logs ||
0538d2a8
YW
1455 context_has_address_families(c) ||
1456 exec_context_restrict_namespaces_set(c) ||
1457 c->restrict_realtime ||
1458 c->restrict_suid_sgid ||
78e864e5 1459 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1460 context_has_syscall_filters(c) ||
1461 context_has_syscall_logs(c);
469830d1
LP
1462}
1463
bb0c0d6f
LP
1464static bool exec_context_has_credentials(const ExecContext *context) {
1465
1466 assert(context);
1467
1468 return !hashmap_isempty(context->set_credentials) ||
43144be4 1469 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1470}
1471
349cc4a5 1472#if HAVE_SECCOMP
17df7223 1473
83f12b27 1474static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1475
1476 if (is_seccomp_available())
1477 return false;
1478
f673b62d 1479 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1480 return true;
83f12b27
FS
1481}
1482
165a31c0 1483static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1484 uint32_t negative_action, default_action, action;
165a31c0 1485 int r;
8351ceae 1486
469830d1 1487 assert(u);
c0467cf3 1488 assert(c);
8351ceae 1489
469830d1 1490 if (!context_has_syscall_filters(c))
83f12b27
FS
1491 return 0;
1492
469830d1
LP
1493 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1494 return 0;
e9642be2 1495
005bfaf1 1496 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1497
6b000af4 1498 if (c->syscall_allow_list) {
469830d1
LP
1499 default_action = negative_action;
1500 action = SCMP_ACT_ALLOW;
7c66bae2 1501 } else {
469830d1
LP
1502 default_action = SCMP_ACT_ALLOW;
1503 action = negative_action;
57183d11 1504 }
8351ceae 1505
165a31c0 1506 if (needs_ambient_hack) {
6b000af4 1507 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1508 if (r < 0)
1509 return r;
1510 }
1511
b54f36c6 1512 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1513}
1514
9df2cdd8
TM
1515static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1516#ifdef SCMP_ACT_LOG
1517 uint32_t default_action, action;
1518#endif
1519
1520 assert(u);
1521 assert(c);
1522
1523 if (!context_has_syscall_logs(c))
1524 return 0;
1525
1526#ifdef SCMP_ACT_LOG
1527 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1528 return 0;
1529
1530 if (c->syscall_log_allow_list) {
1531 /* Log nothing but the ones listed */
1532 default_action = SCMP_ACT_ALLOW;
1533 action = SCMP_ACT_LOG;
1534 } else {
1535 /* Log everything but the ones listed */
1536 default_action = SCMP_ACT_LOG;
1537 action = SCMP_ACT_ALLOW;
1538 }
1539
1540 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1541#else
1542 /* old libseccomp */
1543 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1544 return 0;
1545#endif
1546}
1547
469830d1
LP
1548static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1549 assert(u);
4298d0b5
LP
1550 assert(c);
1551
469830d1 1552 if (set_isempty(c->syscall_archs))
83f12b27
FS
1553 return 0;
1554
469830d1
LP
1555 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1556 return 0;
4298d0b5 1557
469830d1
LP
1558 return seccomp_restrict_archs(c->syscall_archs);
1559}
4298d0b5 1560
469830d1
LP
1561static int apply_address_families(const Unit* u, const ExecContext *c) {
1562 assert(u);
1563 assert(c);
4298d0b5 1564
469830d1
LP
1565 if (!context_has_address_families(c))
1566 return 0;
4298d0b5 1567
469830d1
LP
1568 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1569 return 0;
4298d0b5 1570
6b000af4 1571 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1572}
4298d0b5 1573
83f12b27 1574static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
7a114ed4
TM
1575 int r;
1576
469830d1 1577 assert(u);
f3e43635
TM
1578 assert(c);
1579
469830d1 1580 if (!c->memory_deny_write_execute)
83f12b27
FS
1581 return 0;
1582
7a114ed4
TM
1583 /* use prctl() if kernel supports it (6.3) */
1584 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1585 if (r == 0) {
1586 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1587 return 0;
1588 }
1589 if (r < 0 && errno != EINVAL)
1590 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1591 /* else use seccomp */
1592 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1593
469830d1
LP
1594 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1595 return 0;
f3e43635 1596
469830d1 1597 return seccomp_memory_deny_write_execute();
f3e43635
TM
1598}
1599
83f12b27 1600static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1601 assert(u);
f4170c67
LP
1602 assert(c);
1603
469830d1 1604 if (!c->restrict_realtime)
83f12b27
FS
1605 return 0;
1606
469830d1
LP
1607 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1608 return 0;
f4170c67 1609
469830d1 1610 return seccomp_restrict_realtime();
f4170c67
LP
1611}
1612
f69567cb
LP
1613static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1614 assert(u);
1615 assert(c);
1616
1617 if (!c->restrict_suid_sgid)
1618 return 0;
1619
1620 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1621 return 0;
1622
1623 return seccomp_restrict_suid_sgid();
1624}
1625
59e856c7 1626static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1627 assert(u);
59eeb84b
LP
1628 assert(c);
1629
1630 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1631 * let's protect even those systems where this is left on in the kernel. */
1632
469830d1 1633 if (!c->protect_kernel_tunables)
59eeb84b
LP
1634 return 0;
1635
469830d1
LP
1636 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1637 return 0;
59eeb84b 1638
469830d1 1639 return seccomp_protect_sysctl();
59eeb84b
LP
1640}
1641
59e856c7 1642static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1643 assert(u);
502d704e
DH
1644 assert(c);
1645
25a8d8a0 1646 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1647
469830d1
LP
1648 if (!c->protect_kernel_modules)
1649 return 0;
1650
502d704e
DH
1651 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1652 return 0;
1653
b54f36c6 1654 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1655}
1656
84703040
KK
1657static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1658 assert(u);
1659 assert(c);
1660
1661 if (!c->protect_kernel_logs)
1662 return 0;
1663
1664 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1665 return 0;
1666
1667 return seccomp_protect_syslog();
1668}
1669
daf8f72b 1670static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1671 assert(u);
1672 assert(c);
1673
1674 if (!c->protect_clock)
1675 return 0;
1676
1677 if (skip_seccomp_unavailable(u, "ProtectClock="))
1678 return 0;
1679
1680 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1681}
1682
59e856c7 1683static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1684 assert(u);
ba128bb8
LP
1685 assert(c);
1686
8f81a5f6 1687 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1688
469830d1
LP
1689 if (!c->private_devices)
1690 return 0;
1691
ba128bb8
LP
1692 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1693 return 0;
1694
b54f36c6 1695 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1696}
1697
34cf6c43 1698static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1699 assert(u);
add00535
LP
1700 assert(c);
1701
1702 if (!exec_context_restrict_namespaces_set(c))
1703 return 0;
1704
1705 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1706 return 0;
1707
1708 return seccomp_restrict_namespaces(c->restrict_namespaces);
1709}
1710
78e864e5 1711static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1712 unsigned long personality;
1713 int r;
78e864e5
TM
1714
1715 assert(u);
1716 assert(c);
1717
1718 if (!c->lock_personality)
1719 return 0;
1720
1721 if (skip_seccomp_unavailable(u, "LockPersonality="))
1722 return 0;
1723
e8132d63
LP
1724 personality = c->personality;
1725
1726 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1727 if (personality == PERSONALITY_INVALID) {
1728
1729 r = opinionated_personality(&personality);
1730 if (r < 0)
1731 return r;
1732 }
78e864e5
TM
1733
1734 return seccomp_lock_personality(personality);
1735}
1736
c0467cf3 1737#endif
8351ceae 1738
7a8288f6 1739#if HAVE_LIBBPF
7a8288f6
DM
1740static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1741 assert(u);
1742 assert(c);
1743
1744 if (!exec_context_restrict_filesystems_set(c))
1745 return 0;
1746
46004616
ZJS
1747 if (!u->manager->restrict_fs) {
1748 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1749 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1750 return 0;
46004616 1751 }
7a8288f6
DM
1752
1753 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1754}
1755#endif
1756
daf8f72b 1757static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1758 assert(u);
1759 assert(c);
1760
1761 if (!c->protect_hostname)
1762 return 0;
1763
1764 if (ns_type_supported(NAMESPACE_UTS)) {
1765 if (unshare(CLONE_NEWUTS) < 0) {
1766 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1767 *ret_exit_status = EXIT_NAMESPACE;
1768 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1769 }
1770
1771 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1772 }
1773 } else
1774 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1775
1776#if HAVE_SECCOMP
8f3e342f
ZJS
1777 int r;
1778
daf8f72b
LP
1779 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1780 return 0;
1781
1782 r = seccomp_protect_hostname();
1783 if (r < 0) {
1784 *ret_exit_status = EXIT_SECCOMP;
1785 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1786 }
1787#endif
1788
1789 return 0;
1790}
1791
3042bbeb 1792static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1793 assert(idle_pipe);
1794
54eb2300
LP
1795 idle_pipe[1] = safe_close(idle_pipe[1]);
1796 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1797
1798 if (idle_pipe[0] >= 0) {
1799 int r;
1800
1801 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1802
1803 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1804 ssize_t n;
1805
31a7eb86 1806 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1807 n = write(idle_pipe[3], "x", 1);
1808 if (n > 0)
cd972d69 1809 /* Wait for systemd to react to the signal above. */
54756dce 1810 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1811 }
1812
54eb2300 1813 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1814
1815 }
1816
54eb2300 1817 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1818}
1819
fb2042dd
YW
1820static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1821
7cae38c4 1822static int build_environment(
34cf6c43 1823 const Unit *u,
9fa95f85 1824 const ExecContext *c,
1e22b5cd 1825 const ExecParameters *p,
6bb00842 1826 const CGroupContext *cgroup_context,
da6053d0 1827 size_t n_fds,
cd48e23f 1828 char **fdnames,
7cae38c4
LP
1829 const char *home,
1830 const char *username,
1831 const char *shell,
7bce046b
LP
1832 dev_t journal_stream_dev,
1833 ino_t journal_stream_ino,
6bb00842 1834 const char *memory_pressure_path,
7cae38c4
LP
1835 char ***ret) {
1836
1837 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1838 size_t n_env = 0;
7cae38c4
LP
1839 char *x;
1840
4b58153d 1841 assert(u);
7cae38c4 1842 assert(c);
7c1cb6f1 1843 assert(p);
7cae38c4
LP
1844 assert(ret);
1845
6bb00842 1846#define N_ENV_VARS 19
8d5bb13d 1847 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1848 if (!our_env)
1849 return -ENOMEM;
1850
1851 if (n_fds > 0) {
8dd4c05b
LP
1852 _cleanup_free_ char *joined = NULL;
1853
df0ff127 1854 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1855 return -ENOMEM;
1856 our_env[n_env++] = x;
1857
da6053d0 1858 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1859 return -ENOMEM;
1860 our_env[n_env++] = x;
8dd4c05b 1861
cd48e23f 1862 joined = strv_join(fdnames, ":");
8dd4c05b
LP
1863 if (!joined)
1864 return -ENOMEM;
1865
605405c6 1866 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1867 if (!x)
1868 return -ENOMEM;
1869 our_env[n_env++] = x;
7cae38c4
LP
1870 }
1871
b08af3b1 1872 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1873 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1874 return -ENOMEM;
1875 our_env[n_env++] = x;
1876
1e22b5cd 1877 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1878 return -ENOMEM;
1879 our_env[n_env++] = x;
1880 }
1881
de90700f
LP
1882 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1883 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1884 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1885 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1886 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1887 if (!x)
1888 return -ENOMEM;
1889 our_env[n_env++] = x;
1890 }
1891
7cae38c4 1892 if (home) {
b910cc72 1893 x = strjoin("HOME=", home);
7cae38c4
LP
1894 if (!x)
1895 return -ENOMEM;
7bbead1d 1896
4ff361cc 1897 path_simplify(x + 5);
7cae38c4
LP
1898 our_env[n_env++] = x;
1899 }
1900
1901 if (username) {
b910cc72 1902 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1903 if (!x)
1904 return -ENOMEM;
1905 our_env[n_env++] = x;
1906
b910cc72 1907 x = strjoin("USER=", username);
7cae38c4
LP
1908 if (!x)
1909 return -ENOMEM;
1910 our_env[n_env++] = x;
1911 }
1912
1913 if (shell) {
b910cc72 1914 x = strjoin("SHELL=", shell);
7cae38c4
LP
1915 if (!x)
1916 return -ENOMEM;
7bbead1d 1917
4ff361cc 1918 path_simplify(x + 6);
7cae38c4
LP
1919 our_env[n_env++] = x;
1920 }
1921
4b58153d
LP
1922 if (!sd_id128_is_null(u->invocation_id)) {
1923 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1924 return -ENOMEM;
1925
1926 our_env[n_env++] = x;
1927 }
1928
6af760f3
LP
1929 if (exec_context_needs_term(c)) {
1930 const char *tty_path, *term = NULL;
1931
1932 tty_path = exec_context_tty_path(c);
1933
e8cf09b2
LP
1934 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1935 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1936 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1937
e8cf09b2 1938 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1939 term = getenv("TERM");
e8cf09b2 1940
6af760f3
LP
1941 if (!term)
1942 term = default_term_for_tty(tty_path);
7cae38c4 1943
b910cc72 1944 x = strjoin("TERM=", term);
7cae38c4
LP
1945 if (!x)
1946 return -ENOMEM;
1947 our_env[n_env++] = x;
1948 }
1949
7bce046b
LP
1950 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1951 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1952 return -ENOMEM;
1953
1954 our_env[n_env++] = x;
1955 }
1956
91dd5f7c
LP
1957 if (c->log_namespace) {
1958 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1959 if (!x)
1960 return -ENOMEM;
1961
1962 our_env[n_env++] = x;
1963 }
1964
5b10116e 1965 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1966 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1967 const char *n;
1968
1969 if (!p->prefix[t])
1970 continue;
1971
211a3d87 1972 if (c->directories[t].n_items == 0)
fb2042dd
YW
1973 continue;
1974
1975 n = exec_directory_env_name_to_string(t);
1976 if (!n)
1977 continue;
1978
211a3d87
LB
1979 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1980 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1981
211a3d87
LB
1982 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1983 if (!prefixed)
1984 return -ENOMEM;
1985
1986 if (!strextend_with_separator(&joined, ":", prefixed))
1987 return -ENOMEM;
1988 }
fb2042dd
YW
1989
1990 x = strjoin(n, "=", joined);
1991 if (!x)
1992 return -ENOMEM;
1993
1994 our_env[n_env++] = x;
1995 }
1996
bb0c0d6f
LP
1997 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1998 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1999 if (!x)
2000 return -ENOMEM;
2001
2002 our_env[n_env++] = x;
2003 }
2004
dc4e2940
YW
2005 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2006 return -ENOMEM;
2007
2008 our_env[n_env++] = x;
2009
6bb00842
LP
2010 if (memory_pressure_path) {
2011 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2012 if (!x)
2013 return -ENOMEM;
2014
2015 our_env[n_env++] = x;
2016
2017 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2018 _cleanup_free_ char *b = NULL, *e = NULL;
2019
2020 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2021 MEMORY_PRESSURE_DEFAULT_TYPE,
2022 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2023 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2024 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2025 return -ENOMEM;
2026
2027 if (base64mem(b, strlen(b) + 1, &e) < 0)
2028 return -ENOMEM;
2029
2030 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2031 if (!x)
2032 return -ENOMEM;
2033
2034 our_env[n_env++] = x;
2035 }
2036 }
2037
2038 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
8d5bb13d 2039#undef N_ENV_VARS
7cae38c4 2040
ae2a15bc 2041 *ret = TAKE_PTR(our_env);
7cae38c4
LP
2042
2043 return 0;
2044}
2045
b4c14404
FB
2046static int build_pass_environment(const ExecContext *c, char ***ret) {
2047 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2048 size_t n_env = 0;
b4c14404
FB
2049
2050 STRV_FOREACH(i, c->pass_environment) {
2051 _cleanup_free_ char *x = NULL;
2052 char *v;
2053
2054 v = getenv(*i);
2055 if (!v)
2056 continue;
605405c6 2057 x = strjoin(*i, "=", v);
b4c14404
FB
2058 if (!x)
2059 return -ENOMEM;
00819cc1 2060
319a4f4b 2061 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2062 return -ENOMEM;
00819cc1 2063
1cc6c93a 2064 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2065 pass_env[n_env] = NULL;
b4c14404
FB
2066 }
2067
ae2a15bc 2068 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2069
2070 return 0;
2071}
2072
fbbb9697
YW
2073bool exec_needs_network_namespace(const ExecContext *context) {
2074 assert(context);
2075
2076 return context->private_network || context->network_namespace_path;
2077}
2078
fde36d25
YW
2079static bool exec_needs_ipc_namespace(const ExecContext *context) {
2080 assert(context);
2081
2082 return context->private_ipc || context->ipc_namespace_path;
2083}
2084
5e8deb94 2085bool exec_needs_mount_namespace(
8b44a3d2
LP
2086 const ExecContext *context,
2087 const ExecParameters *params,
4657abb5 2088 const ExecRuntime *runtime) {
8b44a3d2
LP
2089
2090 assert(context);
8b44a3d2 2091
915e6d16
LP
2092 if (context->root_image)
2093 return true;
2094
2a624c36
AP
2095 if (!strv_isempty(context->read_write_paths) ||
2096 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2097 !strv_isempty(context->inaccessible_paths) ||
2098 !strv_isempty(context->exec_paths) ||
2099 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2100 return true;
2101
42b1d8e0 2102 if (context->n_bind_mounts > 0)
d2d6c096
LP
2103 return true;
2104
2abd4e38
YW
2105 if (context->n_temporary_filesystems > 0)
2106 return true;
2107
b3d13314
LB
2108 if (context->n_mount_images > 0)
2109 return true;
2110
93f59701
LB
2111 if (context->n_extension_images > 0)
2112 return true;
2113
a07b9926
LB
2114 if (!strv_isempty(context->extension_directories))
2115 return true;
2116
874cdcbc 2117 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
8b44a3d2
LP
2118 return true;
2119
2120 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2121 return true;
2122
8b44a3d2 2123 if (context->private_devices ||
24002121 2124 context->private_mounts > 0 ||
c2da3bf2 2125 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
8b44a3d2 2126 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2127 context->protect_home != PROTECT_HOME_NO ||
2128 context->protect_kernel_tunables ||
c575770b 2129 context->protect_kernel_modules ||
94a7b275 2130 context->protect_kernel_logs ||
4e399953
LP
2131 context->protect_control_groups ||
2132 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44 2133 context->proc_subset != PROC_SUBSET_ALL ||
fde36d25 2134 exec_needs_ipc_namespace(context))
8b44a3d2
LP
2135 return true;
2136
37c56f89 2137 if (context->root_directory) {
5e98086d 2138 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2139 return true;
2140
5b10116e 2141 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2142 if (params && !params->prefix[t])
37c56f89
YW
2143 continue;
2144
211a3d87 2145 if (context->directories[t].n_items > 0)
37c56f89
YW
2146 return true;
2147 }
2148 }
5d997827 2149
42b1d8e0 2150 if (context->dynamic_user &&
211a3d87
LB
2151 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2152 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2153 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2154 return true;
2155
91dd5f7c
LP
2156 if (context->log_namespace)
2157 return true;
2158
8b44a3d2
LP
2159 return false;
2160}
2161
5749f855 2162static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d 2163 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
19ee48a6 2164 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
254d1313 2165 _cleanup_close_ int unshare_ready_fd = -EBADF;
d251207d
LP
2166 _cleanup_(sigkill_waitp) pid_t pid = 0;
2167 uint64_t c = 1;
d251207d
LP
2168 ssize_t n;
2169 int r;
2170
5749f855
AZ
2171 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2172 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2173 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2174 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2175 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2176 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2177 * continues execution normally.
2178 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2179 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2180
5749f855 2181 /* Can only set up multiple mappings with CAP_SETUID. */
26c45a6c 2182 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
587ab01b 2183 r = asprintf(&uid_map,
5749f855 2184 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2185 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2186 ouid, ouid, uid, uid);
2187 else
2188 r = asprintf(&uid_map,
2189 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2190 ouid, ouid);
d251207d 2191
5749f855
AZ
2192 if (r < 0)
2193 return -ENOMEM;
2194
2195 /* Can only set up multiple mappings with CAP_SETGID. */
26c45a6c 2196 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
587ab01b 2197 r = asprintf(&gid_map,
5749f855 2198 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2199 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2200 ogid, ogid, gid, gid);
2201 else
2202 r = asprintf(&gid_map,
2203 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2204 ogid, ogid);
2205
2206 if (r < 0)
2207 return -ENOMEM;
d251207d
LP
2208
2209 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2210 * namespace. */
2211 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2212 if (unshare_ready_fd < 0)
2213 return -errno;
2214
2215 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2216 * failed. */
2217 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2218 return -errno;
2219
4c253ed1
LP
2220 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2221 if (r < 0)
2222 return r;
2223 if (r == 0) {
254d1313 2224 _cleanup_close_ int fd = -EBADF;
d251207d
LP
2225 const char *a;
2226 pid_t ppid;
2227
2228 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2229 * here, after the parent opened its own user namespace. */
2230
2231 ppid = getppid();
2232 errno_pipe[0] = safe_close(errno_pipe[0]);
2233
2234 /* Wait until the parent unshared the user namespace */
2235 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2236 r = -errno;
2237 goto child_fail;
2238 }
2239
2240 /* Disable the setgroups() system call in the child user namespace, for good. */
2241 a = procfs_file_alloca(ppid, "setgroups");
2242 fd = open(a, O_WRONLY|O_CLOEXEC);
2243 if (fd < 0) {
2244 if (errno != ENOENT) {
2245 r = -errno;
2246 goto child_fail;
2247 }
2248
2249 /* If the file is missing the kernel is too old, let's continue anyway. */
2250 } else {
2251 if (write(fd, "deny\n", 5) < 0) {
2252 r = -errno;
2253 goto child_fail;
2254 }
2255
2256 fd = safe_close(fd);
2257 }
2258
2259 /* First write the GID map */
2260 a = procfs_file_alloca(ppid, "gid_map");
2261 fd = open(a, O_WRONLY|O_CLOEXEC);
2262 if (fd < 0) {
2263 r = -errno;
2264 goto child_fail;
2265 }
2266 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2267 r = -errno;
2268 goto child_fail;
2269 }
2270 fd = safe_close(fd);
2271
2272 /* The write the UID map */
2273 a = procfs_file_alloca(ppid, "uid_map");
2274 fd = open(a, O_WRONLY|O_CLOEXEC);
2275 if (fd < 0) {
2276 r = -errno;
2277 goto child_fail;
2278 }
2279 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2280 r = -errno;
2281 goto child_fail;
2282 }
2283
2284 _exit(EXIT_SUCCESS);
2285
2286 child_fail:
2287 (void) write(errno_pipe[1], &r, sizeof(r));
2288 _exit(EXIT_FAILURE);
2289 }
2290
2291 errno_pipe[1] = safe_close(errno_pipe[1]);
2292
2293 if (unshare(CLONE_NEWUSER) < 0)
2294 return -errno;
2295
2296 /* Let the child know that the namespace is ready now */
2297 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2298 return -errno;
2299
2300 /* Try to read an error code from the child */
2301 n = read(errno_pipe[0], &r, sizeof(r));
2302 if (n < 0)
2303 return -errno;
2304 if (n == sizeof(r)) { /* an error code was sent to us */
2305 if (r < 0)
2306 return r;
2307 return -EIO;
2308 }
2309 if (n != 0) /* on success we should have read 0 bytes */
2310 return -EIO;
2311
8f03de53 2312 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2313 if (r < 0)
2314 return r;
2e87a1fd 2315 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2316 return -EIO;
2317
2318 return 0;
2319}
2320
494d0247
YW
2321static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2322 if (!context->dynamic_user)
2323 return false;
2324
2325 if (type == EXEC_DIRECTORY_CONFIGURATION)
2326 return false;
2327
2328 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2329 return false;
2330
2331 return true;
2332}
2333
211a3d87
LB
2334static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2335 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2336 int r;
2337
2338 assert(source);
2339
2340 src_abs = path_join(root, source);
2341 if (!src_abs)
2342 return -ENOMEM;
2343
2344 STRV_FOREACH(dst, symlinks) {
2345 _cleanup_free_ char *dst_abs = NULL;
2346
2347 dst_abs = path_join(root, *dst);
2348 if (!dst_abs)
2349 return -ENOMEM;
2350
2351 r = mkdir_parents_label(dst_abs, 0755);
2352 if (r < 0)
2353 return r;
2354
2355 r = symlink_idempotent(src_abs, dst_abs, true);
2356 if (r < 0)
2357 return r;
2358 }
2359
2360 return 0;
2361}
2362
3536f49e 2363static int setup_exec_directory(
07689d5d
LP
2364 const ExecContext *context,
2365 const ExecParameters *params,
2366 uid_t uid,
3536f49e 2367 gid_t gid,
3536f49e 2368 ExecDirectoryType type,
211a3d87 2369 bool needs_mount_namespace,
3536f49e 2370 int *exit_status) {
07689d5d 2371
72fd1768 2372 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2373 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2374 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2375 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2376 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2377 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2378 };
07689d5d
LP
2379 int r;
2380
2381 assert(context);
2382 assert(params);
72fd1768 2383 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2384 assert(exit_status);
07689d5d 2385
3536f49e
YW
2386 if (!params->prefix[type])
2387 return 0;
2388
8679efde 2389 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2390 if (!uid_is_valid(uid))
2391 uid = 0;
2392 if (!gid_is_valid(gid))
2393 gid = 0;
2394 }
2395
211a3d87 2396 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2397 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2398
211a3d87 2399 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2400 if (!p) {
2401 r = -ENOMEM;
2402 goto fail;
2403 }
07689d5d 2404
23a7448e
YW
2405 r = mkdir_parents_label(p, 0755);
2406 if (r < 0)
3536f49e 2407 goto fail;
23a7448e 2408
494d0247 2409 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2410 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2411 * case we want to avoid leaving a directory around fully accessible that is owned by
2412 * a dynamic user whose UID is later on reused. To lock this down we use the same
2413 * trick used by container managers to prohibit host users to get access to files of
2414 * the same UID in containers: we place everything inside a directory that has an
2415 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2416 * for unprivileged host code. We then use fs namespacing to make this directory
2417 * permeable for the service itself.
6c47cd7d 2418 *
3f5b1508
LP
2419 * Specifically: for a service which wants a special directory "foo/" we first create
2420 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2421 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2422 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2423 * unprivileged host users can't look into it. Inside of the namespace of the unit
2424 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2425 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2426 * for the service and making sure it only gets access to the dirs it needs but no
2427 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2428 *
3f5b1508
LP
2429 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2430 * to be owned by the service itself.
2431 *
2432 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2433 * for sharing files or sockets with other services. */
6c47cd7d 2434
4ede9802
LP
2435 pp = path_join(params->prefix[type], "private");
2436 if (!pp) {
6c47cd7d
LP
2437 r = -ENOMEM;
2438 goto fail;
2439 }
2440
2441 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2442 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2443 if (r < 0)
2444 goto fail;
2445
211a3d87 2446 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2447 r = -ENOMEM;
2448 goto fail;
2449 }
2450
2451 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2452 r = mkdir_parents_label(pp, 0755);
2453 if (r < 0)
2454 goto fail;
2455
949befd3
LP
2456 if (is_dir(p, false) > 0 &&
2457 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2458
2459 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2460 * it over. Most likely the service has been upgraded from one that didn't use
2461 * DynamicUser=1, to one that does. */
2462
cf52c45d
LP
2463 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2464 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2465 exec_directory_type_to_string(type), p, pp);
2466
949befd3
LP
2467 if (rename(p, pp) < 0) {
2468 r = -errno;
2469 goto fail;
2470 }
2471 } else {
2472 /* Otherwise, create the actual directory for the service */
2473
2474 r = mkdir_label(pp, context->directories[type].mode);
2475 if (r < 0 && r != -EEXIST)
2476 goto fail;
2477 }
6c47cd7d 2478
a2ab603c
YW
2479 if (!context->directories[type].items[i].only_create) {
2480 /* And link it up from the original place.
2481 * Notes
2482 * 1) If a mount namespace is going to be used, then this symlink remains on
2483 * the host, and a new one for the child namespace will be created later.
2484 * 2) It is not necessary to create this symlink when one of its parent
2485 * directories is specified and already created. E.g.
2486 * StateDirectory=foo foo/bar
2487 * In that case, the inode points to pp and p for "foo/bar" are the same:
2488 * pp = "/var/lib/private/foo/bar"
2489 * p = "/var/lib/foo/bar"
2490 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2491 * we do not need to create the symlink, but we cannot create the symlink.
2492 * See issue #24783. */
2493 r = symlink_idempotent(pp, p, true);
2494 if (r < 0)
2495 goto fail;
2496 }
6c47cd7d 2497
6c47cd7d 2498 } else {
5c6d40d1
LP
2499 _cleanup_free_ char *target = NULL;
2500
2501 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2502 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2503 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2504
2505 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2506 * by DynamicUser=1 (see above)?
2507 *
2508 * We do this for all directory types except for ConfigurationDirectory=,
2509 * since they all support the private/ symlink logic at least in some
2510 * configurations, see above. */
5c6d40d1 2511
578dc69f
YW
2512 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2513 if (r < 0)
2514 goto fail;
2515
211a3d87 2516 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2517 if (!q) {
2518 r = -ENOMEM;
2519 goto fail;
2520 }
2521
578dc69f
YW
2522 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2523 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2524 if (r < 0)
2525 goto fail;
2526
2527 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2528
2529 /* Hmm, apparently DynamicUser= was once turned on for this service,
2530 * but is no longer. Let's move the directory back up. */
2531
cf52c45d
LP
2532 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2533 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2534 exec_directory_type_to_string(type), q, p);
2535
5c6d40d1
LP
2536 if (unlink(p) < 0) {
2537 r = -errno;
2538 goto fail;
2539 }
2540
2541 if (rename(q, p) < 0) {
2542 r = -errno;
2543 goto fail;
2544 }
2545 }
2546 }
2547
6c47cd7d 2548 r = mkdir_label(p, context->directories[type].mode);
d484580c 2549 if (r < 0) {
d484580c
LP
2550 if (r != -EEXIST)
2551 goto fail;
2552
206e9864
LP
2553 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2554 struct stat st;
2555
2556 /* Don't change the owner/access mode of the configuration directory,
2557 * as in the common case it is not written to by a service, and shall
2558 * not be writable. */
2559
2560 if (stat(p, &st) < 0) {
2561 r = -errno;
2562 goto fail;
2563 }
2564
2565 /* Still complain if the access mode doesn't match */
2566 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2567 log_warning("%s \'%s\' already exists but the mode is different. "
2568 "(File system: %o %sMode: %o)",
211a3d87 2569 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2570 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2571
6cff72eb 2572 continue;
206e9864 2573 }
6cff72eb 2574 }
a1164ae3 2575 }
07689d5d 2576
206e9864 2577 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2578 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2579 * current UID/GID ownership.) */
2580 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2581 if (r < 0)
2582 goto fail;
c71b2eb7 2583
607b358e
LP
2584 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2585 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2586 * assignments to exist. */
607b358e 2587 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2588 if (r < 0)
3536f49e 2589 goto fail;
07689d5d
LP
2590 }
2591
211a3d87
LB
2592 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2593 * they are set up later, to allow configuring empty var/run/etc. */
2594 if (!needs_mount_namespace)
2595 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2596 r = create_many_symlinks(params->prefix[type],
2597 context->directories[type].items[i].path,
2598 context->directories[type].items[i].symlinks);
2599 if (r < 0)
2600 goto fail;
2601 }
2602
07689d5d 2603 return 0;
3536f49e
YW
2604
2605fail:
2606 *exit_status = exit_status_table[type];
3536f49e 2607 return r;
07689d5d
LP
2608}
2609
bb0c0d6f
LP
2610static int write_credential(
2611 int dfd,
2612 const char *id,
2613 const void *data,
2614 size_t size,
2615 uid_t uid,
2616 bool ownership_ok) {
2617
2618 _cleanup_(unlink_and_freep) char *tmp = NULL;
254d1313 2619 _cleanup_close_ int fd = -EBADF;
bb0c0d6f
LP
2620 int r;
2621
2622 r = tempfn_random_child("", "cred", &tmp);
2623 if (r < 0)
2624 return r;
2625
2626 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2627 if (fd < 0) {
2628 tmp = mfree(tmp);
2629 return -errno;
2630 }
2631
43144be4 2632 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2633 if (r < 0)
2634 return r;
2635
2636 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2637 return -errno;
2638
2639 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2640 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2641 if (r < 0) {
2642 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2643 return r;
2644
2645 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2646 * to express: that the user gets read access and nothing
2647 * else. But if the backing fs can't support that (e.g. ramfs)
2648 * then we can use file ownership instead. But that's only safe if
2649 * we can then re-mount the whole thing read-only, so that the
2650 * user can no longer chmod() the file to gain write access. */
2651 return r;
2652
f5fbe71d 2653 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2654 return -errno;
2655 }
2656 }
2657
2658 if (renameat(dfd, tmp, dfd, id) < 0)
2659 return -errno;
2660
2661 tmp = mfree(tmp);
2662 return 0;
2663}
2664
2ad591a3
LP
2665static char **credential_search_path(
2666 const ExecParameters *params,
2667 bool encrypted) {
2668
2669 _cleanup_strv_free_ char **l = NULL;
2670
2671 assert(params);
2672
2673 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2674 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2675 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2676
2677 if (encrypted) {
2678 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2679 return NULL;
2680
2681 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2682 return NULL;
2683 }
2684
2685 if (params->received_credentials_directory)
2686 if (strv_extend(&l, params->received_credentials_directory) < 0)
2687 return NULL;
2688
2689 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2690 return NULL;
2691
2692 if (DEBUG_LOGGING) {
2693 _cleanup_free_ char *t = strv_join(l, ":");
2694
2695 log_debug("Credential search path is: %s", t);
2696 }
2697
2698 return TAKE_PTR(l);
2699}
2700
3989bdc1
AB
2701static int load_credential(
2702 const ExecContext *context,
2703 const ExecParameters *params,
10b44e1d
LP
2704 const char *id,
2705 const char *path,
2706 bool encrypted,
3989bdc1
AB
2707 const char *unit,
2708 int read_dfd,
2709 int write_dfd,
2710 uid_t uid,
2711 bool ownership_ok,
2712 uint64_t *left) {
2713
3989bdc1 2714 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2ad591a3 2715 _cleanup_strv_free_ char **search_path = NULL;
3989bdc1 2716 _cleanup_(erase_and_freep) char *data = NULL;
2ad591a3
LP
2717 _cleanup_free_ char *bindname = NULL;
2718 const char *source = NULL;
3989bdc1 2719 bool missing_ok = true;
2ad591a3 2720 size_t size, add, maxsz;
3989bdc1
AB
2721 int r;
2722
10b44e1d
LP
2723 assert(context);
2724 assert(params);
2725 assert(id);
2726 assert(path);
2727 assert(unit);
661e4251 2728 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
10b44e1d
LP
2729 assert(write_dfd >= 0);
2730 assert(left);
2731
2ad591a3
LP
2732 if (read_dfd >= 0) {
2733 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2734 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2735 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2736 * open it. */
2737
2738 if (!filename_is_valid(path)) /* safety check */
2739 return -EINVAL;
2740
2741 missing_ok = true;
10b44e1d 2742 source = path;
2ad591a3
LP
2743
2744 } else if (path_is_absolute(path)) {
2745 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2746 * sockets */
2747
2748 if (!path_is_valid(path)) /* safety check */
2749 return -EINVAL;
2750
3989bdc1
AB
2751 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2752
2753 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2754 * via the source socket address in case we read off an AF_UNIX socket. */
10b44e1d 2755 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3989bdc1
AB
2756 return -ENOMEM;
2757
2758 missing_ok = false;
2ad591a3 2759 source = path;
3989bdc1 2760
2ad591a3
LP
2761 } else if (credential_name_valid(path)) {
2762 /* If this is a relative path, take it as credential name relative to the credentials
2763 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2764 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2765
2766 search_path = credential_search_path(params, encrypted);
2767 if (!search_path)
3989bdc1
AB
2768 return -ENOMEM;
2769
2ad591a3 2770 missing_ok = true;
3989bdc1
AB
2771 } else
2772 source = NULL;
2773
2ad591a3
LP
2774 if (encrypted)
2775 flags |= READ_FULL_FILE_UNBASE64;
2776
2777 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2778
2779 if (search_path) {
2780 STRV_FOREACH(d, search_path) {
2781 _cleanup_free_ char *j = NULL;
2782
2783 j = path_join(*d, path);
2784 if (!j)
2785 return -ENOMEM;
2786
2787 r = read_full_file_full(
2788 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2789 UINT64_MAX,
2790 maxsz,
2791 flags,
2792 NULL,
2793 &data, &size);
2794 if (r != -ENOENT)
2795 break;
2796 }
2797 } else if (source)
3989bdc1
AB
2798 r = read_full_file_full(
2799 read_dfd, source,
2800 UINT64_MAX,
2ad591a3
LP
2801 maxsz,
2802 flags,
3989bdc1
AB
2803 bindname,
2804 &data, &size);
2805 else
2806 r = -ENOENT;
2807
10b44e1d 2808 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3989bdc1
AB
2809 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2810 * will get clear errors if we don't pass such a missing credential on as they
2811 * themselves will get ENOENT when trying to read them, which should not be much
2812 * worse than when we handle the error here and make it fatal.
2813 *
2814 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2815 * we are fine, too. */
10b44e1d 2816 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3989bdc1
AB
2817 return 0;
2818 }
2819 if (r < 0)
10b44e1d 2820 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3989bdc1 2821
10b44e1d 2822 if (encrypted) {
3989bdc1
AB
2823 _cleanup_free_ void *plaintext = NULL;
2824 size_t plaintext_size = 0;
2825
6a0779cb 2826 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
3989bdc1
AB
2827 if (r < 0)
2828 return r;
2829
2830 free_and_replace(data, plaintext);
2831 size = plaintext_size;
2832 }
2833
10b44e1d 2834 add = strlen(id) + size;
3989bdc1
AB
2835 if (add > *left)
2836 return -E2BIG;
2837
10b44e1d 2838 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
3989bdc1 2839 if (r < 0)
94602bff 2840 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
3989bdc1
AB
2841
2842 *left -= add;
2843 return 0;
2844}
2845
2846struct load_cred_args {
3989bdc1
AB
2847 const ExecContext *context;
2848 const ExecParameters *params;
461345a1 2849 bool encrypted;
3989bdc1
AB
2850 const char *unit;
2851 int dfd;
2852 uid_t uid;
2853 bool ownership_ok;
2854 uint64_t *left;
2855};
2856
2857static int load_cred_recurse_dir_cb(
2858 RecurseDirEvent event,
2859 const char *path,
2860 int dir_fd,
2861 int inode_fd,
2862 const struct dirent *de,
2863 const struct statx *sx,
2864 void *userdata) {
2865
6394e5cd 2866 struct load_cred_args *args = ASSERT_PTR(userdata);
11348386 2867 _cleanup_free_ char *sub_id = NULL;
3989bdc1
AB
2868 int r;
2869
2870 if (event != RECURSE_DIR_ENTRY)
2871 return RECURSE_DIR_CONTINUE;
2872
2873 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2874 return RECURSE_DIR_CONTINUE;
2875
11348386 2876 sub_id = strreplace(path, "/", "_");
3989bdc1
AB
2877 if (!sub_id)
2878 return -ENOMEM;
2879
2880 if (!credential_name_valid(sub_id))
1451435c 2881 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3989bdc1 2882
5bec447a 2883 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3989bdc1
AB
2884 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2885 return RECURSE_DIR_CONTINUE;
2886 }
5bec447a
LP
2887 if (errno != ENOENT)
2888 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3989bdc1 2889
10b44e1d
LP
2890 r = load_credential(
2891 args->context,
2892 args->params,
2893 sub_id,
2894 de->d_name,
461345a1 2895 args->encrypted,
10b44e1d
LP
2896 args->unit,
2897 dir_fd,
2898 args->dfd,
2899 args->uid,
2900 args->ownership_ok,
2901 args->left);
3989bdc1
AB
2902 if (r < 0)
2903 return r;
2904
2905 return RECURSE_DIR_CONTINUE;
2906}
2907
bb0c0d6f
LP
2908static int acquire_credentials(
2909 const ExecContext *context,
2910 const ExecParameters *params,
d3dcf4e3 2911 const char *unit,
bb0c0d6f
LP
2912 const char *p,
2913 uid_t uid,
2914 bool ownership_ok) {
2915
43144be4 2916 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
254d1313 2917 _cleanup_close_ int dfd = -EBADF;
43144be4 2918 ExecLoadCredential *lc;
bb0c0d6f 2919 ExecSetCredential *sc;
bb0c0d6f
LP
2920 int r;
2921
2922 assert(context);
2923 assert(p);
2924
2925 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2926 if (dfd < 0)
2927 return -errno;
2928
43144be4
LP
2929 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2930 HASHMAP_FOREACH(lc, context->load_credentials) {
254d1313 2931 _cleanup_close_ int sub_fd = -EBADF;
d3dcf4e3 2932
f344f7fd
LP
2933 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2934 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2935 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2936 * propagate a credential passed to us from further up. */
43144be4 2937
f344f7fd
LP
2938 if (path_is_absolute(lc->path)) {
2939 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
1d68a2e1
LP
2940 if (sub_fd < 0 && !IN_SET(errno,
2941 ENOTDIR, /* Not a directory */
2942 ENOENT)) /* Doesn't exist? */
2943 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
f344f7fd 2944 }
43144be4 2945
61c5a49e 2946 if (sub_fd < 0)
f344f7fd 2947 /* Regular file (incl. a credential passed in from higher up) */
10b44e1d
LP
2948 r = load_credential(
2949 context,
2950 params,
2951 lc->id,
2952 lc->path,
2953 lc->encrypted,
2954 unit,
661e4251 2955 AT_FDCWD,
10b44e1d
LP
2956 dfd,
2957 uid,
2958 ownership_ok,
2959 &left);
61c5a49e 2960 else
10b44e1d 2961 /* Directory */
3989bdc1
AB
2962 r = recurse_dir(
2963 sub_fd,
11348386 2964 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3989bdc1
AB
2965 /* statx_mask= */ 0,
2966 /* n_depth_max= */ UINT_MAX,
9883cbb2 2967 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3989bdc1
AB
2968 load_cred_recurse_dir_cb,
2969 &(struct load_cred_args) {
3989bdc1
AB
2970 .context = context,
2971 .params = params,
461345a1 2972 .encrypted = lc->encrypted,
3989bdc1
AB
2973 .unit = unit,
2974 .dfd = dfd,
2975 .uid = uid,
2976 .ownership_ok = ownership_ok,
2977 .left = &left,
2978 });
61c5a49e
LP
2979 if (r < 0)
2980 return r;
bb0c0d6f
LP
2981 }
2982
9e6e9d61
LP
2983 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2984 * them, so that they can act as a "default" if the same credential is specified multiple times. */
43144be4
LP
2985 HASHMAP_FOREACH(sc, context->set_credentials) {
2986 _cleanup_(erase_and_freep) void *plaintext = NULL;
2987 const char *data;
2988 size_t size, add;
2989
9e6e9d61
LP
2990 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2991 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2992 * slow and involved, hence it's nice to be able to skip that if the credential already
2993 * exists anyway. */
43144be4
LP
2994 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2995 continue;
2996 if (errno != ENOENT)
2997 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2998
2999 if (sc->encrypted) {
6a0779cb 3000 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
43144be4
LP
3001 if (r < 0)
3002 return r;
3003
3004 data = plaintext;
3005 } else {
3006 data = sc->data;
3007 size = sc->size;
3008 }
3009
3010 add = strlen(sc->id) + size;
3011 if (add > left)
3012 return -E2BIG;
3013
3014 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
3015 if (r < 0)
3016 return r;
3017
43144be4
LP
3018 left -= add;
3019 }
3020
bb0c0d6f
LP
3021 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
3022 return -errno;
3023
3024 /* After we created all keys with the right perms, also make sure the credential store as a whole is
3025 * accessible */
3026
3027 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 3028 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
3029 if (r < 0) {
3030 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3031 return r;
3032
3033 if (!ownership_ok)
3034 return r;
3035
f5fbe71d 3036 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
3037 return -errno;
3038 }
3039 }
3040
3041 return 0;
3042}
3043
3044static int setup_credentials_internal(
3045 const ExecContext *context,
3046 const ExecParameters *params,
d3dcf4e3 3047 const char *unit,
bb0c0d6f
LP
3048 const char *final, /* This is where the credential store shall eventually end up at */
3049 const char *workspace, /* This is where we can prepare it before moving it to the final place */
3050 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
3051 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3052 uid_t uid) {
3053
3054 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3055 * if we mounted something; false if we definitely can't mount anything */
3056 bool final_mounted;
3057 const char *where;
3058
3059 assert(context);
3060 assert(final);
3061 assert(workspace);
3062
3063 if (reuse_workspace) {
3064 r = path_is_mount_point(workspace, NULL, 0);
3065 if (r < 0)
3066 return r;
3067 if (r > 0)
3068 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3069 else
3070 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3071 } else
3072 workspace_mounted = -1; /* ditto */
3073
3074 r = path_is_mount_point(final, NULL, 0);
3075 if (r < 0)
3076 return r;
3077 if (r > 0) {
3078 /* If the final place already has something mounted, we use that. If the workspace also has
3079 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3080 * different). */
3081 final_mounted = true;
3082
3083 if (workspace_mounted < 0) {
f0353cf2 3084 /* If the final place is mounted, but the workspace isn't, then let's bind mount
bb0c0d6f
LP
3085 * the final version to the workspace, and make it writable, so that we can make
3086 * changes */
3087
21935150
LP
3088 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3089 if (r < 0)
3090 return r;
bb0c0d6f 3091
21935150
LP
3092 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3093 if (r < 0)
3094 return r;
bb0c0d6f
LP
3095
3096 workspace_mounted = true;
3097 }
3098 } else
3099 final_mounted = false;
3100
3101 if (workspace_mounted < 0) {
3102 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3103 for (int try = 0;; try++) {
3104
3105 if (try == 0) {
3106 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
3107 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3108 if (r >= 0) {
bb0c0d6f
LP
3109 workspace_mounted = true;
3110 break;
3111 }
3112
3113 } else if (try == 1) {
3114 _cleanup_free_ char *opts = NULL;
3115
43144be4 3116 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
3117 return -ENOMEM;
3118
3119 /* Fall back to "tmpfs" otherwise */
21935150
LP
3120 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3121 if (r >= 0) {
bb0c0d6f
LP
3122 workspace_mounted = true;
3123 break;
3124 }
3125
3126 } else {
3127 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
3128 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3129 if (r < 0) {
3130 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3131 return r;
bb0c0d6f
LP
3132
3133 if (must_mount) /* If we it's not OK to use the plain directory
3134 * fallback, propagate all errors too */
21935150 3135 return r;
bb0c0d6f
LP
3136
3137 /* If we lack privileges to bind mount stuff, then let's gracefully
3138 * proceed for compat with container envs, and just use the final dir
3139 * as is. */
3140
3141 workspace_mounted = false;
3142 break;
3143 }
3144
3145 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
3146 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3147 if (r < 0)
3148 return r;
bb0c0d6f
LP
3149
3150 workspace_mounted = true;
3151 break;
3152 }
3153 }
3154 }
3155
3156 assert(!must_mount || workspace_mounted > 0);
3157 where = workspace_mounted ? workspace : final;
3158
03bc11d1 3159 (void) label_fix_full(AT_FDCWD, where, final, 0);
e3a0a862 3160
d3dcf4e3 3161 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
3162 if (r < 0)
3163 return r;
3164
3165 if (workspace_mounted) {
3166 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
3167 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3168 if (r < 0)
3169 return r;
bb0c0d6f
LP
3170
3171 /* And mount it to the final place, read-only */
21935150
LP
3172 if (final_mounted)
3173 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3174 else
3175 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3176 if (r < 0)
3177 return r;
bb0c0d6f
LP
3178 } else {
3179 _cleanup_free_ char *parent = NULL;
3180
3181 /* If we do not have our own mount put used the plain directory fallback, then we need to
3182 * open access to the top-level credential directory and the per-service directory now */
3183
45519d13
LP
3184 r = path_extract_directory(final, &parent);
3185 if (r < 0)
3186 return r;
bb0c0d6f
LP
3187 if (chmod(parent, 0755) < 0)
3188 return -errno;
3189 }
3190
3191 return 0;
3192}
3193
3194static int setup_credentials(
3195 const ExecContext *context,
3196 const ExecParameters *params,
3197 const char *unit,
3198 uid_t uid) {
3199
3200 _cleanup_free_ char *p = NULL, *q = NULL;
bb0c0d6f
LP
3201 int r;
3202
3203 assert(context);
3204 assert(params);
3205
3206 if (!exec_context_has_credentials(context))
3207 return 0;
3208
3209 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3210 return -EINVAL;
3211
3212 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3213 * and the subdir we mount over with a read-only file system readable by the service's user */
3214 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3215 if (!q)
3216 return -ENOMEM;
3217
3218 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3219 if (r < 0 && r != -EEXIST)
3220 return r;
3221
3222 p = path_join(q, unit);
3223 if (!p)
3224 return -ENOMEM;
3225
3226 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3227 if (r < 0 && r != -EEXIST)
3228 return r;
3229
3230 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3231 if (r < 0) {
3232 _cleanup_free_ char *t = NULL, *u = NULL;
3233
3234 /* If this is not a privilege or support issue then propagate the error */
3235 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3236 return r;
3237
3238 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3239 * it into place, so that users can't access half-initialized credential stores. */
3240 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3241 if (!t)
3242 return -ENOMEM;
3243
3244 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3245 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3246 * after it is fully set up */
3247 u = path_join(t, unit);
3248 if (!u)
3249 return -ENOMEM;
3250
3251 FOREACH_STRING(i, t, u) {
3252 r = mkdir_label(i, 0700);
3253 if (r < 0 && r != -EEXIST)
3254 return r;
3255 }
3256
3257 r = setup_credentials_internal(
3258 context,
3259 params,
d3dcf4e3 3260 unit,
bb0c0d6f
LP
3261 p, /* final mount point */
3262 u, /* temporary workspace to overmount */
3263 true, /* reuse the workspace if it is already a mount */
3264 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3265 uid);
3266
3267 (void) rmdir(u); /* remove the workspace again if we can. */
3268
3269 if (r < 0)
3270 return r;
3271
3272 } else if (r == 0) {
3273
3274 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3275 * we can use the same directory for all cases, after turning off propagation. Question
3276 * though is: where do we turn off propagation exactly, and where do we place the workspace
3277 * directory? We need some place that is guaranteed to be a mount point in the host, and
3278 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3279 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3280 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3281 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3282 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3283 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3284 * propagation on the former, and then overmount the latter.
3285 *
3286 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3287 * for this purpose, but there are few other candidates that work equally well for us, and
3288 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3289 * that no one else sees this should be OK to do. */
bb0c0d6f 3290
21935150
LP
3291 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3292 if (r < 0)
bb0c0d6f
LP
3293 goto child_fail;
3294
3295 r = setup_credentials_internal(
3296 context,
3297 params,
d3dcf4e3 3298 unit,
bb0c0d6f
LP
3299 p, /* final mount point */
3300 "/dev/shm", /* temporary workspace to overmount */
3301 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3302 true, /* insist that something is mounted, do not allow fallback to plain directory */
3303 uid);
3304 if (r < 0)
3305 goto child_fail;
3306
3307 _exit(EXIT_SUCCESS);
3308
3309 child_fail:
3310 _exit(EXIT_FAILURE);
3311 }
3312
3313 return 0;
3314}
3315
92b423b9 3316#if ENABLE_SMACK
cefc33ae 3317static int setup_smack(
aa5ae971 3318 const Manager *manager,
cefc33ae 3319 const ExecContext *context,
b83d5050 3320 int executable_fd) {
cefc33ae
LP
3321 int r;
3322
3323 assert(context);
b83d5050 3324 assert(executable_fd >= 0);
cefc33ae 3325
cefc33ae
LP
3326 if (context->smack_process_label) {
3327 r = mac_smack_apply_pid(0, context->smack_process_label);
3328 if (r < 0)
3329 return r;
aa5ae971 3330 } else if (manager->default_smack_process_label) {
cefc33ae
LP
3331 _cleanup_free_ char *exec_label = NULL;
3332
b83d5050 3333 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
00675c36 3334 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
cefc33ae
LP
3335 return r;
3336
1da3cb81 3337 r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
cefc33ae
LP
3338 if (r < 0)
3339 return r;
3340 }
cefc33ae
LP
3341
3342 return 0;
3343}
92b423b9 3344#endif
cefc33ae 3345
6c47cd7d
LP
3346static int compile_bind_mounts(
3347 const ExecContext *context,
3348 const ExecParameters *params,
3349 BindMount **ret_bind_mounts,
da6053d0 3350 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3351 char ***ret_empty_directories) {
3352
3353 _cleanup_strv_free_ char **empty_directories = NULL;
3354 BindMount *bind_mounts;
5b10116e 3355 size_t n, h = 0;
6c47cd7d
LP
3356 int r;
3357
3358 assert(context);
3359 assert(params);
3360 assert(ret_bind_mounts);
3361 assert(ret_n_bind_mounts);
3362 assert(ret_empty_directories);
3363
3364 n = context->n_bind_mounts;
5b10116e 3365 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3366 if (!params->prefix[t])
3367 continue;
3368
a2ab603c
YW
3369 for (size_t i = 0; i < context->directories[t].n_items; i++)
3370 n += !context->directories[t].items[i].only_create;
6c47cd7d
LP
3371 }
3372
3373 if (n <= 0) {
3374 *ret_bind_mounts = NULL;
3375 *ret_n_bind_mounts = 0;
3376 *ret_empty_directories = NULL;
3377 return 0;
3378 }
3379
3380 bind_mounts = new(BindMount, n);
3381 if (!bind_mounts)
3382 return -ENOMEM;
3383
5b10116e 3384 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3385 BindMount *item = context->bind_mounts + i;
3386 char *s, *d;
3387
3388 s = strdup(item->source);
3389 if (!s) {
3390 r = -ENOMEM;
3391 goto finish;
3392 }
3393
3394 d = strdup(item->destination);
3395 if (!d) {
3396 free(s);
3397 r = -ENOMEM;
3398 goto finish;
3399 }
3400
3401 bind_mounts[h++] = (BindMount) {
3402 .source = s,
3403 .destination = d,
3404 .read_only = item->read_only,
3405 .recursive = item->recursive,
3406 .ignore_enoent = item->ignore_enoent,
3407 };
3408 }
3409
5b10116e 3410 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3411 if (!params->prefix[t])
3412 continue;
3413
211a3d87 3414 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3415 continue;
3416
494d0247 3417 if (exec_directory_is_private(context, t) &&
74e12520 3418 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3419 char *private_root;
3420
3421 /* So this is for a dynamic user, and we need to make sure the process can access its own
3422 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3423 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3424
657ee2d8 3425 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3426 if (!private_root) {
3427 r = -ENOMEM;
3428 goto finish;
3429 }
3430
3431 r = strv_consume(&empty_directories, private_root);
a635a7ae 3432 if (r < 0)
6c47cd7d 3433 goto finish;
6c47cd7d
LP
3434 }
3435
211a3d87 3436 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3437 char *s, *d;
3438
a2ab603c
YW
3439 /* When one of the parent directories is in the list, we cannot create the symlink
3440 * for the child directory. See also the comments in setup_exec_directory(). */
3441 if (context->directories[t].items[i].only_create)
3442 continue;
3443
494d0247 3444 if (exec_directory_is_private(context, t))
211a3d87 3445 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3446 else
211a3d87 3447 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3448 if (!s) {
3449 r = -ENOMEM;
3450 goto finish;
3451 }
3452
494d0247 3453 if (exec_directory_is_private(context, t) &&
74e12520 3454 exec_context_with_rootfs(context))
5609f688
YW
3455 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3456 * directory is not created on the root directory. So, let's bind-mount the directory
3457 * on the 'non-private' place. */
211a3d87 3458 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3459 else
3460 d = strdup(s);
6c47cd7d
LP
3461 if (!d) {
3462 free(s);
3463 r = -ENOMEM;
3464 goto finish;
3465 }
3466
3467 bind_mounts[h++] = (BindMount) {
3468 .source = s,
3469 .destination = d,
3470 .read_only = false,
9ce4e4b0 3471 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3472 .recursive = true,
3473 .ignore_enoent = false,
3474 };
3475 }
3476 }
3477
3478 assert(h == n);
3479
3480 *ret_bind_mounts = bind_mounts;
3481 *ret_n_bind_mounts = n;
ae2a15bc 3482 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3483
3484 return (int) n;
3485
3486finish:
3487 bind_mount_free_many(bind_mounts, h);
3488 return r;
3489}
3490
df61e79a
LB
3491/* ret_symlinks will contain a list of pairs src:dest that describes
3492 * the symlinks to create later on. For example, the symlinks needed
3493 * to safely give private directories to DynamicUser=1 users. */
3494static int compile_symlinks(
3495 const ExecContext *context,
3496 const ExecParameters *params,
3497 char ***ret_symlinks) {
3498
3499 _cleanup_strv_free_ char **symlinks = NULL;
3500 int r;
3501
3502 assert(context);
3503 assert(params);
3504 assert(ret_symlinks);
3505
3506 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3507 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3508 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 3509
211a3d87
LB
3510 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3511 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3512
211a3d87
LB
3513 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3514 dst_abs = path_join(params->prefix[dt], *symlink);
3515 if (!src_abs || !dst_abs)
3516 return -ENOMEM;
df61e79a 3517
211a3d87
LB
3518 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3519 if (r < 0)
3520 return r;
3521 }
3522
a2ab603c
YW
3523 if (!exec_directory_is_private(context, dt) ||
3524 exec_context_with_rootfs(context) ||
3525 context->directories[dt].items[i].only_create)
211a3d87
LB
3526 continue;
3527
3528 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3529 if (!private_path)
3530 return -ENOMEM;
3531
211a3d87 3532 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3533 if (!path)
3534 return -ENOMEM;
3535
3536 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3537 if (r < 0)
3538 return r;
3539 }
3540 }
3541
3542 *ret_symlinks = TAKE_PTR(symlinks);
3543
3544 return 0;
3545}
3546
4e677599
LP
3547static bool insist_on_sandboxing(
3548 const ExecContext *context,
3549 const char *root_dir,
3550 const char *root_image,
3551 const BindMount *bind_mounts,
3552 size_t n_bind_mounts) {
3553
4e677599
LP
3554 assert(context);
3555 assert(n_bind_mounts == 0 || bind_mounts);
3556
3557 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3558 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3559 * rearrange stuff in a way we cannot ignore gracefully. */
3560
3561 if (context->n_temporary_filesystems > 0)
3562 return true;
3563
3564 if (root_dir || root_image)
3565 return true;
3566
b3d13314
LB
3567 if (context->n_mount_images > 0)
3568 return true;
3569
4e677599
LP
3570 if (context->dynamic_user)
3571 return true;
3572
4355c04f
LB
3573 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3574 return true;
3575
4e677599
LP
3576 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3577 * essential. */
5b10116e 3578 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3579 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3580 return true;
3581
91dd5f7c
LP
3582 if (context->log_namespace)
3583 return true;
3584
4e677599
LP
3585 return false;
3586}
3587
6818c54c 3588static int apply_mount_namespace(
34cf6c43 3589 const Unit *u,
9f71ba8d 3590 ExecCommandFlags command_flags,
6818c54c
LP
3591 const ExecContext *context,
3592 const ExecParameters *params,
7cc5ef5f 3593 const ExecRuntime *runtime,
d4b6ec98 3594 const char *memory_pressure_path,
7cc5ef5f 3595 char **error_path) {
6818c54c 3596
d4b6ec98
LB
3597 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3598 **read_write_paths_cleanup = NULL;
56a13a49 3599 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3600 const char *root_dir = NULL, *root_image = NULL;
24759d8f
LB
3601 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3602 *extension_dir = NULL;
d4b6ec98 3603 char **read_write_paths;
228af36f 3604 NamespaceInfo ns_info;
165a31c0 3605 bool needs_sandboxing;
6c47cd7d 3606 BindMount *bind_mounts = NULL;
da6053d0 3607 size_t n_bind_mounts = 0;
6818c54c 3608 int r;
93c6bb51 3609
2b3c1b9e
DH
3610 assert(context);
3611
915e6d16
LP
3612 if (params->flags & EXEC_APPLY_CHROOT) {
3613 root_image = context->root_image;
3614
3615 if (!root_image)
3616 root_dir = context->root_directory;
3617 }
93c6bb51 3618
6c47cd7d
LP
3619 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3620 if (r < 0)
3621 return r;
3622
211a3d87 3623 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3624 r = compile_symlinks(context, params, &symlinks);
3625 if (r < 0)
41abd7f6 3626 goto finalize;
df61e79a 3627
d4b6ec98
LB
3628 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3629 * service will need to write to it in order to start the notifications. */
3630 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3631 read_write_paths_cleanup = strv_copy(context->read_write_paths);
3632 if (!read_write_paths_cleanup) {
3633 r = -ENOMEM;
3634 goto finalize;
3635 }
3636
3637 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3638 if (r < 0)
3639 goto finalize;
3640
3641 read_write_paths = read_write_paths_cleanup;
3642 } else
3643 read_write_paths = context->read_write_paths;
3644
9f71ba8d 3645 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3646 if (needs_sandboxing) {
3647 /* The runtime struct only contains the parent of the private /tmp,
3648 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3649 * that is sticky, and that's the one we want to use here.
3650 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3651
3652 if (context->private_tmp && runtime) {
56a13a49
ZJS
3653 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3654 tmp_dir = runtime->tmp_dir;
3655 else if (runtime->tmp_dir)
3656 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3657
3658 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3659 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3660 else if (runtime->var_tmp_dir)
56a13a49 3661 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3662 }
3663
b5a33299
YW
3664 ns_info = (NamespaceInfo) {
3665 .ignore_protect_paths = false,
3666 .private_dev = context->private_devices,
3667 .protect_control_groups = context->protect_control_groups,
3668 .protect_kernel_tunables = context->protect_kernel_tunables,
3669 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3670 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3671 .protect_hostname = context->protect_hostname,
5e98086d 3672 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
52b3d652
LP
3673 .protect_home = context->protect_home,
3674 .protect_system = context->protect_system,
4e399953
LP
3675 .protect_proc = context->protect_proc,
3676 .proc_subset = context->proc_subset,
c2da3bf2 3677 .private_network = exec_needs_network_namespace(context),
fde36d25 3678 .private_ipc = exec_needs_ipc_namespace(context),
6720e356 3679 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3680 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3681 };
ecf63c91 3682 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3683 /*
3684 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3685 * sandbox info, otherwise enforce it, don't ignore protected paths and
3686 * fail if we are enable to apply the sandbox inside the mount namespace.
3687 */
3688 ns_info = (NamespaceInfo) {
3689 .ignore_protect_paths = true,
3690 };
3691 else
3692 ns_info = (NamespaceInfo) {};
b5a33299 3693
874cdcbc 3694 if (context->mount_propagation_flag == MS_SHARED)
37ed15d7
FB
3695 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3696
a631cbfa
LP
3697 if (exec_context_has_credentials(context) &&
3698 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3699 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3700 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3701 if (!creds_path) {
3702 r = -ENOMEM;
3703 goto finalize;
3704 }
bbb4e7f3
LP
3705 }
3706
5e8deb94
LB
3707 if (MANAGER_IS_SYSTEM(u->manager)) {
3708 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3709 if (!propagate_dir) {
3710 r = -ENOMEM;
3711 goto finalize;
3712 }
3713
5e8deb94 3714 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3715 if (!incoming_dir) {
3716 r = -ENOMEM;
3717 goto finalize;
3718 }
24759d8f
LB
3719
3720 extension_dir = strdup("/run/systemd/unit-extensions");
3721 if (!extension_dir) {
3722 r = -ENOMEM;
3723 goto finalize;
3724 }
3725 } else
3726 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3727 r = -ENOMEM;
3728 goto finalize;
3729 }
5e8deb94 3730
18d73705 3731 r = setup_namespace(root_dir, root_image, context->root_image_options,
d4b6ec98 3732 &ns_info, read_write_paths,
165a31c0
LP
3733 needs_sandboxing ? context->read_only_paths : NULL,
3734 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3735 needs_sandboxing ? context->exec_paths : NULL,
3736 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3737 empty_directories,
df61e79a 3738 symlinks,
6c47cd7d
LP
3739 bind_mounts,
3740 n_bind_mounts,
2abd4e38
YW
3741 context->temporary_filesystems,
3742 context->n_temporary_filesystems,
b3d13314
LB
3743 context->mount_images,
3744 context->n_mount_images,
56a13a49
ZJS
3745 tmp_dir,
3746 var_tmp_dir,
bbb4e7f3 3747 creds_path,
91dd5f7c 3748 context->log_namespace,
874cdcbc 3749 context->mount_propagation_flag,
d4d55b0d
LB
3750 context->root_hash, context->root_hash_size, context->root_hash_path,
3751 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3752 context->root_verity,
93f59701
LB
3753 context->extension_images,
3754 context->n_extension_images,
a07b9926 3755 context->extension_directories,
5e8deb94
LB
3756 propagate_dir,
3757 incoming_dir,
24759d8f 3758 extension_dir,
3bdc25a4 3759 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3760 error_path);
93c6bb51 3761
1beab8b0 3762 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3763 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3764 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3765 * completely different execution environment. */
aca835ed 3766 if (r == -ENOANO) {
4e677599
LP
3767 if (insist_on_sandboxing(
3768 context,
3769 root_dir, root_image,
3770 bind_mounts,
3771 n_bind_mounts)) {
3772 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3773 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3774 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3775
3776 r = -EOPNOTSUPP;
3777 } else {
aca835ed 3778 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3779 r = 0;
aca835ed 3780 }
93c6bb51
DH
3781 }
3782
8062e643 3783finalize:
4e677599 3784 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3785 return r;
3786}
3787
915e6d16
LP
3788static int apply_working_directory(
3789 const ExecContext *context,
3790 const ExecParameters *params,
3791 const char *home,
376fecf6 3792 int *exit_status) {
915e6d16 3793
6732edab 3794 const char *d, *wd;
2b3c1b9e
DH
3795
3796 assert(context);
376fecf6 3797 assert(exit_status);
2b3c1b9e 3798
6732edab
LP
3799 if (context->working_directory_home) {
3800
376fecf6
LP
3801 if (!home) {
3802 *exit_status = EXIT_CHDIR;
6732edab 3803 return -ENXIO;
376fecf6 3804 }
6732edab 3805
2b3c1b9e 3806 wd = home;
6732edab 3807
14eb3285
LP
3808 } else
3809 wd = empty_to_root(context->working_directory);
e7f1e7c6 3810
fa97f630 3811 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3812 d = wd;
fa97f630 3813 else
3b0e5bb5 3814 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3815
376fecf6
LP
3816 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3817 *exit_status = EXIT_CHDIR;
2b3c1b9e 3818 return -errno;
376fecf6 3819 }
e7f1e7c6
DH
3820
3821 return 0;
3822}
3823
fa97f630
JB
3824static int apply_root_directory(
3825 const ExecContext *context,
3826 const ExecParameters *params,
3827 const bool needs_mount_ns,
3828 int *exit_status) {
3829
3830 assert(context);
3831 assert(exit_status);
3832
5b10116e 3833 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3834 if (!needs_mount_ns && context->root_directory)
3835 if (chroot(context->root_directory) < 0) {
3836 *exit_status = EXIT_CHROOT;
3837 return -errno;
3838 }
fa97f630
JB
3839
3840 return 0;
3841}
3842
b1edf445 3843static int setup_keyring(
34cf6c43 3844 const Unit *u,
b1edf445
LP
3845 const ExecContext *context,
3846 const ExecParameters *p,
3847 uid_t uid, gid_t gid) {
3848
74dd6b51 3849 key_serial_t keyring;
e64c2d0b
DJL
3850 int r = 0;
3851 uid_t saved_uid;
3852 gid_t saved_gid;
74dd6b51
LP
3853
3854 assert(u);
b1edf445 3855 assert(context);
74dd6b51
LP
3856 assert(p);
3857
3858 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3859 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3860 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3861 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3862 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3863 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3864
b1edf445
LP
3865 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3866 return 0;
3867
e64c2d0b
DJL
3868 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3869 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3870 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3871 * & group is just as nasty as acquiring a reference to the user keyring. */
3872
3873 saved_uid = getuid();
3874 saved_gid = getgid();
3875
3876 if (gid_is_valid(gid) && gid != saved_gid) {
3877 if (setregid(gid, -1) < 0)
3878 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3879 }
3880
3881 if (uid_is_valid(uid) && uid != saved_uid) {
3882 if (setreuid(uid, -1) < 0) {
3883 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3884 goto out;
3885 }
3886 }
3887
74dd6b51
LP
3888 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3889 if (keyring == -1) {
3890 if (errno == ENOSYS)
8002fb97 3891 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3892 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3893 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3894 else if (errno == EDQUOT)
8002fb97 3895 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3896 else
e64c2d0b 3897 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3898
e64c2d0b 3899 goto out;
74dd6b51
LP
3900 }
3901
e64c2d0b
DJL
3902 /* When requested link the user keyring into the session keyring. */
3903 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3904
3905 if (keyctl(KEYCTL_LINK,
3906 KEY_SPEC_USER_KEYRING,
3907 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3908 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3909 goto out;
3910 }
3911 }
3912
3913 /* Restore uid/gid back */
3914 if (uid_is_valid(uid) && uid != saved_uid) {
3915 if (setreuid(saved_uid, -1) < 0) {
3916 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3917 goto out;
3918 }
3919 }
3920
3921 if (gid_is_valid(gid) && gid != saved_gid) {
3922 if (setregid(saved_gid, -1) < 0)
3923 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3924 }
3925
3926 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3927 if (!sd_id128_is_null(u->invocation_id)) {
3928 key_serial_t key;
3929
3930 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3931 if (key == -1)
8002fb97 3932 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3933 else {
3934 if (keyctl(KEYCTL_SETPERM, key,
3935 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3936 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3937 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3938 }
3939 }
3940
e64c2d0b 3941out:
37b22b3b 3942 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3943 /* no extra logging, as only the first already reported error matters */
3944 if (getuid() != saved_uid)
3945 (void) setreuid(saved_uid, -1);
b1edf445 3946
e64c2d0b
DJL
3947 if (getgid() != saved_gid)
3948 (void) setregid(saved_gid, -1);
b1edf445 3949
e64c2d0b 3950 return r;
74dd6b51
LP
3951}
3952
/* Appends each non-negative fd of 'pair' to 'array', advancing the element counter *n for every fd
 * added. The caller must make sure 'array' has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3963
a34ceba6
LP
3964static int close_remaining_fds(
3965 const ExecParameters *params,
34cf6c43
YW
3966 const ExecRuntime *runtime,
3967 const DynamicCreds *dcreds,
00d9ef85 3968 int user_lookup_fd,
a34ceba6 3969 int socket_fd,
5b8d1f6b 3970 const int *fds, size_t n_fds) {
a34ceba6 3971
da6053d0 3972 size_t n_dont_close = 0;
00d9ef85 3973 int dont_close[n_fds + 12];
a34ceba6
LP
3974
3975 assert(params);
3976
3977 if (params->stdin_fd >= 0)
3978 dont_close[n_dont_close++] = params->stdin_fd;
3979 if (params->stdout_fd >= 0)
3980 dont_close[n_dont_close++] = params->stdout_fd;
3981 if (params->stderr_fd >= 0)
3982 dont_close[n_dont_close++] = params->stderr_fd;
3983
3984 if (socket_fd >= 0)
3985 dont_close[n_dont_close++] = socket_fd;
3986 if (n_fds > 0) {
3987 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3988 n_dont_close += n_fds;
3989 }
3990
a70581ff 3991 if (runtime) {
29206d46 3992 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3993 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3994 }
29206d46
LP
3995
3996 if (dcreds) {
3997 if (dcreds->user)
3998 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3999 if (dcreds->group)
4000 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
4001 }
4002
00d9ef85
LP
4003 if (user_lookup_fd >= 0)
4004 dont_close[n_dont_close++] = user_lookup_fd;
4005
a34ceba6
LP
4006 return close_all_fds(dont_close, n_dont_close);
4007}
4008
00d9ef85
LP
4009static int send_user_lookup(
4010 Unit *unit,
4011 int user_lookup_fd,
4012 uid_t uid,
4013 gid_t gid) {
4014
4015 assert(unit);
4016
4017 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
4018 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
4019 * specified. */
4020
4021 if (user_lookup_fd < 0)
4022 return 0;
4023
4024 if (!uid_is_valid(uid) && !gid_is_valid(gid))
4025 return 0;
4026
4027 if (writev(user_lookup_fd,
4028 (struct iovec[]) {
ce16d177
YW
4029 IOVEC_MAKE(&uid, sizeof(uid)),
4030 IOVEC_MAKE(&gid, sizeof(gid)),
4031 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
4032 return -errno;
4033
4034 return 0;
4035}
4036
6732edab
LP
4037static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
4038 int r;
4039
4040 assert(c);
4041 assert(home);
4042 assert(buf);
4043
4044 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
4045
4046 if (*home)
4047 return 0;
4048
4049 if (!c->working_directory_home)
4050 return 0;
4051
6732edab
LP
4052 r = get_home_dir(buf);
4053 if (r < 0)
4054 return r;
4055
4056 *home = *buf;
4057 return 1;
4058}
4059
da50b85a
LP
4060static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
4061 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
4062 int r;
4063
4064 assert(c);
4065 assert(p);
4066 assert(ret);
4067
4068 assert(c->dynamic_user);
4069
4070 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4071 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4072 * directories. */
4073
5b10116e 4074 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
4075 if (t == EXEC_DIRECTORY_CONFIGURATION)
4076 continue;
4077
4078 if (!p->prefix[t])
4079 continue;
4080
211a3d87 4081 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
4082 char *e;
4083
494d0247 4084 if (exec_directory_is_private(c, t))
211a3d87 4085 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 4086 else
211a3d87 4087 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
4088 if (!e)
4089 return -ENOMEM;
4090
4091 r = strv_consume(&list, e);
4092 if (r < 0)
4093 return r;
4094 }
4095 }
4096
ae2a15bc 4097 *ret = TAKE_PTR(list);
da50b85a
LP
4098
4099 return 0;
4100}
4101
78f93209
LP
4102static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4103 bool using_subcgroup;
4104 char *p;
4105
4106 assert(params);
4107 assert(ret);
4108
4109 if (!params->cgroup_path)
4110 return -EINVAL;
4111
4112 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4113 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4114 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4115 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4116 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4117 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4118 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4119 * flag, which is only passed for the former statements, not for the latter. */
4120
4121 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4122 if (using_subcgroup)
657ee2d8 4123 p = path_join(params->cgroup_path, ".control");
78f93209
LP
4124 else
4125 p = strdup(params->cgroup_path);
4126 if (!p)
4127 return -ENOMEM;
4128
4129 *ret = p;
4130 return using_subcgroup;
4131}
4132
e2b2fb7f
MS
4133static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4134 _cleanup_(cpu_set_reset) CPUSet s = {};
4135 int r;
4136
4137 assert(c);
4138 assert(ret);
4139
4140 if (!c->numa_policy.nodes.set) {
4141 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4142 return 0;
4143 }
4144
4145 r = numa_to_cpu_set(&c->numa_policy, &s);
4146 if (r < 0)
4147 return r;
4148
4149 cpu_set_reset(ret);
4150
4151 return cpu_set_add_all(ret, &s);
4152}
4153
4154bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4155 assert(c);
4156
4157 return c->cpu_affinity_from_numa;
4158}
4159
1da37e58
ZJS
4160static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4161 int r;
4162
4163 assert(fds);
4164 assert(n_fds);
4165 assert(*n_fds < fds_size);
4166 assert(ret_fd);
4167
4168 if (fd < 0) {
254d1313 4169 *ret_fd = -EBADF;
1da37e58
ZJS
4170 return 0;
4171 }
4172
4173 if (fd < 3 + (int) *n_fds) {
4174 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4175 * the fds we pass to the process (or which are closed only during execve). */
4176
4177 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4178 if (r < 0)
4179 return -errno;
4180
ee3455cf 4181 close_and_replace(fd, r);
1da37e58
ZJS
4182 }
4183
4184 *ret_fd = fds[*n_fds] = fd;
4185 (*n_fds) ++;
4186 return 1;
4187}
4188
cd48e23f
RP
4189static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4190 union sockaddr_union addr = {
4191 .un.sun_family = AF_UNIX,
4192 };
4193 socklen_t sa_len;
4194 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4195 int r;
4196
4197 assert(u);
4198 assert(of);
4199 assert(ofd >= 0);
4200
4201 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4202 if (r < 0)
4203 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4204
4205 sa_len = r;
4206
4207 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4208 _cleanup_close_ int fd = -EBADF;
4209
4210 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4211 if (fd < 0)
4212 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4213
4214 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4215 if (r == -EPROTOTYPE)
4216 continue;
4217 if (r < 0)
4218 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4219
4220 return TAKE_FD(fd);
4221 }
4222
4223 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4224}
4225
4226static int get_open_file_fd(Unit *u, const OpenFile *of) {
4227 struct stat st;
4228 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4229
4230 assert(u);
4231 assert(of);
4232
4233 ofd = open(of->path, O_PATH | O_CLOEXEC);
4234 if (ofd < 0)
dcebb015
DDM
4235 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
4236
cd48e23f 4237 if (fstat(ofd, &st) < 0)
dcebb015 4238 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
cd48e23f
RP
4239
4240 if (S_ISSOCK(st.st_mode)) {
4241 fd = connect_unix_harder(u, of, ofd);
4242 if (fd < 0)
4243 return fd;
4244
4245 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
dcebb015
DDM
4246 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
4247 of->path);
cd48e23f
RP
4248
4249 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4250 } else {
4251 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4252 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4253 flags |= O_APPEND;
4254 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4255 flags |= O_TRUNC;
4256
4257 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4258 if (fd < 0)
4259 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4260
4261 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4262 }
4263
4264 return TAKE_FD(fd);
4265}
4266
4267static int collect_open_file_fds(
4268 Unit *u,
4269 OpenFile* open_files,
4270 int **fds,
4271 char ***fdnames,
4272 size_t *n_fds) {
4273 int r;
4274
4275 assert(u);
4276 assert(fds);
4277 assert(fdnames);
4278 assert(n_fds);
4279
4280 LIST_FOREACH(open_files, of, open_files) {
4281 _cleanup_close_ int fd = -EBADF;
4282
4283 fd = get_open_file_fd(u, of);
4284 if (fd < 0) {
4285 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4286 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4287 continue;
4288 }
4289
4290 return fd;
4291 }
4292
4293 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4294 return -ENOMEM;
4295
4296 r = strv_extend(fdnames, of->fdname);
4297 if (r < 0)
4298 return r;
4299
4300 (*fds)[*n_fds] = TAKE_FD(fd);
4301
4302 (*n_fds)++;
4303 }
4304
4305 return 0;
4306}
4307
ff0af2a1 4308static int exec_child(
f2341e0a 4309 Unit *unit,
34cf6c43 4310 const ExecCommand *command,
ff0af2a1
LP
4311 const ExecContext *context,
4312 const ExecParameters *params,
4313 ExecRuntime *runtime,
29206d46 4314 DynamicCreds *dcreds,
6bb00842 4315 const CGroupContext *cgroup_context,
ff0af2a1 4316 int socket_fd,
2caa38e9 4317 const int named_iofds[static 3],
cd48e23f 4318 int *params_fds,
da6053d0 4319 size_t n_socket_fds,
25b583d7 4320 size_t n_storage_fds,
ff0af2a1 4321 char **files_env,
00d9ef85 4322 int user_lookup_fd,
12145637 4323 int *exit_status) {
d35fbf6b 4324
8c35c10d 4325 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 4326 int r, ngids = 0, exec_fd;
4d885bd3
DH
4327 _cleanup_free_ gid_t *supplementary_gids = NULL;
4328 const char *username = NULL, *groupname = NULL;
6bb00842 4329 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
2b3c1b9e 4330 const char *home = NULL, *shell = NULL;
7ca69792 4331 char **final_argv = NULL;
7bce046b
LP
4332 dev_t journal_stream_dev = 0;
4333 ino_t journal_stream_ino = 0;
5749f855 4334 bool userns_set_up = false;
165a31c0
LP
4335 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4336 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4337 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4338 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 4339#if HAVE_SELINUX
7f59dd35 4340 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 4341 bool use_selinux = false;
ecfbc84f 4342#endif
f9fa32f0 4343#if ENABLE_SMACK
43b1f709 4344 bool use_smack = false;
ecfbc84f 4345#endif
349cc4a5 4346#if HAVE_APPARMOR
43b1f709 4347 bool use_apparmor = false;
ecfbc84f 4348#endif
5749f855
AZ
4349 uid_t saved_uid = getuid();
4350 gid_t saved_gid = getgid();
fed1e721
LP
4351 uid_t uid = UID_INVALID;
4352 gid_t gid = GID_INVALID;
1da37e58
ZJS
4353 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4354 n_keep_fds; /* total number of fds not to close */
165a31c0 4355 int secure_bits;
afb11bf1
DG
4356 _cleanup_free_ gid_t *gids_after_pam = NULL;
4357 int ngids_after_pam = 0;
cd48e23f
RP
4358 _cleanup_free_ int *fds = NULL;
4359 _cleanup_strv_free_ char **fdnames = NULL;
034c6ed7 4360
f2341e0a 4361 assert(unit);
5cb5a6ff
LP
4362 assert(command);
4363 assert(context);
d35fbf6b 4364 assert(params);
ff0af2a1 4365 assert(exit_status);
d35fbf6b 4366
69339ae9
LP
4367 /* Explicitly test for CVE-2021-4034 inspired invocations */
4368 assert(command->path);
4369 assert(!strv_isempty(command->argv));
4370
d35fbf6b
DM
4371 rename_process_from_path(command->path);
4372
9c274488
LP
4373 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4374 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4375 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4376 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4377 SIGNALS_IGNORE);
d35fbf6b
DM
4378
4379 if (context->ignore_sigpipe)
9c274488 4380 (void) ignore_signals(SIGPIPE);
d35fbf6b 4381
ff0af2a1
LP
4382 r = reset_signal_mask();
4383 if (r < 0) {
4384 *exit_status = EXIT_SIGNAL_MASK;
12145637 4385 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4386 }
034c6ed7 4387
d35fbf6b
DM
4388 if (params->idle_pipe)
4389 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4390
2c027c62
LP
4391 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4392 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4393 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4394 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4395
d35fbf6b 4396 log_forget_fds();
2c027c62 4397 log_set_open_when_needed(true);
4f2d528d 4398
40a80078
LP
4399 /* In case anything used libc syslog(), close this here, too */
4400 closelog();
4401
cd48e23f
RP
4402 fds = newdup(int, params_fds, n_fds);
4403 if (!fds) {
4404 *exit_status = EXIT_MEMORY;
4405 return log_oom();
4406 }
4407
4408 fdnames = strv_copy((char**) params->fd_names);
4409 if (!fdnames) {
4410 *exit_status = EXIT_MEMORY;
4411 return log_oom();
4412 }
4413
4414 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4415 if (r < 0) {
4416 *exit_status = EXIT_FDS;
4417 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4418 }
4419
b1994387 4420 int keep_fds[n_fds + 3];
1da37e58
ZJS
4421 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4422 n_keep_fds = n_fds;
4423
4424 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4425 if (r < 0) {
4426 *exit_status = EXIT_FDS;
4427 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4428 }
4429
b1994387 4430#if HAVE_LIBBPF
46004616
ZJS
4431 if (unit->manager->restrict_fs) {
4432 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4433 if (bpf_map_fd < 0) {
4434 *exit_status = EXIT_FDS;
46004616 4435 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4436 }
4437
4438 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4439 if (r < 0) {
4440 *exit_status = EXIT_FDS;
4441 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4442 }
4443 }
4444#endif
4445
1da37e58 4446 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4447 if (r < 0) {
4448 *exit_status = EXIT_FDS;
12145637 4449 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4450 }
4451
0af07108
ZJS
4452 if (!context->same_pgrp &&
4453 setsid() < 0) {
4454 *exit_status = EXIT_SETSID;
4455 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4456 }
9e2f7c11 4457
1e22b5cd 4458 exec_context_tty_reset(context, params);
d35fbf6b 4459
c891efaf 4460 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4461 _cleanup_free_ char *cmdline = NULL;
4462
4ef15008 4463 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4464 if (!cmdline) {
0460aa5c 4465 *exit_status = EXIT_MEMORY;
12145637 4466 return log_oom();
3b20f877 4467 }
d35fbf6b 4468
4ef15008 4469 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4470 if (r != CONFIRM_EXECUTE) {
4471 if (r == CONFIRM_PRETEND_SUCCESS) {
4472 *exit_status = EXIT_SUCCESS;
4473 return 0;
4474 }
ff0af2a1 4475 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4476 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4477 "Execution cancelled by the user");
d35fbf6b
DM
4478 }
4479 }
1a63a750 4480
d521916d
LP
4481 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4482 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4483 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4484 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4485 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4486 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4870133b 4487 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(unit->manager->runtime_scope), true) != 0) {
d521916d
LP
4488 *exit_status = EXIT_MEMORY;
4489 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4490 }
4491
29206d46 4492 if (context->dynamic_user && dcreds) {
da50b85a 4493 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4494
d521916d 4495 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4496 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4497 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4498 *exit_status = EXIT_USER;
12145637 4499 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4500 }
4501
da50b85a
LP
4502 r = compile_suggested_paths(context, params, &suggested_paths);
4503 if (r < 0) {
4504 *exit_status = EXIT_MEMORY;
4505 return log_oom();
4506 }
4507
4508 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4509 if (r < 0) {
4510 *exit_status = EXIT_USER;
d85ff944
YW
4511 if (r == -EILSEQ)
4512 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4513 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4514 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4515 }
524daa8c 4516
70dd455c 4517 if (!uid_is_valid(uid)) {
29206d46 4518 *exit_status = EXIT_USER;
d85ff944 4519 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4520 }
4521
4522 if (!gid_is_valid(gid)) {
4523 *exit_status = EXIT_USER;
d85ff944 4524 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4525 }
5bc7452b 4526
29206d46
LP
4527 if (dcreds->user)
4528 username = dcreds->user->name;
4529
4530 } else {
4d885bd3
DH
4531 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4532 if (r < 0) {
4533 *exit_status = EXIT_USER;
12145637 4534 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4535 }
5bc7452b 4536
4d885bd3
DH
4537 r = get_fixed_group(context, &groupname, &gid);
4538 if (r < 0) {
4539 *exit_status = EXIT_GROUP;
12145637 4540 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4541 }
cdc5d5c5 4542 }
29206d46 4543
cdc5d5c5
DH
4544 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4545 r = get_supplementary_groups(context, username, groupname, gid,
4546 &supplementary_gids, &ngids);
4547 if (r < 0) {
4548 *exit_status = EXIT_GROUP;
12145637 4549 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4550 }
5bc7452b 4551
00d9ef85
LP
4552 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4553 if (r < 0) {
4554 *exit_status = EXIT_USER;
12145637 4555 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4556 }
4557
4558 user_lookup_fd = safe_close(user_lookup_fd);
4559
6732edab
LP
4560 r = acquire_home(context, uid, &home, &home_buffer);
4561 if (r < 0) {
4562 *exit_status = EXIT_CHDIR;
12145637 4563 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4564 }
4565
d35fbf6b
DM
4566 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4567 * must sure to drop O_NONBLOCK */
4568 if (socket_fd >= 0)
a34ceba6 4569 (void) fd_nonblock(socket_fd, false);
acbb0225 4570
4c70a4a7
MS
4571 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4572 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4573 if (params->cgroup_path) {
4574 _cleanup_free_ char *p = NULL;
4575
4576 r = exec_parameters_get_cgroup_path(params, &p);
4577 if (r < 0) {
4578 *exit_status = EXIT_CGROUP;
4579 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4580 }
4581
4582 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4583 if (r == -EUCLEAN) {
4584 *exit_status = EXIT_CGROUP;
4585 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4586 "because the cgroup or one of its parents or "
4587 "siblings is in the threaded mode: %m", p);
4588 }
4c70a4a7
MS
4589 if (r < 0) {
4590 *exit_status = EXIT_CGROUP;
4591 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4592 }
4593 }
4594
a8d08f39 4595 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4596 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4597 if (r < 0) {
4598 *exit_status = EXIT_NETWORK;
4599 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4600 }
4601 }
4602
a70581ff
XR
4603 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4604 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4605 if (r < 0) {
4606 *exit_status = EXIT_NAMESPACE;
4607 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4608 }
4609 }
4610
52c239d7 4611 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4612 if (r < 0) {
4613 *exit_status = EXIT_STDIN;
12145637 4614 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4615 }
034c6ed7 4616
52c239d7 4617 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4618 if (r < 0) {
4619 *exit_status = EXIT_STDOUT;
12145637 4620 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4621 }
4622
52c239d7 4623 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4624 if (r < 0) {
4625 *exit_status = EXIT_STDERR;
12145637 4626 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4627 }
4628
d35fbf6b 4629 if (context->oom_score_adjust_set) {
9f8168eb
LP
4630 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4631 * prohibit write access to this file, and we shouldn't trip up over that. */
4632 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4633 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4634 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4635 else if (r < 0) {
ff0af2a1 4636 *exit_status = EXIT_OOM_ADJUST;
12145637 4637 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4638 }
d35fbf6b
DM
4639 }
4640
ad21e542
ZJS
4641 if (context->coredump_filter_set) {
4642 r = set_coredump_filter(context->coredump_filter);
4643 if (ERRNO_IS_PRIVILEGE(r))
4644 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4645 else if (r < 0)
4646 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4647 }
4648
39090201
DJL
4649 if (context->nice_set) {
4650 r = setpriority_closest(context->nice);
4651 if (r < 0)
4652 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4653 }
613b411c 4654
d35fbf6b
DM
4655 if (context->cpu_sched_set) {
4656 struct sched_param param = {
4657 .sched_priority = context->cpu_sched_priority,
4658 };
4659
ff0af2a1
LP
4660 r = sched_setscheduler(0,
4661 context->cpu_sched_policy |
4662 (context->cpu_sched_reset_on_fork ?
4663 SCHED_RESET_ON_FORK : 0),
4664 &param);
4665 if (r < 0) {
4666 *exit_status = EXIT_SETSCHEDULER;
12145637 4667 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4668 }
d35fbf6b 4669 }
fc9b2a84 4670
e2b2fb7f
MS
4671 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4672 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4673 const CPUSet *cpu_set;
4674
4675 if (context->cpu_affinity_from_numa) {
4676 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4677 if (r < 0) {
4678 *exit_status = EXIT_CPUAFFINITY;
4679 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4680 }
4681
4682 cpu_set = &converted_cpu_set;
4683 } else
4684 cpu_set = &context->cpu_set;
4685
4686 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4687 *exit_status = EXIT_CPUAFFINITY;
12145637 4688 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4689 }
e2b2fb7f 4690 }
034c6ed7 4691
b070c7c0
MS
4692 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4693 r = apply_numa_policy(&context->numa_policy);
1406bd66
LP
4694 if (r < 0) {
4695 if (ERRNO_IS_NOT_SUPPORTED(r))
4696 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4697 else {
4698 *exit_status = EXIT_NUMA_POLICY;
4699 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4700 }
b070c7c0
MS
4701 }
4702 }
4703
d35fbf6b
DM
4704 if (context->ioprio_set)
4705 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4706 *exit_status = EXIT_IOPRIO;
12145637 4707 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4708 }
da726a4d 4709
d35fbf6b
DM
4710 if (context->timer_slack_nsec != NSEC_INFINITY)
4711 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4712 *exit_status = EXIT_TIMERSLACK;
12145637 4713 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4714 }
9eba9da4 4715
21022b9d
LP
4716 if (context->personality != PERSONALITY_INVALID) {
4717 r = safe_personality(context->personality);
4718 if (r < 0) {
ff0af2a1 4719 *exit_status = EXIT_PERSONALITY;
12145637 4720 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4721 }
21022b9d 4722 }
94f04347 4723
33331d11
VB
4724 if (context->utmp_id) {
4725 const char *line = context->tty_path ?
4726 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4727 NULL;
df0ff127 4728 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4729 line,
023a4f67
LP
4730 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4731 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4732 USER_PROCESS,
6a93917d 4733 username);
33331d11 4734 }
d35fbf6b 4735
08f67696 4736 if (uid_is_valid(uid)) {
ff0af2a1
LP
4737 r = chown_terminal(STDIN_FILENO, uid);
4738 if (r < 0) {
4739 *exit_status = EXIT_STDIN;
12145637 4740 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4741 }
d35fbf6b 4742 }
8e274523 4743
6bb00842
LP
4744 if (params->cgroup_path) {
4745 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4746 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4747 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4748 * touch a single hierarchy too. */
4749
4750 if (params->flags & EXEC_CGROUP_DELEGATE) {
4751 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4752 if (r < 0) {
4753 *exit_status = EXIT_CGROUP;
4754 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4755 }
4756 }
4757
4758 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4759 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4760 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4761 if (r < 0) {
4762 *exit_status = EXIT_MEMORY;
4763 return log_oom();
4764 }
4765
4766 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4767 if (r < 0) {
4768 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4769 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4770 memory_pressure_path = mfree(memory_pressure_path);
4771 }
4772 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4773 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4774 if (!memory_pressure_path) {
4775 *exit_status = EXIT_MEMORY;
4776 return log_oom();
4777 }
4778 }
034c6ed7 4779 }
d35fbf6b 4780 }
034c6ed7 4781
211a3d87
LB
4782 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4783
5b10116e 4784 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4785 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4786 if (r < 0)
4787 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4788 }
94f04347 4789
bb0c0d6f
LP
4790 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4791 r = setup_credentials(context, params, unit->id, uid);
4792 if (r < 0) {
4793 *exit_status = EXIT_CREDENTIALS;
4794 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4795 }
4796 }
4797
7bce046b 4798 r = build_environment(
fd63e712 4799 unit,
7bce046b
LP
4800 context,
4801 params,
6bb00842 4802 cgroup_context,
7bce046b 4803 n_fds,
cd48e23f 4804 fdnames,
7bce046b
LP
4805 home,
4806 username,
4807 shell,
4808 journal_stream_dev,
4809 journal_stream_ino,
6bb00842 4810 memory_pressure_path,
7bce046b 4811 &our_env);
2065ca69
JW
4812 if (r < 0) {
4813 *exit_status = EXIT_MEMORY;
12145637 4814 return log_oom();
2065ca69
JW
4815 }
4816
4817 r = build_pass_environment(context, &pass_env);
4818 if (r < 0) {
4819 *exit_status = EXIT_MEMORY;
12145637 4820 return log_oom();
2065ca69
JW
4821 }
4822
adf769b0
ZJS
4823 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4824 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4825 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4826 if (!strv_isempty(context->exec_search_path)) {
4827 _cleanup_free_ char *joined = NULL;
4828
4829 joined = strv_join(context->exec_search_path, ":");
4830 if (!joined) {
4831 *exit_status = EXIT_MEMORY;
4832 return log_oom();
4833 }
4834
4835 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4836 if (r < 0) {
4837 *exit_status = EXIT_MEMORY;
4838 return log_oom();
4839 }
4840 }
4841
4ab3d29f 4842 accum_env = strv_env_merge(params->environment,
2065ca69 4843 our_env,
8c35c10d 4844 joined_exec_search_path,
2065ca69
JW
4845 pass_env,
4846 context->environment,
44e5d006 4847 files_env);
2065ca69
JW
4848 if (!accum_env) {
4849 *exit_status = EXIT_MEMORY;
12145637 4850 return log_oom();
2065ca69 4851 }
1280503b 4852 accum_env = strv_env_clean(accum_env);
2065ca69 4853
096424d1 4854 (void) umask(context->umask);
b213e1c1 4855
b1edf445 4856 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4857 if (r < 0) {
4858 *exit_status = EXIT_KEYRING;
12145637 4859 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4860 }
4861
adf769b0
ZJS
4862 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4863 * from it. */
1703fa41 4864 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4865
adf769b0
ZJS
4866 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4867 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4868 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4869
adf769b0
ZJS
4870 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4871 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4872 * desired. */
165a31c0
LP
4873 if (needs_ambient_hack)
4874 needs_setuid = false;
4875 else
4876 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4877
638fd8cc
LP
4878 uint64_t capability_ambient_set = context->capability_ambient_set;
4879
165a31c0 4880 if (needs_sandboxing) {
adf769b0
ZJS
4881 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4882 * /sys being present. The actual MAC context application will happen later, as late as
4883 * possible, to avoid impacting our own code paths. */
7f18ef0a 4884
349cc4a5 4885#if HAVE_SELINUX
43b1f709 4886 use_selinux = mac_selinux_use();
7f18ef0a 4887#endif
f9fa32f0 4888#if ENABLE_SMACK
43b1f709 4889 use_smack = mac_smack_use();
7f18ef0a 4890#endif
349cc4a5 4891#if HAVE_APPARMOR
43b1f709 4892 use_apparmor = mac_apparmor_use();
7f18ef0a 4893#endif
165a31c0 4894 }
7f18ef0a 4895
ce932d2d
LP
4896 if (needs_sandboxing) {
4897 int which_failed;
4898
4899 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4900 * is set here. (See below.) */
4901
4902 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4903 if (r < 0) {
4904 *exit_status = EXIT_LIMITS;
4905 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4906 }
4907 }
4908
0af07108 4909 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4910 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4911 * wins here. (See above.) */
4912
1da37e58 4913 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4914 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4915 if (r < 0) {
4916 *exit_status = EXIT_PAM;
4917 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4918 }
ac45f971 4919
638fd8cc
LP
4920 if (ambient_capabilities_supported()) {
4921 uint64_t ambient_after_pam;
4922
4923 /* PAM modules might have set some ambient caps. Query them here and merge them into
4924 * the caps we want to set in the end, so that we don't end up unsetting them. */
4925 r = capability_get_ambient(&ambient_after_pam);
4926 if (r < 0) {
4927 *exit_status = EXIT_CAPABILITIES;
4928 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4929 }
4930
4931 capability_ambient_set |= ambient_after_pam;
4932 }
4933
0af07108
ZJS
4934 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4935 if (ngids_after_pam < 0) {
4936 *exit_status = EXIT_MEMORY;
4937 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4938 }
b213e1c1 4939 }
5749f855 4940
26c45a6c 4941 if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
5749f855
AZ
4942 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4943 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4944 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4945
4946 userns_set_up = true;
4947 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4948 if (r < 0) {
4949 *exit_status = EXIT_USER;
4950 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4951 }
4952 }
4953
fbbb9697 4954 if (exec_needs_network_namespace(context) && runtime && runtime->netns_storage_socket[0] >= 0) {
a8d08f39 4955
6e2d7c4f 4956 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4957 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
1406bd66
LP
4958 if (r < 0) {
4959 if (ERRNO_IS_PRIVILEGE(r))
4960 log_unit_warning_errno(unit, r,
4961 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4962 else {
4963 *exit_status = EXIT_NETWORK;
4964 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4965 }
6e2d7c4f 4966 }
a8d08f39
LP
4967 } else if (context->network_namespace_path) {
4968 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4969 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4970 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4971 } else
4972 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4973 }
169c1bda 4974
fde36d25 4975 if (exec_needs_ipc_namespace(context) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
a70581ff
XR
4976
4977 if (ns_type_supported(NAMESPACE_IPC)) {
4978 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4979 if (r == -EPERM)
4980 log_unit_warning_errno(unit, r,
4981 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4982 else if (r < 0) {
4983 *exit_status = EXIT_NAMESPACE;
4984 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4985 }
4986 } else if (context->ipc_namespace_path) {
4987 *exit_status = EXIT_NAMESPACE;
4988 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4989 "IPCNamespacePath= is not supported, refusing.");
4990 } else
4991 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4992 }
4993
ee818b89 4994 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4995 _cleanup_free_ char *error_path = NULL;
4996
d4b6ec98 4997 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
3fbe8dbe
LP
4998 if (r < 0) {
4999 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
5000 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
5001 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 5002 }
d35fbf6b 5003 }
81a2b7ce 5004
daf8f72b
LP
5005 if (needs_sandboxing) {
5006 r = apply_protect_hostname(unit, context, exit_status);
5007 if (r < 0)
5008 return r;
aecd5ac6
TM
5009 }
5010
5749f855
AZ
5011 /* Drop groups as early as possible.
5012 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
5013 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 5014 if (needs_setuid) {
afb11bf1
DG
5015 _cleanup_free_ gid_t *gids_to_enforce = NULL;
5016 int ngids_to_enforce = 0;
5017
5018 ngids_to_enforce = merge_gid_lists(supplementary_gids,
5019 ngids,
5020 gids_after_pam,
5021 ngids_after_pam,
5022 &gids_to_enforce);
5023 if (ngids_to_enforce < 0) {
5024 *exit_status = EXIT_MEMORY;
5025 return log_unit_error_errno(unit,
5026 ngids_to_enforce,
5027 "Failed to merge group lists. Group membership might be incorrect: %m");
5028 }
5029
5030 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
5031 if (r < 0) {
5032 *exit_status = EXIT_GROUP;
12145637 5033 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 5034 }
165a31c0 5035 }
096424d1 5036
5749f855
AZ
5037 /* If the user namespace was not set up above, try to do it now.
5038 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
d09df6b9 5039 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5749f855
AZ
5040 * case of mount namespaces being less privileged when the mount point list is copied from a
5041 * different user namespace). */
9008e1ac 5042
5749f855
AZ
5043 if (needs_sandboxing && context->private_users && !userns_set_up) {
5044 r = setup_private_users(saved_uid, saved_gid, uid, gid);
5045 if (r < 0) {
5046 *exit_status = EXIT_USER;
5047 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
5048 }
5049 }
5050
9f71ba8d
ZJS
5051 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
5052 * shall execute. */
5053
5054 _cleanup_free_ char *executable = NULL;
254d1313 5055 _cleanup_close_ int executable_fd = -EBADF;
8c35c10d 5056 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
5057 if (r < 0) {
5058 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
5059 log_unit_struct_errno(unit, LOG_INFO, r,
5060 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5061 LOG_UNIT_INVOCATION_ID(unit),
5062 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
5063 command->path),
5064 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
5065 return 0;
5066 }
5067
5068 *exit_status = EXIT_EXEC;
c2503e35
RH
5069
5070 return log_unit_struct_errno(unit, LOG_INFO, r,
5071 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5072 LOG_UNIT_INVOCATION_ID(unit),
5073 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
5074 command->path),
5075 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
5076 }
5077
b83d5050
ZJS
5078 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
5079 if (r < 0) {
5080 *exit_status = EXIT_FDS;
5081 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
5082 }
5083
9f71ba8d 5084#if HAVE_SELINUX
49590d67 5085 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
254d1313 5086 int fd = -EBADF;
49590d67
MS
5087
5088 if (socket_fd >= 0)
5089 fd = socket_fd;
5090 else if (params->n_socket_fds == 1)
5091 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
5092 * use context from that fd to compute the label. */
5093 fd = params->fds[0];
5094
5095 if (fd >= 0) {
5096 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
5097 if (r < 0) {
5098 if (!context->selinux_context_ignore) {
5099 *exit_status = EXIT_SELINUX_CONTEXT;
5100 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
5101 }
5102 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 5103 }
9f71ba8d
ZJS
5104 }
5105 }
5106#endif
5107
165a31c0 5108 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 5109 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
5110 * however if we have it as we want to keep it open until the final execve(). */
5111
1da37e58 5112 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
5113 if (r >= 0)
5114 r = shift_fds(fds, n_fds);
5115 if (r >= 0)
cd48e23f 5116 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
ff0af2a1
LP
5117 if (r < 0) {
5118 *exit_status = EXIT_FDS;
12145637 5119 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 5120 }
e66cf1a3 5121
5686391b
LP
5122 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5123 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5124 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5125 * came this far. */
5126
165a31c0 5127 secure_bits = context->secure_bits;
e66cf1a3 5128
165a31c0
LP
5129 if (needs_sandboxing) {
5130 uint64_t bset;
e66cf1a3 5131
ce932d2d
LP
5132 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
5133 * requested. (Note this is placed after the general resource limit initialization, see
5134 * above, in order to take precedence.) */
f4170c67
LP
5135 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5136 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5137 *exit_status = EXIT_LIMITS;
12145637 5138 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
5139 }
5140 }
5141
37ac2744
JB
5142#if ENABLE_SMACK
5143 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5144 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5145 if (use_smack) {
aa5ae971 5146 r = setup_smack(unit->manager, context, executable_fd);
29ff6247 5147 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
5148 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5149 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5150 }
5151 }
5152#endif
5153
165a31c0
LP
5154 bset = context->capability_bounding_set;
5155 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5156 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5157 * instead of us doing that */
5158 if (needs_ambient_hack)
5159 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5160 (UINT64_C(1) << CAP_SETUID) |
5161 (UINT64_C(1) << CAP_SETGID);
5162
5163 if (!cap_test_all(bset)) {
638fd8cc 5164 r = capability_bounding_set_drop(bset, /* right_now= */ false);
ff0af2a1
LP
5165 if (r < 0) {
5166 *exit_status = EXIT_CAPABILITIES;
12145637 5167 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 5168 }
4c2630eb 5169 }
3b8bddde 5170
16fcb191
TK
5171 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5172 * keep-caps set.
a954b249
LP
5173 *
5174 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5175 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5176 * the ambient capabilities can be raised as they are present in the permitted and
5177 * inheritable set. However it is possible that someone wants to set ambient capabilities
5178 * without changing the user, so we also set the ambient capabilities here.
5179 *
5180 * The requested ambient capabilities are raised in the inheritable set if the second
5181 * argument is true. */
943800f4 5182 if (!needs_ambient_hack) {
638fd8cc 5183 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
755d4b67
IP
5184 if (r < 0) {
5185 *exit_status = EXIT_CAPABILITIES;
12145637 5186 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 5187 }
755d4b67 5188 }
165a31c0 5189 }
755d4b67 5190
fa97f630
JB
5191 /* chroot to root directory first, before we lose the ability to chroot */
5192 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
5193 if (r < 0)
5194 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5195
165a31c0 5196 if (needs_setuid) {
08f67696 5197 if (uid_is_valid(uid)) {
638fd8cc 5198 r = enforce_user(context, uid, capability_ambient_set);
ff0af2a1
LP
5199 if (r < 0) {
5200 *exit_status = EXIT_USER;
12145637 5201 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 5202 }
165a31c0 5203
638fd8cc 5204 if (!needs_ambient_hack && capability_ambient_set != 0) {
755d4b67 5205
16fcb191 5206 /* Raise the ambient capabilities after user change. */
638fd8cc 5207 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
755d4b67
IP
5208 if (r < 0) {
5209 *exit_status = EXIT_CAPABILITIES;
12145637 5210 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 5211 }
755d4b67 5212 }
5b6319dc 5213 }
165a31c0 5214 }
d35fbf6b 5215
56ef8db9
JB
5216 /* Apply working directory here, because the working directory might be on NFS and only the user running
5217 * this service might have the correct privilege to change to the working directory */
fa97f630 5218 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
5219 if (r < 0)
5220 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5221
165a31c0 5222 if (needs_sandboxing) {
37ac2744 5223 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
5224 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5225 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5226 * are restricted. */
5227
349cc4a5 5228#if HAVE_SELINUX
43b1f709 5229 if (use_selinux) {
5cd9cd35
LP
5230 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5231
5232 if (exec_context) {
5233 r = setexeccon(exec_context);
006d1864
TM
5234 if (r < 0) {
5235 if (!context->selinux_context_ignore) {
5236 *exit_status = EXIT_SELINUX_CONTEXT;
5237 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5238 }
5239 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
5240 }
5241 }
5242 }
5243#endif
5244
349cc4a5 5245#if HAVE_APPARMOR
43b1f709 5246 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
5247 r = aa_change_onexec(context->apparmor_profile);
5248 if (r < 0 && !context->apparmor_profile_ignore) {
5249 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 5250 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
5251 }
5252 }
5253#endif
5254
a954b249
LP
5255 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5256 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5257 * requires CAP_SETPCAP. */
dbdc4098 5258 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 5259 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098 5260 * effective set here.
a954b249
LP
5261 *
5262 * The effective set is overwritten during execve() with the following values:
5263 *
dbdc4098 5264 * - ambient set (for non-root processes)
a954b249 5265 *
dbdc4098
TK
5266 * - (inheritable | bounding) set (for root processes)
5267 *
5268 * Hence there is no security impact to raise it in the effective set before execve
5269 */
a954b249 5270 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
dbdc4098
TK
5271 if (r < 0) {
5272 *exit_status = EXIT_CAPABILITIES;
5273 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5274 }
755d4b67 5275 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 5276 *exit_status = EXIT_SECUREBITS;
12145637 5277 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 5278 }
dbdc4098 5279 }
5b6319dc 5280
59eeb84b 5281 if (context_has_no_new_privileges(context))
d35fbf6b 5282 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 5283 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 5284 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
5285 }
5286
349cc4a5 5287#if HAVE_SECCOMP
469830d1
LP
5288 r = apply_address_families(unit, context);
5289 if (r < 0) {
5290 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 5291 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 5292 }
04aa0cb9 5293
469830d1
LP
5294 r = apply_memory_deny_write_execute(unit, context);
5295 if (r < 0) {
5296 *exit_status = EXIT_SECCOMP;
12145637 5297 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5298 }
f4170c67 5299
469830d1
LP
5300 r = apply_restrict_realtime(unit, context);
5301 if (r < 0) {
5302 *exit_status = EXIT_SECCOMP;
12145637 5303 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5304 }
5305
f69567cb
LP
5306 r = apply_restrict_suid_sgid(unit, context);
5307 if (r < 0) {
5308 *exit_status = EXIT_SECCOMP;
5309 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5310 }
5311
add00535
LP
5312 r = apply_restrict_namespaces(unit, context);
5313 if (r < 0) {
5314 *exit_status = EXIT_SECCOMP;
12145637 5315 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5316 }
5317
469830d1
LP
5318 r = apply_protect_sysctl(unit, context);
5319 if (r < 0) {
5320 *exit_status = EXIT_SECCOMP;
12145637 5321 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5322 }
5323
469830d1
LP
5324 r = apply_protect_kernel_modules(unit, context);
5325 if (r < 0) {
5326 *exit_status = EXIT_SECCOMP;
12145637 5327 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5328 }
5329
84703040
KK
5330 r = apply_protect_kernel_logs(unit, context);
5331 if (r < 0) {
5332 *exit_status = EXIT_SECCOMP;
5333 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5334 }
5335
fc64760d
KK
5336 r = apply_protect_clock(unit, context);
5337 if (r < 0) {
5338 *exit_status = EXIT_SECCOMP;
5339 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5340 }
5341
469830d1
LP
5342 r = apply_private_devices(unit, context);
5343 if (r < 0) {
5344 *exit_status = EXIT_SECCOMP;
12145637 5345 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5346 }
5347
5348 r = apply_syscall_archs(unit, context);
5349 if (r < 0) {
5350 *exit_status = EXIT_SECCOMP;
12145637 5351 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5352 }
5353
78e864e5
TM
5354 r = apply_lock_personality(unit, context);
5355 if (r < 0) {
5356 *exit_status = EXIT_SECCOMP;
12145637 5357 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5358 }
5359
9df2cdd8
TM
5360 r = apply_syscall_log(unit, context);
5361 if (r < 0) {
5362 *exit_status = EXIT_SECCOMP;
5363 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5364 }
5365
5cd9cd35
LP
5366 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5367 * by the filter as little as possible. */
165a31c0 5368 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5369 if (r < 0) {
5370 *exit_status = EXIT_SECCOMP;
12145637 5371 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5372 }
5373#endif
b1994387
ILG
5374
5375#if HAVE_LIBBPF
5376 r = apply_restrict_filesystems(unit, context);
5377 if (r < 0) {
5378 *exit_status = EXIT_BPF;
5379 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5380 }
5381#endif
5382
d35fbf6b 5383 }
034c6ed7 5384
00819cc1
LP
5385 if (!strv_isempty(context->unset_environment)) {
5386 char **ee = NULL;
5387
5388 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5389 if (!ee) {
5390 *exit_status = EXIT_MEMORY;
12145637 5391 return log_oom();
00819cc1
LP
5392 }
5393
130d3d22 5394 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5395 }
5396
7ca69792
AZ
5397 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5398 replaced_argv = replace_env_argv(command->argv, accum_env);
5399 if (!replaced_argv) {
5400 *exit_status = EXIT_MEMORY;
5401 return log_oom();
5402 }
5403 final_argv = replaced_argv;
5404 } else
5405 final_argv = command->argv;
034c6ed7 5406
f1d34068 5407 if (DEBUG_LOGGING) {
c2b2df60 5408 _cleanup_free_ char *line = NULL;
81a2b7ce 5409
4ef15008 5410 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
5411 if (!line) {
5412 *exit_status = EXIT_MEMORY;
5413 return log_oom();
5414 }
5415
5416 log_unit_struct(unit, LOG_DEBUG,
5417 "EXECUTABLE=%s", executable,
5418 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 5419 }
dd305ec9 5420
5686391b
LP
5421 if (exec_fd >= 0) {
5422 uint8_t hot = 1;
5423
5424 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5425 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5426
5427 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5428 *exit_status = EXIT_EXEC;
5429 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5430 }
5431 }
5432
a6d9111c 5433 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5434
5435 if (exec_fd >= 0) {
5436 uint8_t hot = 0;
5437
5438 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5439 * that POLLHUP on it no longer means execve() succeeded. */
5440
5441 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5442 *exit_status = EXIT_EXEC;
5443 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5444 }
5445 }
12145637 5446
ff0af2a1 5447 *exit_status = EXIT_EXEC;
9f71ba8d 5448 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5449}
81a2b7ce 5450
34cf6c43 5451static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5452static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5453
f2341e0a
LP
5454int exec_spawn(Unit *unit,
5455 ExecCommand *command,
d35fbf6b
DM
5456 const ExecContext *context,
5457 const ExecParameters *params,
5458 ExecRuntime *runtime,
29206d46 5459 DynamicCreds *dcreds,
6bb00842 5460 const CGroupContext *cgroup_context,
d35fbf6b 5461 pid_t *ret) {
8351ceae 5462
ee39ca20 5463 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5464 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5465 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5466 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 5467 _cleanup_free_ char *line = NULL;
d35fbf6b 5468 pid_t pid;
8351ceae 5469
f2341e0a 5470 assert(unit);
d35fbf6b
DM
5471 assert(command);
5472 assert(context);
5473 assert(ret);
5474 assert(params);
25b583d7 5475 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5476
d35fbf6b
DM
5477 if (context->std_input == EXEC_INPUT_SOCKET ||
5478 context->std_output == EXEC_OUTPUT_SOCKET ||
5479 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5480
d85ff944
YW
5481 if (params->n_socket_fds > 1)
5482 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5483
d85ff944
YW
5484 if (params->n_socket_fds == 0)
5485 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5486
d35fbf6b
DM
5487 socket_fd = params->fds[0];
5488 } else {
254d1313 5489 socket_fd = -EBADF;
d35fbf6b 5490 fds = params->fds;
9b141911 5491 n_socket_fds = params->n_socket_fds;
25b583d7 5492 n_storage_fds = params->n_storage_fds;
d35fbf6b 5493 }
94f04347 5494
34cf6c43 5495 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5496 if (r < 0)
5497 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5498
f2341e0a 5499 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5500 if (r < 0)
f2341e0a 5501 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5502
4ef15008 5503 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
5504 if (!line)
5505 return log_oom();
fab56fc5 5506
9f71ba8d
ZJS
5507 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5508 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5509 mac_selinux_maybe_reload();
5510
c2503e35
RH
5511 log_unit_struct(unit, LOG_DEBUG,
5512 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5513 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5514 the mount namespace in the child, but we want to log
5515 from the parent, so we need to use the (possibly
5516 inaccurate) path here. */
5517 LOG_UNIT_INVOCATION_ID(unit));
12145637 5518
78f93209
LP
5519 if (params->cgroup_path) {
5520 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5521 if (r < 0)
5522 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5523 if (r > 0) { /* We are using a child cgroup */
5524 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5525 if (r < 0)
5526 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa 5527
523ea123 5528 /* Normally we would not propagate the xattrs to children but since we created this
4e806bfa
AZ
5529 * sub-cgroup internally we should do it. */
5530 cgroup_oomd_xattr_apply(unit, subcgroup_path);
523ea123 5531 cgroup_log_xattr_apply(unit, subcgroup_path);
78f93209
LP
5532 }
5533 }
5534
d35fbf6b
DM
5535 pid = fork();
5536 if (pid < 0)
74129a12 5537 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5538
5539 if (pid == 0) {
12145637 5540 int exit_status = EXIT_SUCCESS;
ff0af2a1 5541
f2341e0a
LP
5542 r = exec_child(unit,
5543 command,
ff0af2a1
LP
5544 context,
5545 params,
5546 runtime,
29206d46 5547 dcreds,
6bb00842 5548 cgroup_context,
ff0af2a1 5549 socket_fd,
52c239d7 5550 named_iofds,
4c47affc 5551 fds,
9b141911 5552 n_socket_fds,
25b583d7 5553 n_storage_fds,
ff0af2a1 5554 files_env,
00d9ef85 5555 unit->manager->user_lookup_fds[1],
12145637
LP
5556 &exit_status);
5557
e1714f02
ZJS
5558 if (r < 0) {
5559 const char *status =
5560 exit_status_to_string(exit_status,
e04ed6db 5561 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5562
c2503e35
RH
5563 log_unit_struct_errno(unit, LOG_ERR, r,
5564 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5565 LOG_UNIT_INVOCATION_ID(unit),
5566 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5567 status, command->path),
5568 "EXECUTABLE=%s", command->path);
e1714f02 5569 }
4c2630eb 5570
ff0af2a1 5571 _exit(exit_status);
034c6ed7
LP
5572 }
5573
f2341e0a 5574 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5575
78f93209
LP
5576 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5577 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5578 * process will be killed too). */
5579 if (subcgroup_path)
5580 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5581
b58b4116 5582 exec_status_start(&command->exec_status, pid);
9fb86720 5583
034c6ed7 5584 *ret = pid;
5cb5a6ff
LP
5585 return 0;
5586}
5587
034c6ed7
LP
5588void exec_context_init(ExecContext *c) {
5589 assert(c);
5590
4c12626c 5591 c->umask = 0022;
0692548c 5592 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5593 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5594 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5595 c->syslog_level_prefix = true;
353e12c2 5596 c->ignore_sigpipe = true;
3a43da28 5597 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5598 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5599 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5600 c->directories[t].mode = 0755;
12213aed 5601 c->timeout_clean_usec = USEC_INFINITY;
3fd5190b 5602 c->capability_bounding_set = CAP_MASK_UNSET;
aa9d574d
YW
5603 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5604 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5605 c->log_level_max = -1;
005bfaf1
TM
5606#if HAVE_SECCOMP
5607 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5608#endif
51462135
DDM
5609 c->tty_rows = UINT_MAX;
5610 c->tty_cols = UINT_MAX;
b070c7c0 5611 numa_policy_reset(&c->numa_policy);
24002121 5612 c->private_mounts = -1;
034c6ed7
LP
5613}
5614
613b411c 5615void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5616 assert(c);
5617
6796073e
LP
5618 c->environment = strv_free(c->environment);
5619 c->environment_files = strv_free(c->environment_files);
b4c14404 5620 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5621 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5622
31ce987c 5623 rlimit_free_all(c->rlimit);
034c6ed7 5624
5b10116e 5625 for (size_t l = 0; l < 3; l++) {
52c239d7 5626 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5627 c->stdio_file[l] = mfree(c->stdio_file[l]);
5628 }
52c239d7 5629
a1e58e8e
LP
5630 c->working_directory = mfree(c->working_directory);
5631 c->root_directory = mfree(c->root_directory);
915e6d16 5632 c->root_image = mfree(c->root_image);
18d73705 5633 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5634 c->root_hash = mfree(c->root_hash);
5635 c->root_hash_size = 0;
5636 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5637 c->root_hash_sig = mfree(c->root_hash_sig);
5638 c->root_hash_sig_size = 0;
5639 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5640 c->root_verity = mfree(c->root_verity);
93f59701 5641 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5642 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5643 c->tty_path = mfree(c->tty_path);
5644 c->syslog_identifier = mfree(c->syslog_identifier);
5645 c->user = mfree(c->user);
5646 c->group = mfree(c->group);
034c6ed7 5647
6796073e 5648 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5649
a1e58e8e 5650 c->pam_name = mfree(c->pam_name);
5b6319dc 5651
2a624c36
AP
5652 c->read_only_paths = strv_free(c->read_only_paths);
5653 c->read_write_paths = strv_free(c->read_write_paths);
5654 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5655 c->exec_paths = strv_free(c->exec_paths);
5656 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5657 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5658
d2d6c096 5659 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5660 c->bind_mounts = NULL;
5661 c->n_bind_mounts = 0;
2abd4e38
YW
5662 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5663 c->temporary_filesystems = NULL;
5664 c->n_temporary_filesystems = 0;
b3d13314 5665 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5666
0985c7c4 5667 cpu_set_reset(&c->cpu_set);
b070c7c0 5668 numa_policy_reset(&c->numa_policy);
86a3475b 5669
a1e58e8e
LP
5670 c->utmp_id = mfree(c->utmp_id);
5671 c->selinux_context = mfree(c->selinux_context);
5672 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5673 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5674
b1994387
ILG
5675 c->restrict_filesystems = set_free(c->restrict_filesystems);
5676
8cfa775f 5677 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5678 c->syscall_archs = set_free(c->syscall_archs);
5679 c->address_families = set_free(c->address_families);
e66cf1a3 5680
5b10116e 5681 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5682 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5683
5684 c->log_level_max = -1;
5685
5686 exec_context_free_log_extra_fields(c);
523ea123
QD
5687 c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5688 c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
08f3be7a 5689
5ac1530e
ZJS
5690 c->log_ratelimit_interval_usec = 0;
5691 c->log_ratelimit_burst = 0;
90fc172e 5692
08f3be7a
LP
5693 c->stdin_data = mfree(c->stdin_data);
5694 c->stdin_data_size = 0;
a8d08f39
LP
5695
5696 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5697 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5698
5699 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5700
43144be4 5701 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5702 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5703}
5704
34cf6c43 5705int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5706 assert(c);
5707
5708 if (!runtime_prefix)
5709 return 0;
5710
211a3d87 5711 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5712 _cleanup_free_ char *p = NULL;
e66cf1a3 5713
494d0247 5714 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5715 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5716 else
211a3d87 5717 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5718 if (!p)
5719 return -ENOMEM;
5720
7bc4bf4a
LP
5721 /* We execute this synchronously, since we need to be sure this is gone when we start the
5722 * service next. */
c6878637 5723 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5724
211a3d87
LB
5725 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5726 _cleanup_free_ char *symlink_abs = NULL;
5727
5728 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5729 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5730 else
5731 symlink_abs = path_join(runtime_prefix, *symlink);
5732 if (!symlink_abs)
5733 return -ENOMEM;
5734
5735 (void) unlink(symlink_abs);
5736 }
e66cf1a3
LP
5737 }
5738
5739 return 0;
5cb5a6ff
LP
5740}
5741
bb0c0d6f
LP
5742int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5743 _cleanup_free_ char *p = NULL;
5744
5745 assert(c);
5746
5747 if (!runtime_prefix || !unit)
5748 return 0;
5749
5750 p = path_join(runtime_prefix, "credentials", unit);
5751 if (!p)
5752 return -ENOMEM;
5753
5754 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5755 * unmount it, and afterwards remove the mount point */
5756 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5757 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5758
5759 return 0;
5760}
5761
b9f976fb
MK
5762int exec_context_destroy_mount_ns_dir(Unit *u) {
5763 _cleanup_free_ char *p = NULL;
5764
5765 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5766 return 0;
5767
5768 p = path_join("/run/systemd/propagate/", u->id);
5769 if (!p)
5770 return -ENOMEM;
5771
5772 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5773 if (rmdir(p) < 0 && errno != ENOENT)
5774 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5775
5776 return 0;
5777}
5778
34cf6c43 5779static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5780 assert(c);
5781
a1e58e8e 5782 c->path = mfree(c->path);
6796073e 5783 c->argv = strv_free(c->argv);
43d0fcbd
LP
5784}
5785
da6053d0 5786void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5787 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5788 exec_command_done(c+i);
5789}
5790
f1acf85a 5791ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5792 ExecCommand *i;
5793
5794 while ((i = c)) {
71fda00f 5795 LIST_REMOVE(command, c, i);
43d0fcbd 5796 exec_command_done(i);
5cb5a6ff
LP
5797 free(i);
5798 }
f1acf85a
ZJS
5799
5800 return NULL;
5cb5a6ff
LP
5801}
5802
da6053d0 5803void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5804 for (size_t i = 0; i < n; i++)
f1acf85a 5805 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5806}
5807
6a1d4d9f 5808void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5809 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5810 exec_status_reset(&c[i].exec_status);
5811}
5812
5813void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5814 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5815 LIST_FOREACH(command, z, c[i])
5816 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5817}
5818
039f0e70 5819typedef struct InvalidEnvInfo {
34cf6c43 5820 const Unit *unit;
039f0e70
LP
5821 const char *path;
5822} InvalidEnvInfo;
5823
5824static void invalid_env(const char *p, void *userdata) {
5825 InvalidEnvInfo *info = userdata;
5826
f2341e0a 5827 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5828}
5829
52c239d7
LB
5830const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5831 assert(c);
5832
5833 switch (fd_index) {
5073ff6b 5834
52c239d7
LB
5835 case STDIN_FILENO:
5836 if (c->std_input != EXEC_INPUT_NAMED_FD)
5837 return NULL;
5073ff6b 5838
52c239d7 5839 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5840
52c239d7
LB
5841 case STDOUT_FILENO:
5842 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5843 return NULL;
5073ff6b 5844
52c239d7 5845 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5846
52c239d7
LB
5847 case STDERR_FILENO:
5848 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5849 return NULL;
5073ff6b 5850
52c239d7 5851 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5852
52c239d7
LB
5853 default:
5854 return NULL;
5855 }
5856}
5857
2caa38e9
LP
5858static int exec_context_named_iofds(
5859 const ExecContext *c,
5860 const ExecParameters *p,
5861 int named_iofds[static 3]) {
5862
5b10116e 5863 size_t targets;
56fbd561 5864 const char* stdio_fdname[3];
da6053d0 5865 size_t n_fds;
52c239d7
LB
5866
5867 assert(c);
5868 assert(p);
2caa38e9 5869 assert(named_iofds);
52c239d7
LB
5870
5871 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5872 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5873 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5874
5b10116e 5875 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5876 stdio_fdname[i] = exec_context_fdname(c, i);
5877
4c47affc
FB
5878 n_fds = p->n_storage_fds + p->n_socket_fds;
5879
5b10116e 5880 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5881 if (named_iofds[STDIN_FILENO] < 0 &&
5882 c->std_input == EXEC_INPUT_NAMED_FD &&
5883 stdio_fdname[STDIN_FILENO] &&
5884 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5885
52c239d7
LB
5886 named_iofds[STDIN_FILENO] = p->fds[i];
5887 targets--;
56fbd561
ZJS
5888
5889 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5890 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5891 stdio_fdname[STDOUT_FILENO] &&
5892 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5893
52c239d7
LB
5894 named_iofds[STDOUT_FILENO] = p->fds[i];
5895 targets--;
56fbd561
ZJS
5896
5897 } else if (named_iofds[STDERR_FILENO] < 0 &&
5898 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5899 stdio_fdname[STDERR_FILENO] &&
5900 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5901
52c239d7
LB
5902 named_iofds[STDERR_FILENO] = p->fds[i];
5903 targets--;
5904 }
5905
56fbd561 5906 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5907}
5908
398a5009
ZJS
5909static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5910 _cleanup_strv_free_ char **v = NULL;
398a5009 5911 int r;
8c7be95e
LP
5912
5913 assert(c);
398a5009 5914 assert(ret);
8c7be95e
LP
5915
5916 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5917 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5918 bool ignore = false;
5919 char *fn = *i;
8c7be95e
LP
5920
5921 if (fn[0] == '-') {
5922 ignore = true;
313cefa1 5923 fn++;
8c7be95e
LP
5924 }
5925
5926 if (!path_is_absolute(fn)) {
8c7be95e
LP
5927 if (ignore)
5928 continue;
8c7be95e
LP
5929 return -EINVAL;
5930 }
5931
2bef10ab 5932 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5933 r = safe_glob(fn, 0, &pglob);
5934 if (r < 0) {
2bef10ab
PL
5935 if (ignore)
5936 continue;
398a5009 5937 return r;
2bef10ab 5938 }
8c7be95e 5939
d8c92e8b
ZJS
5940 /* When we don't match anything, -ENOENT should be returned */
5941 assert(pglob.gl_pathc > 0);
5942
5b10116e 5943 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5944 _cleanup_strv_free_ char **p = NULL;
5945
5946 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5947 if (r < 0) {
2bef10ab
PL
5948 if (ignore)
5949 continue;
398a5009 5950 return r;
e9c1ea9d 5951 }
398a5009 5952
ebc05a09 5953 /* Log invalid environment variables with filename */
039f0e70
LP
5954 if (p) {
5955 InvalidEnvInfo info = {
f2341e0a 5956 .unit = unit,
039f0e70
LP
5957 .path = pglob.gl_pathv[n]
5958 };
5959
5960 p = strv_env_clean_with_callback(p, invalid_env, &info);
5961 }
8c7be95e 5962
398a5009
ZJS
5963 if (!v)
5964 v = TAKE_PTR(p);
2bef10ab 5965 else {
398a5009 5966 char **m = strv_env_merge(v, p);
c84a9488 5967 if (!m)
2bef10ab 5968 return -ENOMEM;
2bef10ab 5969
398a5009 5970 strv_free_and_replace(v, m);
2bef10ab 5971 }
8c7be95e
LP
5972 }
5973 }
5974
398a5009 5975 *ret = TAKE_PTR(v);
8c7be95e
LP
5976
5977 return 0;
5978}
5979
6ac8fdc9 5980static bool tty_may_match_dev_console(const char *tty) {
7b912648 5981 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5982
1e22b5cd
LP
5983 if (!tty)
5984 return true;
5985
a119ec7c 5986 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5987
5988 /* trivial identity? */
5989 if (streq(tty, "console"))
5990 return true;
5991
7b912648
LP
5992 if (resolve_dev_console(&resolved) < 0)
5993 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5994
5995 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5996 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5997}
5998
6c0ae739
LP
5999static bool exec_context_may_touch_tty(const ExecContext *ec) {
6000 assert(ec);
1e22b5cd 6001
6c0ae739 6002 return ec->tty_reset ||
1e22b5cd
LP
6003 ec->tty_vhangup ||
6004 ec->tty_vt_disallocate ||
6ac8fdc9
MS
6005 is_terminal_input(ec->std_input) ||
6006 is_terminal_output(ec->std_output) ||
6c0ae739
LP
6007 is_terminal_output(ec->std_error);
6008}
6009
6010bool exec_context_may_touch_console(const ExecContext *ec) {
6011
6012 return exec_context_may_touch_tty(ec) &&
1e22b5cd 6013 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
6014}
6015
15ae422b 6016static void strv_fprintf(FILE *f, char **l) {
15ae422b
LP
6017 assert(f);
6018
6019 STRV_FOREACH(g, l)
6020 fprintf(f, " %s", *g);
6021}
6022
ddc155b2
TM
/* Writes one line of the form "<prefix><name>: item item ..." to 'f'. Nothing at all is written if the
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
6034
34cf6c43 6035void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 6036 int r;
9eba9da4 6037
5cb5a6ff
LP
6038 assert(c);
6039 assert(f);
6040
4ad49000 6041 prefix = strempty(prefix);
5cb5a6ff
LP
6042
6043 fprintf(f,
94f04347
LP
6044 "%sUMask: %04o\n"
6045 "%sWorkingDirectory: %s\n"
451a074f 6046 "%sRootDirectory: %s\n"
15ae422b 6047 "%sNonBlocking: %s\n"
64747e2d 6048 "%sPrivateTmp: %s\n"
7f112f50 6049 "%sPrivateDevices: %s\n"
59eeb84b 6050 "%sProtectKernelTunables: %s\n"
e66a2f65 6051 "%sProtectKernelModules: %s\n"
84703040 6052 "%sProtectKernelLogs: %s\n"
fc64760d 6053 "%sProtectClock: %s\n"
59eeb84b 6054 "%sProtectControlGroups: %s\n"
d251207d
LP
6055 "%sPrivateNetwork: %s\n"
6056 "%sPrivateUsers: %s\n"
1b8689f9
LP
6057 "%sProtectHome: %s\n"
6058 "%sProtectSystem: %s\n"
5d997827 6059 "%sMountAPIVFS: %s\n"
f3e43635 6060 "%sIgnoreSIGPIPE: %s\n"
f4170c67 6061 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 6062 "%sRestrictRealtime: %s\n"
f69567cb 6063 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 6064 "%sKeyringMode: %s\n"
4e399953
LP
6065 "%sProtectHostname: %s\n"
6066 "%sProtectProc: %s\n"
6067 "%sProcSubset: %s\n",
5cb5a6ff 6068 prefix, c->umask,
14eb3285
LP
6069 prefix, empty_to_root(c->working_directory),
6070 prefix, empty_to_root(c->root_directory),
15ae422b 6071 prefix, yes_no(c->non_blocking),
64747e2d 6072 prefix, yes_no(c->private_tmp),
7f112f50 6073 prefix, yes_no(c->private_devices),
59eeb84b 6074 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 6075 prefix, yes_no(c->protect_kernel_modules),
84703040 6076 prefix, yes_no(c->protect_kernel_logs),
fc64760d 6077 prefix, yes_no(c->protect_clock),
59eeb84b 6078 prefix, yes_no(c->protect_control_groups),
d251207d
LP
6079 prefix, yes_no(c->private_network),
6080 prefix, yes_no(c->private_users),
1b8689f9
LP
6081 prefix, protect_home_to_string(c->protect_home),
6082 prefix, protect_system_to_string(c->protect_system),
5e98086d 6083 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 6084 prefix, yes_no(c->ignore_sigpipe),
f4170c67 6085 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 6086 prefix, yes_no(c->restrict_realtime),
f69567cb 6087 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 6088 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
6089 prefix, yes_no(c->protect_hostname),
6090 prefix, protect_proc_to_string(c->protect_proc),
6091 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 6092
915e6d16
LP
6093 if (c->root_image)
6094 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
6095
18d73705 6096 if (c->root_image_options) {
18d73705
LB
6097 fprintf(f, "%sRootImageOptions:", prefix);
6098 LIST_FOREACH(mount_options, o, c->root_image_options)
6099 if (!isempty(o->options))
9ece6444
LB
6100 fprintf(f, " %s:%s",
6101 partition_designator_to_string(o->partition_designator),
6102 o->options);
18d73705
LB
6103 fprintf(f, "\n");
6104 }
6105
0389f4fa
LB
6106 if (c->root_hash) {
6107 _cleanup_free_ char *encoded = NULL;
6108 encoded = hexmem(c->root_hash, c->root_hash_size);
6109 if (encoded)
6110 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
6111 }
6112
6113 if (c->root_hash_path)
6114 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
6115
d4d55b0d
LB
6116 if (c->root_hash_sig) {
6117 _cleanup_free_ char *encoded = NULL;
6118 ssize_t len;
6119 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6120 if (len)
6121 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6122 }
6123
6124 if (c->root_hash_sig_path)
6125 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6126
0389f4fa
LB
6127 if (c->root_verity)
6128 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6129
8c7be95e
LP
6130 STRV_FOREACH(e, c->environment)
6131 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6132
6133 STRV_FOREACH(e, c->environment_files)
6134 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 6135
b4c14404
FB
6136 STRV_FOREACH(e, c->pass_environment)
6137 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6138
00819cc1
LP
6139 STRV_FOREACH(e, c->unset_environment)
6140 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6141
53f47dfc
YW
6142 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6143
5b10116e 6144 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
6145 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6146
211a3d87
LB
6147 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6148 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6149
6150 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6151 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6152 }
3536f49e 6153 }
c2bbd90b 6154
5291f26d 6155 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 6156
fb33a393 6157 if (c->nice_set)
5291f26d 6158 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 6159
dd6c17b1 6160 if (c->oom_score_adjust_set)
5291f26d 6161 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 6162
ad21e542 6163 if (c->coredump_filter_set)
5291f26d 6164 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 6165
5b10116e 6166 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 6167 if (c->rlimit[i]) {
4c3a2b84 6168 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 6169 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 6170 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
6171 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6172 }
94f04347 6173
f8b69d1d 6174 if (c->ioprio_set) {
1756a011 6175 _cleanup_free_ char *class_str = NULL;
f8b69d1d 6176
5bead76e 6177 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
6178 if (r >= 0)
6179 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6180
5bead76e 6181 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 6182 }
94f04347 6183
f8b69d1d 6184 if (c->cpu_sched_set) {
1756a011 6185 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 6186
837df140
YW
6187 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6188 if (r >= 0)
6189 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6190
94f04347 6191 fprintf(f,
38b48754
LP
6192 "%sCPUSchedulingPriority: %i\n"
6193 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
6194 prefix, c->cpu_sched_priority,
6195 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 6196 }
94f04347 6197
0985c7c4 6198 if (c->cpu_set.set) {
e7fca352
MS
6199 _cleanup_free_ char *affinity = NULL;
6200
6201 affinity = cpu_set_to_range_string(&c->cpu_set);
6202 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
6203 }
6204
b070c7c0
MS
6205 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6206 _cleanup_free_ char *nodes = NULL;
6207
6208 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6209 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6210 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6211 }
6212
3a43da28 6213 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 6214 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
6215
6216 fprintf(f,
80876c20
LP
6217 "%sStandardInput: %s\n"
6218 "%sStandardOutput: %s\n"
6219 "%sStandardError: %s\n",
6220 prefix, exec_input_to_string(c->std_input),
6221 prefix, exec_output_to_string(c->std_output),
6222 prefix, exec_output_to_string(c->std_error));
6223
befc4a80
LP
6224 if (c->std_input == EXEC_INPUT_NAMED_FD)
6225 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6226 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6227 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6228 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6229 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6230
6231 if (c->std_input == EXEC_INPUT_FILE)
6232 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6233 if (c->std_output == EXEC_OUTPUT_FILE)
6234 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
6235 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6236 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
6237 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6238 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
6239 if (c->std_error == EXEC_OUTPUT_FILE)
6240 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
6241 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6242 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
6243 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6244 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 6245
80876c20
LP
6246 if (c->tty_path)
6247 fprintf(f,
6ea832a2
LP
6248 "%sTTYPath: %s\n"
6249 "%sTTYReset: %s\n"
6250 "%sTTYVHangup: %s\n"
51462135
DDM
6251 "%sTTYVTDisallocate: %s\n"
6252 "%sTTYRows: %u\n"
6253 "%sTTYColumns: %u\n",
6ea832a2
LP
6254 prefix, c->tty_path,
6255 prefix, yes_no(c->tty_reset),
6256 prefix, yes_no(c->tty_vhangup),
51462135
DDM
6257 prefix, yes_no(c->tty_vt_disallocate),
6258 prefix, c->tty_rows,
6259 prefix, c->tty_cols);
94f04347 6260
9f6444eb 6261 if (IN_SET(c->std_output,
9f6444eb
LP
6262 EXEC_OUTPUT_KMSG,
6263 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
6264 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6265 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6266 IN_SET(c->std_error,
9f6444eb
LP
6267 EXEC_OUTPUT_KMSG,
6268 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
6269 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 6271
5ce70e5b 6272 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 6273
837df140
YW
6274 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6275 if (r >= 0)
6276 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 6277
837df140
YW
6278 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6279 if (r >= 0)
6280 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 6281 }
94f04347 6282
d3070fbd
LP
6283 if (c->log_level_max >= 0) {
6284 _cleanup_free_ char *t = NULL;
6285
6286 (void) log_level_to_string_alloc(c->log_level_max, &t);
6287
6288 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6289 }
6290
5291f26d 6291 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
6292 fprintf(f,
6293 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 6294 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 6295
5ac1530e
ZJS
6296 if (c->log_ratelimit_burst > 0)
6297 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 6298
523ea123
QD
6299 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6300 fprintf(f, "%sLogFilterPatterns:", prefix);
6301
6302 char *pattern;
6303 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6304 fprintf(f, " %s", pattern);
6305 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6306 fprintf(f, " ~%s", pattern);
6307 fputc('\n', f);
6308 }
6309
5b10116e
ZJS
6310 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6311 fprintf(f, "%sLogExtraFields: ", prefix);
6312 fwrite(c->log_extra_fields[j].iov_base,
6313 1, c->log_extra_fields[j].iov_len,
6314 f);
6315 fputc('\n', f);
d3070fbd
LP
6316 }
6317
91dd5f7c
LP
6318 if (c->log_namespace)
6319 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6320
07d46372
YW
6321 if (c->secure_bits) {
6322 _cleanup_free_ char *str = NULL;
6323
6324 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6325 if (r >= 0)
6326 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6327 }
94f04347 6328
3fd5190b 6329 if (c->capability_bounding_set != CAP_MASK_UNSET) {
dd1f5bd0 6330 _cleanup_free_ char *str = NULL;
94f04347 6331
8142d735 6332 r = capability_set_to_string(c->capability_bounding_set, &str);
dd1f5bd0
YW
6333 if (r >= 0)
6334 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6335 }
6336
6337 if (c->capability_ambient_set != 0) {
dd1f5bd0 6338 _cleanup_free_ char *str = NULL;
755d4b67 6339
8142d735 6340 r = capability_set_to_string(c->capability_ambient_set, &str);
dd1f5bd0
YW
6341 if (r >= 0)
6342 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6343 }
6344
6345 if (c->user)
f2d3769a 6346 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6347 if (c->group)
f2d3769a 6348 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6349
29206d46
LP
6350 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6351
ddc155b2 6352 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6353
5b6319dc 6354 if (c->pam_name)
f2d3769a 6355 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6356
ddc155b2
TM
6357 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6358 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6359 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6360 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6361 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6362 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6363
5b10116e
ZJS
6364 for (size_t i = 0; i < c->n_bind_mounts; i++)
6365 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6366 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6367 c->bind_mounts[i].ignore_enoent ? "-": "",
6368 c->bind_mounts[i].source,
6369 c->bind_mounts[i].destination,
6370 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6371
5b10116e
ZJS
6372 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6373 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6374
5b10116e
ZJS
6375 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6376 t->path,
6377 isempty(t->options) ? "" : ":",
6378 strempty(t->options));
6379 }
2abd4e38 6380
169c1bda
LP
6381 if (c->utmp_id)
6382 fprintf(f,
6383 "%sUtmpIdentifier: %s\n",
6384 prefix, c->utmp_id);
7b52a628
MS
6385
6386 if (c->selinux_context)
6387 fprintf(f,
5f8640fb
LP
6388 "%sSELinuxContext: %s%s\n",
6389 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6390
80c21aea
WC
6391 if (c->apparmor_profile)
6392 fprintf(f,
6393 "%sAppArmorProfile: %s%s\n",
6394 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6395
6396 if (c->smack_process_label)
6397 fprintf(f,
6398 "%sSmackProcessLabel: %s%s\n",
6399 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6400
050f7277 6401 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6402 fprintf(f,
6403 "%sPersonality: %s\n",
6404 prefix, strna(personality_to_string(c->personality)));
6405
78e864e5
TM
6406 fprintf(f,
6407 "%sLockPersonality: %s\n",
6408 prefix, yes_no(c->lock_personality));
6409
17df7223 6410 if (c->syscall_filter) {
17df7223 6411 fprintf(f,
57183d11 6412 "%sSystemCallFilter: ",
17df7223
LP
6413 prefix);
6414
6b000af4 6415 if (!c->syscall_allow_list)
17df7223
LP
6416 fputc('~', f);
6417
349cc4a5 6418#if HAVE_SECCOMP
d5a99b7c
JJ
6419 void *id, *val;
6420 bool first = true;
90e74a66 6421 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6422 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6423 const char *errno_name = NULL;
6424 int num = PTR_TO_INT(val);
17df7223
LP
6425
6426 if (first)
6427 first = false;
6428 else
6429 fputc(' ', f);
6430
57183d11 6431 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6432 fputs(strna(name), f);
8cfa775f
YW
6433
6434 if (num >= 0) {
005bfaf1 6435 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6436 if (errno_name)
6437 fprintf(f, ":%s", errno_name);
6438 else
6439 fprintf(f, ":%d", num);
6440 }
17df7223 6441 }
351a19b1 6442#endif
17df7223
LP
6443
6444 fputc('\n', f);
6445 }
6446
57183d11 6447 if (c->syscall_archs) {
57183d11
LP
6448 fprintf(f,
6449 "%sSystemCallArchitectures:",
6450 prefix);
6451
349cc4a5 6452#if HAVE_SECCOMP
d5a99b7c 6453 void *id;
90e74a66 6454 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6455 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6456#endif
6457 fputc('\n', f);
6458 }
6459
add00535
LP
6460 if (exec_context_restrict_namespaces_set(c)) {
6461 _cleanup_free_ char *s = NULL;
6462
86c2a9f1 6463 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6464 if (r >= 0)
6465 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6466 prefix, strna(s));
add00535
LP
6467 }
6468
b1994387 6469#if HAVE_LIBBPF
8fe84dc8
YW
6470 if (exec_context_restrict_filesystems_set(c)) {
6471 char *fs;
6472 SET_FOREACH(fs, c->restrict_filesystems)
6473 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6474 }
b1994387
ILG
6475#endif
6476
a8d08f39
LP
6477 if (c->network_namespace_path)
6478 fprintf(f,
6479 "%sNetworkNamespacePath: %s\n",
6480 prefix, c->network_namespace_path);
6481
3df90f24 6482 if (c->syscall_errno > 0) {
3df90f24
YW
6483 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6484
005bfaf1 6485#if HAVE_SECCOMP
d5a99b7c 6486 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6487 if (errno_name)
005bfaf1 6488 fputs(errno_name, f);
3df90f24 6489 else
005bfaf1
TM
6490 fprintf(f, "%d", c->syscall_errno);
6491#endif
6492 fputc('\n', f);
3df90f24 6493 }
b3d13314 6494
5b10116e 6495 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6496 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6497 c->mount_images[i].ignore_enoent ? "-": "",
6498 c->mount_images[i].source,
79e20ceb 6499 c->mount_images[i].destination);
427353f6 6500 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6501 fprintf(f, ":%s:%s",
427353f6 6502 partition_designator_to_string(o->partition_designator),
79e20ceb 6503 strempty(o->options));
427353f6
LB
6504 fprintf(f, "\n");
6505 }
93f59701
LB
6506
6507 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6508 fprintf(f, "%sExtensionImages: %s%s", prefix,
6509 c->extension_images[i].ignore_enoent ? "-": "",
6510 c->extension_images[i].source);
6511 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6512 fprintf(f, ":%s:%s",
6513 partition_designator_to_string(o->partition_designator),
6514 strempty(o->options));
6515 fprintf(f, "\n");
6516 }
a07b9926
LB
6517
6518 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6519}
6520
34cf6c43 6521bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6522 assert(c);
6523
61233823 6524 /* Returns true if the process forked off would run under
a931ad47
LP
6525 * an unchanged UID or as root. */
6526
6527 if (!c->user)
6528 return true;
6529
6530 if (streq(c->user, "root") || streq(c->user, "0"))
6531 return true;
6532
6533 return false;
6534}
6535
34cf6c43 6536int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6537 int p;
6538
6539 assert(c);
6540
6541 if (c->ioprio_set)
6542 return c->ioprio;
6543
6544 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6545 if (p < 0)
0692548c 6546 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6547
8b330d7d 6548 return ioprio_normalize(p);
7f452159
LP
6549}
6550
5e98086d
ZJS
6551bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6552 assert(c);
6553
61198784 6554 /* Explicit setting wins */
5e98086d
ZJS
6555 if (c->mount_apivfs_set)
6556 return c->mount_apivfs;
6557
61198784 6558 /* Default to "yes" if root directory or image are specified */
74e12520 6559 if (exec_context_with_rootfs(c))
61198784
ZJS
6560 return true;
6561
5e98086d
ZJS
6562 return false;
6563}
6564
d3070fbd 6565void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6566 assert(c);
6567
5b10116e 6568 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6569 free(c->log_extra_fields[l].iov_base);
6570 c->log_extra_fields = mfree(c->log_extra_fields);
6571 c->n_log_extra_fields = 0;
6572}
6573
6f765baf 6574void exec_context_revert_tty(ExecContext *c) {
254d1313 6575 _cleanup_close_ int fd = -EBADF;
0ba976e8
LP
6576 const char *path;
6577 struct stat st;
6f765baf
LP
6578 int r;
6579
6580 assert(c);
6581
6582 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6583 exec_context_tty_reset(c, NULL);
6584
6585 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6586 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6587 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6588 if (!exec_context_may_touch_tty(c))
6589 return;
6f765baf 6590
0ba976e8
LP
6591 path = exec_context_tty_path(c);
6592 if (!path)
6593 return;
6f765baf 6594
0ba976e8
LP
6595 fd = open(path, O_PATH|O_CLOEXEC);
6596 if (fd < 0)
6597 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6598 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6599 path);
6600
6601 if (fstat(fd, &st) < 0)
6602 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6603
6604 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6605 * if things are a character device, since a proper check either means we'd have to open the TTY and
6606 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6607 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6608 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6609 if (!S_ISCHR(st.st_mode))
6610 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6611
6612 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6613 if (r < 0)
6614 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6615}
6616
4c2f5842
LP
6617int exec_context_get_clean_directories(
6618 ExecContext *c,
6619 char **prefix,
6620 ExecCleanMask mask,
6621 char ***ret) {
6622
6623 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6624 int r;
6625
6626 assert(c);
6627 assert(prefix);
6628 assert(ret);
6629
5b10116e 6630 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6631 if (!FLAGS_SET(mask, 1U << t))
6632 continue;
6633
6634 if (!prefix[t])
6635 continue;
6636
211a3d87 6637 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6638 char *j;
6639
211a3d87 6640 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6641 if (!j)
6642 return -ENOMEM;
6643
6644 r = strv_consume(&l, j);
6645 if (r < 0)
6646 return r;
7f622a19
YW
6647
6648 /* Also remove private directories unconditionally. */
6649 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6650 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6651 if (!j)
6652 return -ENOMEM;
6653
6654 r = strv_consume(&l, j);
6655 if (r < 0)
6656 return r;
6657 }
6658
211a3d87
LB
6659 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6660 j = path_join(prefix[t], *symlink);
7f622a19
YW
6661 if (!j)
6662 return -ENOMEM;
6663
6664 r = strv_consume(&l, j);
6665 if (r < 0)
6666 return r;
6667 }
4c2f5842
LP
6668 }
6669 }
6670
6671 *ret = TAKE_PTR(l);
6672 return 0;
6673}
6674
6675int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6676 ExecCleanMask mask = 0;
6677
6678 assert(c);
6679 assert(ret);
6680
6681 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6682 if (c->directories[t].n_items > 0)
4c2f5842
LP
6683 mask |= 1U << t;
6684
6685 *ret = mask;
6686 return 0;
6687}
6688
b58b4116 6689void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6690 assert(s);
5cb5a6ff 6691
2ed26ed0
LP
6692 *s = (ExecStatus) {
6693 .pid = pid,
6694 };
6695
b58b4116
LP
6696 dual_timestamp_get(&s->start_timestamp);
6697}
6698
34cf6c43 6699void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6700 assert(s);
6701
d46b79bb 6702 if (s->pid != pid)
2ed26ed0
LP
6703 *s = (ExecStatus) {
6704 .pid = pid,
6705 };
b58b4116 6706
63983207 6707 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6708
034c6ed7
LP
6709 s->code = code;
6710 s->status = status;
169c1bda 6711
6f765baf
LP
6712 if (context && context->utmp_id)
6713 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6714}
6715
6a1d4d9f
LP
6716void exec_status_reset(ExecStatus *s) {
6717 assert(s);
6718
6719 *s = (ExecStatus) {};
6720}
6721
34cf6c43 6722void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6723 assert(s);
6724 assert(f);
6725
9fb86720
LP
6726 if (s->pid <= 0)
6727 return;
6728
4c940960
LP
6729 prefix = strempty(prefix);
6730
9fb86720 6731 fprintf(f,
ccd06097
ZJS
6732 "%sPID: "PID_FMT"\n",
6733 prefix, s->pid);
9fb86720 6734
af9d16e1 6735 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6736 fprintf(f,
6737 "%sStart Timestamp: %s\n",
04f5c018 6738 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6739
af9d16e1 6740 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6741 fprintf(f,
6742 "%sExit Timestamp: %s\n"
6743 "%sExit Code: %s\n"
6744 "%sExit Status: %i\n",
04f5c018 6745 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6746 prefix, sigchld_code_to_string(s->code),
6747 prefix, s->status);
5cb5a6ff 6748}
44d8db9e 6749
34cf6c43 6750static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6751 _cleanup_free_ char *cmd = NULL;
4c940960 6752 const char *prefix2;
44d8db9e
LP
6753
6754 assert(c);
6755 assert(f);
6756
4c940960 6757 prefix = strempty(prefix);
63c372cb 6758 prefix2 = strjoina(prefix, "\t");
44d8db9e 6759
4ef15008 6760 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
38553034 6761
44d8db9e
LP
6762 fprintf(f,
6763 "%sCommand Line: %s\n",
38553034 6764 prefix, strnull(cmd));
44d8db9e 6765
9fb86720 6766 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6767}
6768
6769void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6770 assert(f);
6771
4c940960 6772 prefix = strempty(prefix);
44d8db9e 6773
03677889
YW
6774 LIST_FOREACH(command, i, c)
6775 exec_command_dump(i, f, prefix);
44d8db9e 6776}
94f04347 6777
a6a80b4f
LP
6778void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6779 ExecCommand *end;
6780
6781 assert(l);
6782 assert(e);
6783
6784 if (*l) {
35b8ca3a 6785 /* It's kind of important, that we keep the order here */
cc232fa0 6786 end = LIST_FIND_TAIL(command, *l);
71fda00f 6787 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6788 } else
6789 *l = e;
6790}
6791
26fd040d
LP
6792int exec_command_set(ExecCommand *c, const char *path, ...) {
6793 va_list ap;
6794 char **l, *p;
6795
6796 assert(c);
6797 assert(path);
6798
6799 va_start(ap, path);
6800 l = strv_new_ap(path, ap);
6801 va_end(ap);
6802
6803 if (!l)
6804 return -ENOMEM;
6805
250a918d
LP
6806 p = strdup(path);
6807 if (!p) {
26fd040d
LP
6808 strv_free(l);
6809 return -ENOMEM;
6810 }
6811
6897dfe8 6812 free_and_replace(c->path, p);
26fd040d 6813
130d3d22 6814 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6815}
6816
86b23b07 6817int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6818 _cleanup_strv_free_ char **l = NULL;
86b23b07 6819 va_list ap;
86b23b07
JS
6820 int r;
6821
6822 assert(c);
6823 assert(path);
6824
6825 va_start(ap, path);
6826 l = strv_new_ap(path, ap);
6827 va_end(ap);
6828
6829 if (!l)
6830 return -ENOMEM;
6831
e287086b 6832 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6833 if (r < 0)
86b23b07 6834 return r;
86b23b07
JS
6835
6836 return 0;
6837}
6838
e8a565cb
YW
6839static void *remove_tmpdir_thread(void *p) {
6840 _cleanup_free_ char *path = p;
86b23b07 6841
e8a565cb
YW
6842 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6843 return NULL;
6844}
6845
6846static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6847 int r;
6848
6849 if (!rt)
6850 return NULL;
6851
6852 if (rt->manager)
6853 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6854
6855 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6856
6857 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6858 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6859
6860 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6861 if (r < 0)
e8a565cb 6862 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6863 else
6864 rt->tmp_dir = NULL;
e8a565cb 6865 }
613b411c 6866
56a13a49 6867 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6868 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6869
6870 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6871 if (r < 0)
e8a565cb 6872 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6873 else
6874 rt->var_tmp_dir = NULL;
e8a565cb
YW
6875 }
6876
6877 rt->id = mfree(rt->id);
6878 rt->tmp_dir = mfree(rt->tmp_dir);
6879 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6880 safe_close_pair(rt->netns_storage_socket);
a70581ff 6881 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6882 return mfree(rt);
6883}
6884
6885static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6886 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6887}
6888
56a13a49
ZJS
6889static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6890 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6891 ExecRuntime *n;
613b411c 6892
8e8009dc 6893 assert(ret);
613b411c 6894
56a13a49
ZJS
6895 id_copy = strdup(id);
6896 if (!id_copy)
6897 return -ENOMEM;
6898
8e8009dc
LP
6899 n = new(ExecRuntime, 1);
6900 if (!n)
613b411c
LP
6901 return -ENOMEM;
6902
8e8009dc 6903 *n = (ExecRuntime) {
56a13a49 6904 .id = TAKE_PTR(id_copy),
19ee48a6
YW
6905 .netns_storage_socket = PIPE_EBADF,
6906 .ipcns_storage_socket = PIPE_EBADF,
8e8009dc
LP
6907 };
6908
6909 *ret = n;
613b411c
LP
6910 return 0;
6911}
6912
e8a565cb
YW
6913static int exec_runtime_add(
6914 Manager *m,
6915 const char *id,
56a13a49
ZJS
6916 char **tmp_dir,
6917 char **var_tmp_dir,
6918 int netns_storage_socket[2],
a70581ff 6919 int ipcns_storage_socket[2],
e8a565cb
YW
6920 ExecRuntime **ret) {
6921
6922 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6923 int r;
6924
e8a565cb 6925 assert(m);
613b411c
LP
6926 assert(id);
6927
a70581ff 6928 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6929
56a13a49 6930 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6931 if (r < 0)
6932 return r;
6933
63083706 6934 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6935 if (r < 0)
6936 return r;
e8a565cb 6937
56a13a49
ZJS
6938 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6939 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6940 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6941
6942 if (netns_storage_socket) {
56a13a49
ZJS
6943 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6944 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6945 }
6946
a70581ff
XR
6947 if (ipcns_storage_socket) {
6948 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6949 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6950 }
6951
e8a565cb
YW
6952 rt->manager = m;
6953
6954 if (ret)
6955 *ret = rt;
e8a565cb 6956 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6957 TAKE_PTR(rt);
e8a565cb
YW
6958 return 0;
6959}
6960
74aaf59b
LP
6961static int exec_runtime_make(
6962 Manager *m,
6963 const ExecContext *c,
6964 const char *id,
6965 ExecRuntime **ret) {
6966
56a13a49 6967 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
19ee48a6 6968 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
e8a565cb
YW
6969 int r;
6970
6971 assert(m);
6972 assert(c);
6973 assert(id);
6974
6975 /* It is not necessary to create ExecRuntime object. */
fde36d25 6976 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
74aaf59b 6977 *ret = NULL;
e8a565cb 6978 return 0;
74aaf59b 6979 }
e8a565cb 6980
efa2f3a1
TM
6981 if (c->private_tmp &&
6982 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6983 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6984 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6985 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6986 if (r < 0)
6987 return r;
6988 }
6989
fbbb9697 6990 if (exec_needs_network_namespace(c)) {
e8a565cb
YW
6991 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6992 return -errno;
6993 }
6994
fde36d25 6995 if (exec_needs_ipc_namespace(c)) {
a70581ff
XR
6996 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6997 return -errno;
6998 }
6999
7000 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
7001 if (r < 0)
7002 return r;
7003
613b411c
LP
7004 return 1;
7005}
7006
e8a565cb
YW
7007int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
7008 ExecRuntime *rt;
7009 int r;
613b411c 7010
e8a565cb
YW
7011 assert(m);
7012 assert(id);
7013 assert(ret);
7014
7015 rt = hashmap_get(m->exec_runtime_by_id, id);
7016 if (rt)
387f6955 7017 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
7018 goto ref;
7019
74aaf59b
LP
7020 if (!create) {
7021 *ret = NULL;
e8a565cb 7022 return 0;
74aaf59b 7023 }
e8a565cb
YW
7024
7025 /* If not found, then create a new object. */
7026 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 7027 if (r < 0)
e8a565cb 7028 return r;
74aaf59b
LP
7029 if (r == 0) {
7030 /* When r == 0, it is not necessary to create ExecRuntime object. */
7031 *ret = NULL;
7032 return 0;
7033 }
613b411c 7034
e8a565cb
YW
7035ref:
7036 /* increment reference counter. */
7037 rt->n_ref++;
7038 *ret = rt;
7039 return 1;
7040}
613b411c 7041
e8a565cb
YW
7042ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
7043 if (!rt)
613b411c
LP
7044 return NULL;
7045
e8a565cb 7046 assert(rt->n_ref > 0);
613b411c 7047
e8a565cb
YW
7048 rt->n_ref--;
7049 if (rt->n_ref > 0)
f2341e0a
LP
7050 return NULL;
7051
e8a565cb 7052 return exec_runtime_free(rt, destroy);
613b411c
LP
7053}
7054
e8a565cb
YW
7055int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
7056 ExecRuntime *rt;
e8a565cb
YW
7057
7058 assert(m);
613b411c
LP
7059 assert(f);
7060 assert(fds);
7061
90e74a66 7062 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 7063 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 7064
e8a565cb
YW
7065 if (rt->tmp_dir)
7066 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 7067
e8a565cb
YW
7068 if (rt->var_tmp_dir)
7069 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 7070
e8a565cb
YW
7071 if (rt->netns_storage_socket[0] >= 0) {
7072 int copy;
613b411c 7073
e8a565cb
YW
7074 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
7075 if (copy < 0)
7076 return copy;
613b411c 7077
e8a565cb
YW
7078 fprintf(f, " netns-socket-0=%i", copy);
7079 }
613b411c 7080
e8a565cb
YW
7081 if (rt->netns_storage_socket[1] >= 0) {
7082 int copy;
613b411c 7083
e8a565cb
YW
7084 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
7085 if (copy < 0)
7086 return copy;
613b411c 7087
e8a565cb
YW
7088 fprintf(f, " netns-socket-1=%i", copy);
7089 }
7090
a70581ff
XR
7091 if (rt->ipcns_storage_socket[0] >= 0) {
7092 int copy;
7093
7094 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
7095 if (copy < 0)
7096 return copy;
7097
7098 fprintf(f, " ipcns-socket-0=%i", copy);
7099 }
7100
7101 if (rt->ipcns_storage_socket[1] >= 0) {
7102 int copy;
7103
7104 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
7105 if (copy < 0)
7106 return copy;
7107
7108 fprintf(f, " ipcns-socket-1=%i", copy);
7109 }
7110
e8a565cb 7111 fputc('\n', f);
613b411c
LP
7112 }
7113
7114 return 0;
7115}
7116
e8a565cb
YW
7117int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
7118 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
7119 ExecRuntime *rt;
613b411c
LP
7120 int r;
7121
e8a565cb
YW
7122 /* This is for the migration from old (v237 or earlier) deserialization text.
7123 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7124 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
7125 * so or not from the serialized text, then we always creates a new object owned by this. */
7126
7127 assert(u);
613b411c
LP
7128 assert(key);
7129 assert(value);
7130
e8a565cb
YW
7131 /* Manager manages ExecRuntime objects by the unit id.
7132 * So, we omit the serialized text when the unit does not have id (yet?)... */
7133 if (isempty(u->id)) {
7134 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7135 return 0;
7136 }
613b411c 7137
cbc165d1
ZJS
7138 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
7139 return log_oom();
e8a565cb
YW
7140
7141 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
7142 if (!rt) {
cbc165d1 7143 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 7144 return log_oom();
613b411c 7145
e8a565cb
YW
7146 rt = rt_create;
7147 }
7148
7149 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
7150 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7151 return -ENOMEM;
613b411c
LP
7152
7153 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
7154 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7155 return -ENOMEM;
613b411c
LP
7156
7157 } else if (streq(key, "netns-socket-0")) {
7158 int fd;
7159
e8a565cb 7160 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 7161 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 7162 return 0;
613b411c 7163 }
e8a565cb
YW
7164
7165 safe_close(rt->netns_storage_socket[0]);
7166 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7167
613b411c
LP
7168 } else if (streq(key, "netns-socket-1")) {
7169 int fd;
7170
e8a565cb 7171 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 7172 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 7173 return 0;
613b411c 7174 }
e8a565cb
YW
7175
7176 safe_close(rt->netns_storage_socket[1]);
7177 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 7178
613b411c
LP
7179 } else
7180 return 0;
7181
e8a565cb
YW
7182 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
7183 if (rt_create) {
7184 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
7185 if (r < 0) {
3fe91079 7186 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
7187 return 0;
7188 }
613b411c 7189
e8a565cb 7190 rt_create->manager = u->manager;
613b411c 7191
e8a565cb 7192 /* Avoid cleanup */
56a13a49 7193 TAKE_PTR(rt_create);
e8a565cb 7194 }
98b47d54 7195
e8a565cb
YW
7196 return 1;
7197}
613b411c 7198
56a13a49
ZJS
7199int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7200 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7201 char *id = NULL;
a70581ff 7202 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 7203 const char *p, *v = ASSERT_PTR(value);
e8a565cb 7204 size_t n;
613b411c 7205
e8a565cb 7206 assert(m);
e8a565cb 7207 assert(fds);
98b47d54 7208
e8a565cb 7209 n = strcspn(v, " ");
2f82562b 7210 id = strndupa_safe(v, n);
e8a565cb
YW
7211 if (v[n] != ' ')
7212 goto finalize;
7213 p = v + n + 1;
7214
7215 v = startswith(p, "tmp-dir=");
7216 if (v) {
7217 n = strcspn(v, " ");
56a13a49
ZJS
7218 tmp_dir = strndup(v, n);
7219 if (!tmp_dir)
7220 return log_oom();
e8a565cb
YW
7221 if (v[n] != ' ')
7222 goto finalize;
7223 p = v + n + 1;
7224 }
7225
7226 v = startswith(p, "var-tmp-dir=");
7227 if (v) {
7228 n = strcspn(v, " ");
56a13a49
ZJS
7229 var_tmp_dir = strndup(v, n);
7230 if (!var_tmp_dir)
7231 return log_oom();
e8a565cb
YW
7232 if (v[n] != ' ')
7233 goto finalize;
7234 p = v + n + 1;
7235 }
7236
7237 v = startswith(p, "netns-socket-0=");
7238 if (v) {
7239 char *buf;
7240
7241 n = strcspn(v, " ");
2f82562b 7242 buf = strndupa_safe(v, n);
c413bb28 7243
a70581ff 7244 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
7245 if (r < 0)
7246 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 7247 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 7248 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
7249 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7250 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
7251 if (v[n] != ' ')
7252 goto finalize;
7253 p = v + n + 1;
613b411c
LP
7254 }
7255
e8a565cb
YW
7256 v = startswith(p, "netns-socket-1=");
7257 if (v) {
7258 char *buf;
98b47d54 7259
e8a565cb 7260 n = strcspn(v, " ");
2f82562b 7261 buf = strndupa_safe(v, n);
a70581ff
XR
7262
7263 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
7264 if (r < 0)
7265 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
7266 if (!fdset_contains(fds, netns_fdpair[1]))
7267 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7268 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7269 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7270 if (v[n] != ' ')
7271 goto finalize;
7272 p = v + n + 1;
7273 }
7274
7275 v = startswith(p, "ipcns-socket-0=");
7276 if (v) {
7277 char *buf;
7278
7279 n = strcspn(v, " ");
2f82562b 7280 buf = strndupa_safe(v, n);
a70581ff
XR
7281
7282 r = safe_atoi(buf, &ipcns_fdpair[0]);
7283 if (r < 0)
7284 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7285 if (!fdset_contains(fds, ipcns_fdpair[0]))
7286 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7287 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7288 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7289 if (v[n] != ' ')
7290 goto finalize;
7291 p = v + n + 1;
7292 }
7293
7294 v = startswith(p, "ipcns-socket-1=");
7295 if (v) {
7296 char *buf;
7297
7298 n = strcspn(v, " ");
2f82562b 7299 buf = strndupa_safe(v, n);
a70581ff
XR
7300
7301 r = safe_atoi(buf, &ipcns_fdpair[1]);
7302 if (r < 0)
7303 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7304 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 7305 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
7306 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7307 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 7308 }
98b47d54 7309
e8a565cb 7310finalize:
a70581ff 7311 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 7312 if (r < 0)
56a13a49
ZJS
7313 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7314 return 0;
e8a565cb 7315}
613b411c 7316
e8a565cb
YW
7317void exec_runtime_vacuum(Manager *m) {
7318 ExecRuntime *rt;
e8a565cb
YW
7319
7320 assert(m);
7321
7322 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7323
90e74a66 7324 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
7325 if (rt->n_ref > 0)
7326 continue;
7327
7328 (void) exec_runtime_free(rt, false);
7329 }
613b411c
LP
7330}
7331
b9c04eaf
YW
7332void exec_params_clear(ExecParameters *p) {
7333 if (!p)
7334 return;
7335
c3f8a065
LP
7336 p->environment = strv_free(p->environment);
7337 p->fd_names = strv_free(p->fd_names);
7338 p->fds = mfree(p->fds);
7339 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7340}
7341
bb0c0d6f
LP
7342ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7343 if (!sc)
7344 return NULL;
7345
7346 free(sc->id);
7347 free(sc->data);
7348 return mfree(sc);
7349}
7350
43144be4
LP
7351ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7352 if (!lc)
7353 return NULL;
7354
7355 free(lc->id);
7356 free(lc->path);
7357 return mfree(lc);
7358}
7359
211a3d87
LB
7360void exec_directory_done(ExecDirectory *d) {
7361 if (!d)
7362 return;
7363
7364 for (size_t i = 0; i < d->n_items; i++) {
7365 free(d->items[i].path);
7366 strv_free(d->items[i].symlinks);
7367 }
7368
7369 d->items = mfree(d->items);
7370 d->n_items = 0;
7371 d->mode = 0755;
7372}
7373
564e5c98
YW
7374static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7375 assert(d);
7376 assert(path);
7377
7378 for (size_t i = 0; i < d->n_items; i++)
7379 if (path_equal(d->items[i].path, path))
7380 return &d->items[i];
7381
7382 return NULL;
7383}
7384
7385int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
211a3d87
LB
7386 _cleanup_strv_free_ char **s = NULL;
7387 _cleanup_free_ char *p = NULL;
564e5c98
YW
7388 ExecDirectoryItem *existing;
7389 int r;
211a3d87
LB
7390
7391 assert(d);
211a3d87
LB
7392 assert(path);
7393
564e5c98
YW
7394 existing = exec_directory_find(d, path);
7395 if (existing) {
7396 r = strv_extend(&existing->symlinks, symlink);
7397 if (r < 0)
7398 return r;
7399
7400 return 0; /* existing item is updated */
7401 }
7402
211a3d87
LB
7403 p = strdup(path);
7404 if (!p)
7405 return -ENOMEM;
7406
564e5c98
YW
7407 if (symlink) {
7408 s = strv_new(symlink);
211a3d87
LB
7409 if (!s)
7410 return -ENOMEM;
7411 }
7412
564e5c98 7413 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
211a3d87
LB
7414 return -ENOMEM;
7415
564e5c98 7416 d->items[d->n_items++] = (ExecDirectoryItem) {
211a3d87
LB
7417 .path = TAKE_PTR(p),
7418 .symlinks = TAKE_PTR(s),
7419 };
7420
564e5c98 7421 return 1; /* new item is added */
211a3d87
LB
7422}
7423
a2ab603c
YW
7424static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7425 assert(a);
7426 assert(b);
7427
7428 return path_compare(a->path, b->path);
7429}
7430
7431void exec_directory_sort(ExecDirectory *d) {
7432 assert(d);
7433
7434 /* Sort the exec directories to make always parent directories processed at first in
7435 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7436 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7437 * list. See also comments in setup_exec_directory() and issue #24783. */
7438
7439 if (d->n_items <= 1)
7440 return;
7441
7442 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7443
7444 for (size_t i = 1; i < d->n_items; i++)
7445 for (size_t j = 0; j < i; j++)
7446 if (path_startswith(d->items[i].path, d->items[j].path)) {
7447 d->items[i].only_create = true;
7448 break;
7449 }
211a3d87
LB
7450}
7451
/* Hash operations for maps that are keyed by a string and carry ExecSetCredential or ExecLoadCredential
 * objects as values. As the macro name indicates, the respective free function is registered as the value
 * destructor. */
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 7454
80876c20
LP
/* Maps each ExecInput value to its string name. */
static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
        [EXEC_INPUT_NULL]      = "null",
        [EXEC_INPUT_TTY]       = "tty",
        [EXEC_INPUT_TTY_FORCE] = "tty-force",
        [EXEC_INPUT_TTY_FAIL]  = "tty-fail",
        [EXEC_INPUT_SOCKET]    = "socket",
        [EXEC_INPUT_NAMED_FD]  = "fd",
        [EXEC_INPUT_DATA]      = "data",
        [EXEC_INPUT_FILE]      = "file",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7467
/* Maps each ExecOutput value to its string name. */
static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
        [EXEC_OUTPUT_INHERIT]             = "inherit",
        [EXEC_OUTPUT_NULL]                = "null",
        [EXEC_OUTPUT_TTY]                 = "tty",
        [EXEC_OUTPUT_KMSG]                = "kmsg",
        [EXEC_OUTPUT_KMSG_AND_CONSOLE]    = "kmsg+console",
        [EXEC_OUTPUT_JOURNAL]             = "journal",
        [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
        [EXEC_OUTPUT_SOCKET]              = "socket",
        [EXEC_OUTPUT_NAMED_FD]            = "fd",
        [EXEC_OUTPUT_FILE]                = "file",
        [EXEC_OUTPUT_FILE_APPEND]         = "append",
        [EXEC_OUTPUT_FILE_TRUNCATE]       = "truncate",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7484
/* Maps each ExecUtmpMode value to its string name. */
static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
        [EXEC_UTMP_INIT]  = "init",
        [EXEC_UTMP_LOGIN] = "login",
        [EXEC_UTMP_USER]  = "user",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7492
/* Maps each ExecPreserveMode value to its string name. */
static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
        [EXEC_PRESERVE_NO]      = "no",
        [EXEC_PRESERVE_YES]     = "yes",
        [EXEC_PRESERVE_RESTART] = "restart",
};

/* Instantiates the string lookup helpers for the table above. NOTE(review): judging by the macro name,
 * boolean strings are accepted when parsing too, with EXEC_PRESERVE_YES as the "true" value — confirm
 * against the macro definition. */
DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7500
/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectory",
        [EXEC_DIRECTORY_STATE]         = "StateDirectory",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectory",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectory",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7511
211a3d87
LB
/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RuntimeDirectorySymlink",
        [EXEC_DIRECTORY_STATE]         = "StateDirectorySymlink",
        [EXEC_DIRECTORY_CACHE]         = "CacheDirectorySymlink",
        [EXEC_DIRECTORY_LOGS]          = "LogsDirectorySymlink",
        [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7522
6b7b2ed9
LP
/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
 * directories, specifically .timer units with their timestamp touch file. */
static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "runtime",
        [EXEC_DIRECTORY_STATE]         = "state",
        [EXEC_DIRECTORY_CACHE]         = "cache",
        [EXEC_DIRECTORY_LOGS]          = "logs",
        [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7535
/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
 * the service payload in. */
static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
        [EXEC_DIRECTORY_RUNTIME]       = "RUNTIME_DIRECTORY",
        [EXEC_DIRECTORY_STATE]         = "STATE_DIRECTORY",
        [EXEC_DIRECTORY_CACHE]         = "CACHE_DIRECTORY",
        [EXEC_DIRECTORY_LOGS]          = "LOGS_DIRECTORY",
        [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
};

/* NOTE(review): judging by the macro name, only the enum-to-string direction is generated here, with
 * file-local linkage — confirm against the macro definition. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7547
b1edf445
LP
/* Maps each ExecKeyringMode value to its string name. */
static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
        [EXEC_KEYRING_INHERIT] = "inherit",
        [EXEC_KEYRING_PRIVATE] = "private",
        [EXEC_KEYRING_SHARED]  = "shared",
};

/* Instantiates the string lookup helpers for the table above. */
DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);