]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
cap-list: make sure never to accidentally return more than 63 caps
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
ee617a4e 42#include "argv-util.h"
8dd4c05b
LP
43#include "async.h"
44#include "barrier.h"
b1994387 45#include "bpf-lsm.h"
8dd4c05b 46#include "cap-list.h"
430f0182 47#include "capability-util.h"
fdb3deca 48#include "cgroup-setup.h"
f4351959 49#include "chase-symlinks.h"
bb0c0d6f 50#include "chown-recursive.h"
28db6fbf 51#include "constants.h"
da681e1b 52#include "cpu-set-util.h"
43144be4 53#include "creds-util.h"
6a818c3c 54#include "data-fd-util.h"
686d13b9 55#include "env-file.h"
4d1a6904 56#include "env-util.h"
17df7223 57#include "errno-list.h"
8a62620e 58#include "escape.h"
3ffd4af2 59#include "execute.h"
8dd4c05b 60#include "exit-status.h"
3ffd4af2 61#include "fd-util.h"
bb0c0d6f 62#include "fileio.h"
f97b34a6 63#include "format-util.h"
7d50b32a 64#include "glob-util.h"
0389f4fa 65#include "hexdecoct.h"
c004493c 66#include "io-util.h"
032b3afb 67#include "ioprio-util.h"
a1164ae3 68#include "label.h"
8dd4c05b
LP
69#include "log.h"
70#include "macro.h"
e8a565cb 71#include "manager.h"
2a341bb9 72#include "manager-dump.h"
0a970718 73#include "memory-util.h"
f5947a5e 74#include "missing_fs.h"
5bead76e 75#include "missing_ioprio.h"
35cd0ba5 76#include "mkdir-label.h"
21935150 77#include "mount-util.h"
bb0c0d6f 78#include "mountpoint-util.h"
8dd4c05b 79#include "namespace.h"
6bedfcbb 80#include "parse-util.h"
8dd4c05b 81#include "path-util.h"
0b452006 82#include "process-util.h"
d3dcf4e3 83#include "random-util.h"
3989bdc1 84#include "recurse-dir.h"
78f22b97 85#include "rlimit-util.h"
8dd4c05b 86#include "rm-rf.h"
349cc4a5 87#if HAVE_SECCOMP
3ffd4af2
LP
88#include "seccomp-util.h"
89#endif
07d46372 90#include "securebits-util.h"
8dd4c05b 91#include "selinux-util.h"
24882e06 92#include "signal-util.h"
8dd4c05b 93#include "smack-util.h"
57b7a260 94#include "socket-util.h"
a2ab603c 95#include "sort-util.h"
fd63e712 96#include "special.h"
949befd3 97#include "stat-util.h"
8b43440b 98#include "string-table.h"
07630cea 99#include "string-util.h"
8dd4c05b 100#include "strv.h"
7ccbd1ae 101#include "syslog-util.h"
8dd4c05b 102#include "terminal-util.h"
bb0c0d6f 103#include "tmpfile-util.h"
566b7d23 104#include "umask-util.h"
2d3b784d 105#include "unit-serialize.h"
b1d4f8e1 106#include "user-util.h"
8dd4c05b 107#include "utmp-wtmp.h"
5cb5a6ff 108
e056b01d 109#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 110#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 111
531dca78
LP
112#define SNDBUF_SIZE (8*1024*1024)
113
/* Relocates the n_fds descriptors listed in fds[] so that entry i ends up as descriptor number i+3,
 * i.e. directly after stdin/stdout/stderr. Modifies the fds array in place!
 *
 * Returns 0 on success, or a negative errno if duplicating a descriptor failed. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0, retry_at;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        do {
                retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits at its target slot */
                        if (fds[idx] == want)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, want);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we asked for was still taken, so we got a higher number. Remember
                         * the first index where that happened and do another pass from there. */
                        if (retry_at < 0 && dup_fd != want)
                                retry_at = idx;
                }

                first = retry_at;
        } while (retry_at >= 0);

        return 0;
}
153
cd48e23f
RP
/* Adjusts the file descriptor flags of the fds we are about to pass on: FD_CLOEXEC is dropped from all
 * n_fds entries, and O_NONBLOCK is set/cleared (per 'nonblock') on the first n_socket_fds entries only.
 *
 * Returns 0 on success, or the negative error returned by the failing helper. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int r;

                /* O_NONBLOCK only applies to the socket activation fds at the front of the array */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always turned off: the whole point is that these fds survive into
                 * our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
189
1e22b5cd 190static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
191 assert(context);
192
1e22b5cd
LP
193 if (context->stdio_as_fds)
194 return NULL;
195
80876c20
LP
196 if (context->tty_path)
197 return context->tty_path;
198
199 return "/dev/console";
200}
201
1e22b5cd
LP
202static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
203 const char *path;
204
6ea832a2
LP
205 assert(context);
206
1e22b5cd 207 path = exec_context_tty_path(context);
6ea832a2 208
1e22b5cd
LP
209 if (context->tty_vhangup) {
210 if (p && p->stdin_fd >= 0)
211 (void) terminal_vhangup_fd(p->stdin_fd);
212 else if (path)
213 (void) terminal_vhangup(path);
214 }
6ea832a2 215
1e22b5cd
LP
216 if (context->tty_reset) {
217 if (p && p->stdin_fd >= 0)
218 (void) reset_terminal_fd(p->stdin_fd, true);
219 else if (path)
220 (void) reset_terminal(path);
221 }
222
51462135
DDM
223 if (p && p->stdin_fd >= 0)
224 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
225
1e22b5cd
LP
226 if (context->tty_vt_disallocate && path)
227 (void) vt_disallocate(path);
6ea832a2
LP
228}
229
6af760f3
LP
230static bool is_terminal_input(ExecInput i) {
231 return IN_SET(i,
232 EXEC_INPUT_TTY,
233 EXEC_INPUT_TTY_FORCE,
234 EXEC_INPUT_TTY_FAIL);
235}
236
3a1286b6 237static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
238 return IN_SET(o,
239 EXEC_OUTPUT_TTY,
6af760f3
LP
240 EXEC_OUTPUT_KMSG_AND_CONSOLE,
241 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
242}
243
aac8c0c3
LP
244static bool is_kmsg_output(ExecOutput o) {
245 return IN_SET(o,
246 EXEC_OUTPUT_KMSG,
247 EXEC_OUTPUT_KMSG_AND_CONSOLE);
248}
249
6af760f3
LP
250static bool exec_context_needs_term(const ExecContext *c) {
251 assert(c);
252
253 /* Return true if the execution context suggests we should set $TERM to something useful. */
254
255 if (is_terminal_input(c->std_input))
256 return true;
257
258 if (is_terminal_output(c->std_output))
259 return true;
260
261 if (is_terminal_output(c->std_error))
262 return true;
263
264 return !!c->tty_path;
3a1286b6
MS
265}
266
80876c20 267static int open_null_as(int flags, int nfd) {
046a82c1 268 int fd;
071830ff 269
80876c20 270 assert(nfd >= 0);
071830ff 271
613b411c
LP
272 fd = open("/dev/null", flags|O_NOCTTY);
273 if (fd < 0)
071830ff
LP
274 return -errno;
275
046a82c1 276 return move_fd(fd, nfd, false);
071830ff
LP
277}
278
91dd5f7c
LP
279static int connect_journal_socket(
280 int fd,
281 const char *log_namespace,
282 uid_t uid,
283 gid_t gid) {
284
524daa8c
ZJS
285 uid_t olduid = UID_INVALID;
286 gid_t oldgid = GID_INVALID;
91dd5f7c 287 const char *j;
524daa8c
ZJS
288 int r;
289
91dd5f7c
LP
290 j = log_namespace ?
291 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
292 "/run/systemd/journal/stdout";
91dd5f7c 293
cad93f29 294 if (gid_is_valid(gid)) {
524daa8c
ZJS
295 oldgid = getgid();
296
92a17af9 297 if (setegid(gid) < 0)
524daa8c
ZJS
298 return -errno;
299 }
300
cad93f29 301 if (uid_is_valid(uid)) {
524daa8c
ZJS
302 olduid = getuid();
303
92a17af9 304 if (seteuid(uid) < 0) {
524daa8c
ZJS
305 r = -errno;
306 goto restore_gid;
307 }
308 }
309
1861986a 310 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 311
1861986a
LP
312 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
313 an LSM interferes. */
524daa8c 314
cad93f29 315 if (uid_is_valid(uid))
524daa8c
ZJS
316 (void) seteuid(olduid);
317
318 restore_gid:
cad93f29 319 if (gid_is_valid(gid))
524daa8c
ZJS
320 (void) setegid(oldgid);
321
322 return r;
323}
324
fd1f9c89 325static int connect_logger_as(
34cf6c43 326 const Unit *unit,
fd1f9c89 327 const ExecContext *context,
af635cf3 328 const ExecParameters *params,
fd1f9c89
LP
329 ExecOutput output,
330 const char *ident,
fd1f9c89
LP
331 int nfd,
332 uid_t uid,
333 gid_t gid) {
334
254d1313 335 _cleanup_close_ int fd = -EBADF;
2ac1ff68 336 int r;
071830ff
LP
337
338 assert(context);
af635cf3 339 assert(params);
80876c20
LP
340 assert(output < _EXEC_OUTPUT_MAX);
341 assert(ident);
342 assert(nfd >= 0);
071830ff 343
54fe0cdb
LP
344 fd = socket(AF_UNIX, SOCK_STREAM, 0);
345 if (fd < 0)
80876c20 346 return -errno;
071830ff 347
91dd5f7c 348 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
349 if (r < 0)
350 return r;
071830ff 351
2ac1ff68 352 if (shutdown(fd, SHUT_RD) < 0)
80876c20 353 return -errno;
071830ff 354
fd1f9c89 355 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 356
2ac1ff68 357 if (dprintf(fd,
62bca2c6 358 "%s\n"
80876c20
LP
359 "%s\n"
360 "%i\n"
54fe0cdb
LP
361 "%i\n"
362 "%i\n"
363 "%i\n"
4f4a1dbf 364 "%i\n",
c867611e 365 context->syslog_identifier ?: ident,
af635cf3 366 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
367 context->syslog_priority,
368 !!context->syslog_level_prefix,
f3dc6af2 369 false,
aac8c0c3 370 is_kmsg_output(output),
2ac1ff68
EV
371 is_terminal_output(output)) < 0)
372 return -errno;
80876c20 373
2ac1ff68 374 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 375}
2ac1ff68 376
3a274a21 377static int open_terminal_as(const char *path, int flags, int nfd) {
046a82c1 378 int fd;
071830ff 379
80876c20
LP
380 assert(path);
381 assert(nfd >= 0);
fd1f9c89 382
3a274a21 383 fd = open_terminal(path, flags | O_NOCTTY);
3cc2aff1 384 if (fd < 0)
80876c20 385 return fd;
071830ff 386
046a82c1 387 return move_fd(fd, nfd, false);
80876c20 388}
071830ff 389
2038c3f5 390static int acquire_path(const char *path, int flags, mode_t mode) {
254d1313 391 _cleanup_close_ int fd = -EBADF;
86fca584 392 int r;
071830ff 393
80876c20 394 assert(path);
071830ff 395
2038c3f5
LP
396 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
397 flags |= O_CREAT;
398
399 fd = open(path, flags|O_NOCTTY, mode);
400 if (fd >= 0)
15a3e96f 401 return TAKE_FD(fd);
071830ff 402
2038c3f5
LP
403 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
404 return -errno;
2038c3f5
LP
405
406 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
407
408 fd = socket(AF_UNIX, SOCK_STREAM, 0);
409 if (fd < 0)
410 return -errno;
411
1861986a
LP
412 r = connect_unix_path(fd, AT_FDCWD, path);
413 if (IN_SET(r, -ENOTSOCK, -EINVAL))
414 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
415 * wasn't an AF_UNIX socket after all */
416 return -ENXIO;
417 if (r < 0)
418 return r;
071830ff 419
2038c3f5
LP
420 if ((flags & O_ACCMODE) == O_RDONLY)
421 r = shutdown(fd, SHUT_WR);
422 else if ((flags & O_ACCMODE) == O_WRONLY)
423 r = shutdown(fd, SHUT_RD);
424 else
86fca584 425 r = 0;
15a3e96f 426 if (r < 0)
2038c3f5 427 return -errno;
2038c3f5 428
15a3e96f 429 return TAKE_FD(fd);
80876c20 430}
071830ff 431
08f3be7a
LP
432static int fixup_input(
433 const ExecContext *context,
434 int socket_fd,
435 bool apply_tty_stdin) {
436
437 ExecInput std_input;
438
439 assert(context);
440
441 std_input = context->std_input;
1e3ad081
LP
442
443 if (is_terminal_input(std_input) && !apply_tty_stdin)
444 return EXEC_INPUT_NULL;
071830ff 445
03fd9c49 446 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
447 return EXEC_INPUT_NULL;
448
08f3be7a
LP
449 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
450 return EXEC_INPUT_NULL;
451
03fd9c49 452 return std_input;
4f2d528d
LP
453}
454
7966a916 455static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 456
7966a916 457 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
458 return EXEC_OUTPUT_INHERIT;
459
7966a916 460 return output;
4f2d528d
LP
461}
462
a34ceba6
LP
463static int setup_input(
464 const ExecContext *context,
465 const ExecParameters *params,
52c239d7 466 int socket_fd,
2caa38e9 467 const int named_iofds[static 3]) {
a34ceba6 468
4f2d528d 469 ExecInput i;
51462135 470 int r;
4f2d528d
LP
471
472 assert(context);
a34ceba6 473 assert(params);
2caa38e9 474 assert(named_iofds);
a34ceba6
LP
475
476 if (params->stdin_fd >= 0) {
477 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
478 return -errno;
479
480 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
481 if (isatty(STDIN_FILENO)) {
482 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
483 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 484 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 485 }
a34ceba6
LP
486
487 return STDIN_FILENO;
488 }
4f2d528d 489
08f3be7a 490 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
491
492 switch (i) {
071830ff 493
80876c20
LP
494 case EXEC_INPUT_NULL:
495 return open_null_as(O_RDONLY, STDIN_FILENO);
496
497 case EXEC_INPUT_TTY:
498 case EXEC_INPUT_TTY_FORCE:
499 case EXEC_INPUT_TTY_FAIL: {
046a82c1 500 int fd;
071830ff 501
1e22b5cd 502 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
503 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
504 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
505 ACQUIRE_TERMINAL_WAIT,
3a43da28 506 USEC_INFINITY);
970edce6 507 if (fd < 0)
80876c20
LP
508 return fd;
509
51462135
DDM
510 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
511 if (r < 0)
512 return r;
513
046a82c1 514 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
515 }
516
4f2d528d 517 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
518 assert(socket_fd >= 0);
519
7c248223 520 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 521
52c239d7 522 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
523 assert(named_iofds[STDIN_FILENO] >= 0);
524
52c239d7 525 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 526 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 527
08f3be7a
LP
528 case EXEC_INPUT_DATA: {
529 int fd;
530
531 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
532 if (fd < 0)
533 return fd;
534
535 return move_fd(fd, STDIN_FILENO, false);
536 }
537
2038c3f5
LP
538 case EXEC_INPUT_FILE: {
539 bool rw;
540 int fd;
541
542 assert(context->stdio_file[STDIN_FILENO]);
543
544 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
545 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
546
547 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
548 if (fd < 0)
549 return fd;
550
551 return move_fd(fd, STDIN_FILENO, false);
552 }
553
80876c20 554 default:
04499a70 555 assert_not_reached();
80876c20
LP
556 }
557}
558
41fc585a
LP
559static bool can_inherit_stderr_from_stdout(
560 const ExecContext *context,
561 ExecOutput o,
562 ExecOutput e) {
563
564 assert(context);
565
566 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
567 * stderr fd */
568
569 if (e == EXEC_OUTPUT_INHERIT)
570 return true;
571 if (e != o)
572 return false;
573
574 if (e == EXEC_OUTPUT_NAMED_FD)
575 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
576
8d7dab1f 577 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
578 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
579
580 return true;
581}
582
a34ceba6 583static int setup_output(
34cf6c43 584 const Unit *unit,
a34ceba6
LP
585 const ExecContext *context,
586 const ExecParameters *params,
587 int fileno,
588 int socket_fd,
2caa38e9 589 const int named_iofds[static 3],
a34ceba6 590 const char *ident,
7bce046b
LP
591 uid_t uid,
592 gid_t gid,
593 dev_t *journal_stream_dev,
594 ino_t *journal_stream_ino) {
a34ceba6 595
4f2d528d
LP
596 ExecOutput o;
597 ExecInput i;
47c1d80d 598 int r;
4f2d528d 599
f2341e0a 600 assert(unit);
80876c20 601 assert(context);
a34ceba6 602 assert(params);
80876c20 603 assert(ident);
7bce046b
LP
604 assert(journal_stream_dev);
605 assert(journal_stream_ino);
80876c20 606
a34ceba6
LP
607 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
608
609 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
610 return -errno;
611
612 return STDOUT_FILENO;
613 }
614
615 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
616 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
617 return -errno;
618
619 return STDERR_FILENO;
620 }
621
08f3be7a 622 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 623 o = fixup_output(context->std_output, socket_fd);
4f2d528d 624
eb17e935
MS
625 if (fileno == STDERR_FILENO) {
626 ExecOutput e;
627 e = fixup_output(context->std_error, socket_fd);
80876c20 628
eb17e935
MS
629 /* This expects the input and output are already set up */
630
631 /* Don't change the stderr file descriptor if we inherit all
632 * the way and are not on a tty */
633 if (e == EXEC_OUTPUT_INHERIT &&
634 o == EXEC_OUTPUT_INHERIT &&
635 i == EXEC_INPUT_NULL &&
636 !is_terminal_input(context->std_input) &&
7966a916 637 getppid() != 1)
eb17e935
MS
638 return fileno;
639
640 /* Duplicate from stdout if possible */
41fc585a 641 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 642 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 643
eb17e935 644 o = e;
80876c20 645
eb17e935 646 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
647 /* If input got downgraded, inherit the original value */
648 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 649 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 650
08f3be7a
LP
651 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
652 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 653 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 654
acb591e4
LP
655 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
656 if (getppid() != 1)
eb17e935 657 return fileno;
94f04347 658
eb17e935
MS
659 /* We need to open /dev/null here anew, to get the right access mode. */
660 return open_null_as(O_WRONLY, fileno);
071830ff 661 }
94f04347 662
eb17e935 663 switch (o) {
80876c20
LP
664
665 case EXEC_OUTPUT_NULL:
eb17e935 666 return open_null_as(O_WRONLY, fileno);
80876c20
LP
667
668 case EXEC_OUTPUT_TTY:
4f2d528d 669 if (is_terminal_input(i))
7c248223 670 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
671
672 /* We don't reset the terminal if this is just about output */
1e22b5cd 673 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 674
9a6bca7a 675 case EXEC_OUTPUT_KMSG:
28dbc1e8 676 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
677 case EXEC_OUTPUT_JOURNAL:
678 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 679 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 680 if (r < 0) {
7966a916
ZJS
681 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
682 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 683 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
684 } else {
685 struct stat st;
686
687 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
688 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
689 * services to detect whether they are connected to the journal or not.
690 *
691 * If both stdout and stderr are connected to a stream then let's make sure to store the data
692 * about STDERR as that's usually the best way to do logging. */
7bce046b 693
ab2116b1
LP
694 if (fstat(fileno, &st) >= 0 &&
695 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
696 *journal_stream_dev = st.st_dev;
697 *journal_stream_ino = st.st_ino;
698 }
47c1d80d
MS
699 }
700 return r;
4f2d528d
LP
701
702 case EXEC_OUTPUT_SOCKET:
703 assert(socket_fd >= 0);
e75a9ed1 704
7c248223 705 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 706
52c239d7 707 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
708 assert(named_iofds[fileno] >= 0);
709
52c239d7 710 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 711 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 712
566b7d23 713 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
714 case EXEC_OUTPUT_FILE_APPEND:
715 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 716 bool rw;
566b7d23 717 int fd, flags;
2038c3f5
LP
718
719 assert(context->stdio_file[fileno]);
720
721 rw = context->std_input == EXEC_INPUT_FILE &&
722 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
723
724 if (rw)
7c248223 725 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 726
566b7d23
ZD
727 flags = O_WRONLY;
728 if (o == EXEC_OUTPUT_FILE_APPEND)
729 flags |= O_APPEND;
8d7dab1f
LW
730 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
731 flags |= O_TRUNC;
566b7d23
ZD
732
733 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
734 if (fd < 0)
735 return fd;
736
566b7d23 737 return move_fd(fd, fileno, 0);
2038c3f5
LP
738 }
739
94f04347 740 default:
04499a70 741 assert_not_reached();
94f04347 742 }
071830ff
LP
743}
744
02a51aba 745static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 746 int r;
02a51aba
LP
747
748 assert(fd >= 0);
02a51aba 749
1ff74fb6 750 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
751 if (isatty(fd) < 1) {
752 if (IN_SET(errno, EINVAL, ENOTTY))
753 return 0; /* not a tty */
1ff74fb6 754
02a51aba 755 return -errno;
4b3b5bc7 756 }
02a51aba 757
4b3b5bc7 758 /* This might fail. What matters are the results. */
f2df231f 759 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
760 if (r < 0)
761 return r;
02a51aba 762
4b3b5bc7 763 return 1;
02a51aba
LP
764}
765
aedec452 766static int setup_confirm_stdio(
51462135 767 const ExecContext *context,
aedec452
LP
768 const char *vc,
769 int *ret_saved_stdin,
770 int *ret_saved_stdout) {
771
254d1313 772 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
3d18b167 773 int r;
80876c20 774
aedec452
LP
775 assert(ret_saved_stdin);
776 assert(ret_saved_stdout);
80876c20 777
af6da548
LP
778 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
779 if (saved_stdin < 0)
780 return -errno;
80876c20 781
af6da548 782 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
783 if (saved_stdout < 0)
784 return -errno;
80876c20 785
8854d795 786 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
787 if (fd < 0)
788 return fd;
80876c20 789
af6da548
LP
790 r = chown_terminal(fd, getuid());
791 if (r < 0)
3d18b167 792 return r;
02a51aba 793
3d18b167
LP
794 r = reset_terminal_fd(fd, true);
795 if (r < 0)
796 return r;
80876c20 797
51462135
DDM
798 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
799 if (r < 0)
800 return r;
801
aedec452
LP
802 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
803 TAKE_FD(fd);
2b33ab09
LP
804 if (r < 0)
805 return r;
80876c20 806
aedec452
LP
807 *ret_saved_stdin = TAKE_FD(saved_stdin);
808 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 809 return 0;
80876c20
LP
810}
811
63d77c92 812static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
813 assert(err < 0);
814
815 if (err == -ETIMEDOUT)
63d77c92 816 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
817 else {
818 errno = -err;
63d77c92 819 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
820 }
821}
822
63d77c92 823static void write_confirm_error(int err, const char *vc, const Unit *u) {
254d1313 824 _cleanup_close_ int fd = -EBADF;
80876c20 825
3b20f877 826 assert(vc);
80876c20 827
7d5ceb64 828 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 829 if (fd < 0)
3b20f877 830 return;
80876c20 831
63d77c92 832 write_confirm_error_fd(err, fd, u);
af6da548 833}
80876c20 834
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout fds (where valid)
 * back in place and closes the saved copies.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* Close the copies in any case, and mark them invalid for the caller */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
856
3b20f877
FB
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
862
51462135 863static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 864 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 865 _cleanup_free_ char *e = NULL;
3b20f877 866 char c;
af6da548 867
3b20f877 868 /* For any internal errors, assume a positive response. */
51462135 869 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 870 if (r < 0) {
63d77c92 871 write_confirm_error(r, vc, u);
3b20f877
FB
872 return CONFIRM_EXECUTE;
873 }
af6da548 874
b0eb2944
FB
875 /* confirm_spawn might have been disabled while we were sleeping. */
876 if (manager_is_confirm_spawn_disabled(u->manager)) {
877 r = 1;
878 goto restore_stdio;
879 }
af6da548 880
2bcd3c26
FB
881 e = ellipsize(cmdline, 60, 100);
882 if (!e) {
883 log_oom();
884 r = CONFIRM_EXECUTE;
885 goto restore_stdio;
886 }
af6da548 887
d172b175 888 for (;;) {
539622bd 889 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 890 if (r < 0) {
63d77c92 891 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
892 r = CONFIRM_EXECUTE;
893 goto restore_stdio;
894 }
af6da548 895
d172b175 896 switch (c) {
b0eb2944
FB
897 case 'c':
898 printf("Resuming normal execution.\n");
899 manager_disable_confirm_spawn();
900 r = 1;
901 break;
dd6f9ac0
FB
902 case 'D':
903 unit_dump(u, stdout, " ");
904 continue; /* ask again */
d172b175
FB
905 case 'f':
906 printf("Failing execution.\n");
907 r = CONFIRM_PRETEND_FAILURE;
908 break;
909 case 'h':
b0eb2944
FB
910 printf(" c - continue, proceed without asking anymore\n"
911 " D - dump, show the state of the unit\n"
dd6f9ac0 912 " f - fail, don't execute the command and pretend it failed\n"
d172b175 913 " h - help\n"
eedf223a 914 " i - info, show a short summary of the unit\n"
56fde33a 915 " j - jobs, show jobs that are in progress\n"
d172b175
FB
916 " s - skip, don't execute the command and pretend it succeeded\n"
917 " y - yes, execute the command\n");
dd6f9ac0 918 continue; /* ask again */
eedf223a
FB
919 case 'i':
920 printf(" Description: %s\n"
921 " Unit: %s\n"
922 " Command: %s\n",
923 u->id, u->description, cmdline);
924 continue; /* ask again */
56fde33a 925 case 'j':
d1d8786c 926 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
56fde33a 927 continue; /* ask again */
539622bd
FB
928 case 'n':
929 /* 'n' was removed in favor of 'f'. */
930 printf("Didn't understand 'n', did you mean 'f'?\n");
931 continue; /* ask again */
d172b175
FB
932 case 's':
933 printf("Skipping execution.\n");
934 r = CONFIRM_PRETEND_SUCCESS;
935 break;
936 case 'y':
937 r = CONFIRM_EXECUTE;
938 break;
939 default:
04499a70 940 assert_not_reached();
d172b175 941 }
3b20f877 942 break;
3b20f877 943 }
af6da548 944
3b20f877 945restore_stdio:
af6da548 946 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 947 return r;
80876c20
LP
948}
949
4d885bd3
DH
950static int get_fixed_user(const ExecContext *c, const char **user,
951 uid_t *uid, gid_t *gid,
952 const char **home, const char **shell) {
81a2b7ce 953 int r;
4d885bd3 954 const char *name;
81a2b7ce 955
4d885bd3 956 assert(c);
81a2b7ce 957
23deef88
LP
958 if (!c->user)
959 return 0;
960
4d885bd3
DH
961 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
962 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 963
23deef88 964 name = c->user;
fafff8f1 965 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
966 if (r < 0)
967 return r;
81a2b7ce 968
4d885bd3
DH
969 *user = name;
970 return 0;
971}
972
973static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
974 int r;
975 const char *name;
976
977 assert(c);
978
979 if (!c->group)
980 return 0;
981
982 name = c->group;
fafff8f1 983 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
984 if (r < 0)
985 return r;
986
987 *group = name;
988 return 0;
989}
990
cdc5d5c5
DH
991static int get_supplementary_groups(const ExecContext *c, const char *user,
992 const char *group, gid_t gid,
993 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
994 int r, k = 0;
995 int ngroups_max;
996 bool keep_groups = false;
997 gid_t *groups = NULL;
998 _cleanup_free_ gid_t *l_gids = NULL;
999
1000 assert(c);
1001
bbeea271
DH
1002 /*
1003 * If user is given, then lookup GID and supplementary groups list.
1004 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1005 * here and as early as possible so we keep the list of supplementary
1006 * groups of the caller.
bbeea271
DH
1007 */
1008 if (user && gid_is_valid(gid) && gid != 0) {
1009 /* First step, initialize groups from /etc/groups */
1010 if (initgroups(user, gid) < 0)
1011 return -errno;
1012
1013 keep_groups = true;
1014 }
1015
ac6e8be6 1016 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1017 return 0;
1018
366ddd25
DH
1019 /*
1020 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1021 * be positive, otherwise fail.
1022 */
1023 errno = 0;
1024 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1025 if (ngroups_max <= 0)
1026 return errno_or_else(EOPNOTSUPP);
366ddd25 1027
4d885bd3
DH
1028 l_gids = new(gid_t, ngroups_max);
1029 if (!l_gids)
1030 return -ENOMEM;
81a2b7ce 1031
4d885bd3
DH
1032 if (keep_groups) {
1033 /*
1034 * Lookup the list of groups that the user belongs to, we
1035 * avoid NSS lookups here too for gid=0.
1036 */
1037 k = ngroups_max;
1038 if (getgrouplist(user, gid, l_gids, &k) < 0)
1039 return -EINVAL;
1040 } else
1041 k = 0;
81a2b7ce 1042
4d885bd3
DH
1043 STRV_FOREACH(i, c->supplementary_groups) {
1044 const char *g;
81a2b7ce 1045
4d885bd3
DH
1046 if (k >= ngroups_max)
1047 return -E2BIG;
81a2b7ce 1048
4d885bd3 1049 g = *i;
fafff8f1 1050 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1051 if (r < 0)
1052 return r;
81a2b7ce 1053
4d885bd3
DH
1054 k++;
1055 }
81a2b7ce 1056
4d885bd3
DH
1057 /*
1058 * Sets ngids to zero to drop all supplementary groups, happens
1059 * when we are under root and SupplementaryGroups= is empty.
1060 */
1061 if (k == 0) {
1062 *ngids = 0;
1063 return 0;
1064 }
81a2b7ce 1065
4d885bd3
DH
1066 /* Otherwise get the final list of supplementary groups */
1067 groups = memdup(l_gids, sizeof(gid_t) * k);
1068 if (!groups)
1069 return -ENOMEM;
1070
1071 *supplementary_gids = groups;
1072 *ngids = k;
1073
1074 groups = NULL;
1075
1076 return 0;
1077}
1078
34cf6c43 1079static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
4d885bd3
DH
1080 int r;
1081
709dbeac
YW
1082 /* Handle SupplementaryGroups= if it is not empty */
1083 if (ngids > 0) {
4d885bd3
DH
1084 r = maybe_setgroups(ngids, supplementary_gids);
1085 if (r < 0)
97f0e76f 1086 return r;
4d885bd3 1087 }
81a2b7ce 1088
4d885bd3
DH
1089 if (gid_is_valid(gid)) {
1090 /* Then set our gids */
1091 if (setresgid(gid, gid, gid) < 0)
1092 return -errno;
81a2b7ce
LP
1093 }
1094
1095 return 0;
1096}
1097
dbdc4098
TK
/* Updates the process' securebits: the bits in 'mask' are cleared, then the bits in 'bits' are set.
 * Returns 1 if the securebits were changed, 0 if they already had the requested value, and a
 * negative errno if reading or writing them failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;
        if (after == before)
                return 0; /* nothing to do */

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1111
81a2b7ce 1112static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1113 assert(context);
dbdc4098 1114 int r;
81a2b7ce 1115
4d885bd3
DH
1116 if (!uid_is_valid(uid))
1117 return 0;
1118
479050b3 1119 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1120 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1121 * required, so we also need keep-caps in this case.
1122 */
81a2b7ce 1123
dbdc4098 1124 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1125
1126 /* First step: If we need to keep capabilities but
1127 * drop privileges we need to make sure we keep our
cbb21cca 1128 * caps, while we drop privileges. */
693ced48 1129 if (uid != 0) {
dbdc4098
TK
1130 /* Add KEEP_CAPS to the securebits */
1131 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1132 if (r < 0)
1133 return r;
693ced48 1134 }
81a2b7ce
LP
1135 }
1136
479050b3 1137 /* Second step: actually set the uids */
81a2b7ce
LP
1138 if (setresuid(uid, uid, uid) < 0)
1139 return -errno;
1140
1141 /* At this point we should have all necessary capabilities but
1142 are otherwise a normal user. However, the caps might got
1143 corrupted due to the setresuid() so we need clean them up
1144 later. This is done outside of this call. */
1145
1146 return 0;
1147}
1148
349cc4a5 1149#if HAVE_PAM
5b6319dc
LP
1150
1151static int null_conv(
1152 int num_msg,
1153 const struct pam_message **msg,
1154 struct pam_response **resp,
1155 void *appdata_ptr) {
1156
1157 /* We don't support conversations */
1158
1159 return PAM_CONV_ERR;
1160}
1161
cefc33ae
LP
1162#endif
1163
5b6319dc
LP
1164static int setup_pam(
1165 const char *name,
1166 const char *user,
940c5210 1167 uid_t uid,
2d6fce8d 1168 gid_t gid,
5b6319dc 1169 const char *tty,
421bb42d 1170 char ***env, /* updated on success */
5b8d1f6b 1171 const int fds[], size_t n_fds) {
5b6319dc 1172
349cc4a5 1173#if HAVE_PAM
cefc33ae 1174
5b6319dc
LP
1175 static const struct pam_conv conv = {
1176 .conv = null_conv,
1177 .appdata_ptr = NULL
1178 };
1179
2d7c6aa2 1180 _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
46e5bbab 1181 _cleanup_strv_free_ char **e = NULL;
5b6319dc 1182 pam_handle_t *handle = NULL;
d6e5f3ad 1183 sigset_t old_ss;
7bb70b6e 1184 int pam_code = PAM_SUCCESS, r;
5b6319dc
LP
1185 bool close_session = false;
1186 pid_t pam_pid = 0, parent_pid;
970edce6 1187 int flags = 0;
5b6319dc
LP
1188
1189 assert(name);
1190 assert(user);
2065ca69 1191 assert(env);
5b6319dc
LP
1192
1193 /* We set up PAM in the parent process, then fork. The child
35b8ca3a 1194 * will then stay around until killed via PR_GET_PDEATHSIG or
5b6319dc
LP
1195 * systemd via the cgroup logic. It will then remove the PAM
1196 * session again. The parent process will exec() the actual
1197 * daemon. We do things this way to ensure that the main PID
1198 * of the daemon is the one we initially fork()ed. */
1199
7bb70b6e
LP
1200 r = barrier_create(&barrier);
1201 if (r < 0)
2d7c6aa2
DH
1202 goto fail;
1203
553d2243 1204 if (log_get_max_level() < LOG_DEBUG)
970edce6
ZJS
1205 flags |= PAM_SILENT;
1206
f546241b
ZJS
1207 pam_code = pam_start(name, user, &conv, &handle);
1208 if (pam_code != PAM_SUCCESS) {
5b6319dc
LP
1209 handle = NULL;
1210 goto fail;
1211 }
1212
3cd24c1a
LP
1213 if (!tty) {
1214 _cleanup_free_ char *q = NULL;
1215
1216 /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
1217 * out if that's the case, and read the TTY off it. */
1218
1219 if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
1220 tty = strjoina("/dev/", q);
1221 }
1222
513cf7da
MS
1223 if (tty) {
1224 pam_code = pam_set_item(handle, PAM_TTY, tty);
1225 if (pam_code != PAM_SUCCESS)
1226 goto fail;
1227 }
5b6319dc 1228
84eada2f
JW
1229 STRV_FOREACH(nv, *env) {
1230 pam_code = pam_putenv(handle, *nv);
2065ca69
JW
1231 if (pam_code != PAM_SUCCESS)
1232 goto fail;
1233 }
1234
970edce6 1235 pam_code = pam_acct_mgmt(handle, flags);
f546241b 1236 if (pam_code != PAM_SUCCESS)
5b6319dc
LP
1237 goto fail;
1238
3bb39ea9
DG
1239 pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
1240 if (pam_code != PAM_SUCCESS)
46d7c6af 1241 log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));
3bb39ea9 1242
970edce6 1243 pam_code = pam_open_session(handle, flags);
f546241b 1244 if (pam_code != PAM_SUCCESS)
5b6319dc
LP
1245 goto fail;
1246
1247 close_session = true;
1248
f546241b
ZJS
1249 e = pam_getenvlist(handle);
1250 if (!e) {
5b6319dc
LP
1251 pam_code = PAM_BUF_ERR;
1252 goto fail;
1253 }
1254
cafc5ca1 1255 /* Block SIGTERM, so that we know that it won't get lost in the child */
ce30c8dc 1256
72c0a2c2 1257 assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);
5b6319dc 1258
df0ff127 1259 parent_pid = getpid_cached();
5b6319dc 1260
4c253ed1
LP
1261 r = safe_fork("(sd-pam)", 0, &pam_pid);
1262 if (r < 0)
5b6319dc 1263 goto fail;
4c253ed1 1264 if (r == 0) {
7bb70b6e 1265 int sig, ret = EXIT_PAM;
5b6319dc 1266
cafc5ca1 1267 /* The child's job is to reset the PAM session on termination */
2d7c6aa2 1268 barrier_set_role(&barrier, BARRIER_CHILD);
5b6319dc 1269
1da37e58
ZJS
1270 /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
1271 * those fds are open here that have been opened by PAM. */
4c253ed1 1272 (void) close_many(fds, n_fds);
5b6319dc 1273
cafc5ca1
LP
1274 /* Drop privileges - we don't need any to pam_close_session and this will make
1275 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
1276 * threads to fail to exit normally */
2d6fce8d 1277
97f0e76f
LP
1278 r = maybe_setgroups(0, NULL);
1279 if (r < 0)
1280 log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
2d6fce8d
LP
1281 if (setresgid(gid, gid, gid) < 0)
1282 log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
940c5210 1283 if (setresuid(uid, uid, uid) < 0)
2d6fce8d 1284 log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");
940c5210 1285
9c274488 1286 (void) ignore_signals(SIGPIPE);
ce30c8dc 1287
cafc5ca1
LP
1288 /* Wait until our parent died. This will only work if the above setresuid() succeeds,
1289 * otherwise the kernel will not allow unprivileged parents kill their privileged children
1290 * this way. We rely on the control groups kill logic to do the rest for us. */
5b6319dc
LP
1291 if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
1292 goto child_finish;
1293
cafc5ca1
LP
1294 /* Tell the parent that our setup is done. This is especially important regarding dropping
1295 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
643f4706 1296 *
cafc5ca1 1297 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
643f4706 1298 (void) barrier_place(&barrier);
2d7c6aa2 1299
643f4706 1300 /* Check if our parent process might already have died? */
5b6319dc 1301 if (getppid() == parent_pid) {
d6e5f3ad
DM
1302 sigset_t ss;
1303
1304 assert_se(sigemptyset(&ss) >= 0);
1305 assert_se(sigaddset(&ss, SIGTERM) >= 0);
1306
3dead8d9
LP
1307 for (;;) {
1308 if (sigwait(&ss, &sig) < 0) {
1309 if (errno == EINTR)
1310 continue;
1311
1312 goto child_finish;
1313 }
5b6319dc 1314
3dead8d9
LP
1315 assert(sig == SIGTERM);
1316 break;
1317 }
5b6319dc
LP
1318 }
1319
3bb39ea9
DG
1320 pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
1321 if (pam_code != PAM_SUCCESS)
1322 goto child_finish;
1323
3dead8d9 1324 /* If our parent died we'll end the session */
f546241b 1325 if (getppid() != parent_pid) {
970edce6 1326 pam_code = pam_close_session(handle, flags);
f546241b 1327 if (pam_code != PAM_SUCCESS)
5b6319dc 1328 goto child_finish;
f546241b 1329 }
5b6319dc 1330
7bb70b6e 1331 ret = 0;
5b6319dc
LP
1332
1333 child_finish:
7feb2b57
LP
1334 /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
1335 * know about this. See pam_end(3) */
1336 (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
7bb70b6e 1337 _exit(ret);
5b6319dc
LP
1338 }
1339
2d7c6aa2
DH
1340 barrier_set_role(&barrier, BARRIER_PARENT);
1341
cafc5ca1
LP
1342 /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
1343 * here. */
5b6319dc
LP
1344 handle = NULL;
1345
3b8bddde 1346 /* Unblock SIGTERM again in the parent */
72c0a2c2 1347 assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
5b6319dc 1348
cafc5ca1
LP
1349 /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
1350 * this fd around. */
5b6319dc
LP
1351 closelog();
1352
cafc5ca1
LP
1353 /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
1354 * recover. However, warn loudly if it happens. */
2d7c6aa2
DH
1355 if (!barrier_place_and_sync(&barrier))
1356 log_error("PAM initialization failed");
1357
130d3d22 1358 return strv_free_and_replace(*env, e);
5b6319dc
LP
1359
1360fail:
970edce6
ZJS
1361 if (pam_code != PAM_SUCCESS) {
1362 log_error("PAM failed: %s", pam_strerror(handle, pam_code));
7bb70b6e
LP
1363 r = -EPERM; /* PAM errors do not map to errno */
1364 } else
1365 log_error_errno(r, "PAM failed: %m");
9ba35398 1366
5b6319dc
LP
1367 if (handle) {
1368 if (close_session)
970edce6 1369 pam_code = pam_close_session(handle, flags);
5b6319dc 1370
7feb2b57 1371 (void) pam_end(handle, pam_code | flags);
5b6319dc
LP
1372 }
1373
5b6319dc 1374 closelog();
7bb70b6e 1375 return r;
cefc33ae
LP
1376#else
1377 return 0;
5b6319dc 1378#endif
cefc33ae 1379}
5b6319dc 1380
5d6b1584 1381static void rename_process_from_path(const char *path) {
a99626c1 1382 _cleanup_free_ char *buf = NULL;
5d6b1584 1383 const char *p;
5d6b1584 1384
a99626c1
LP
1385 assert(path);
1386
1387 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1388 * /bin/ps */
5d6b1584 1389
a99626c1 1390 if (path_extract_filename(path, &buf) < 0) {
5d6b1584
LP
1391 rename_process("(...)");
1392 return;
1393 }
1394
a99626c1 1395 size_t l = strlen(buf);
5d6b1584 1396 if (l > 8) {
a99626c1 1397 /* The end of the process name is usually more interesting, since the first bit might just be
5d6b1584 1398 * "systemd-" */
a99626c1 1399 p = buf + l - 8;
5d6b1584 1400 l = 8;
a99626c1
LP
1401 } else
1402 p = buf;
5d6b1584 1403
a99626c1 1404 char process_name[11];
5d6b1584
LP
1405 process_name[0] = '(';
1406 memcpy(process_name+1, p, l);
1407 process_name[1+l] = ')';
1408 process_name[1+l+1] = 0;
1409
1410 rename_process(process_name);
1411}
1412
469830d1
LP
1413static bool context_has_address_families(const ExecContext *c) {
1414 assert(c);
1415
6b000af4 1416 return c->address_families_allow_list ||
469830d1
LP
1417 !set_isempty(c->address_families);
1418}
1419
1420static bool context_has_syscall_filters(const ExecContext *c) {
1421 assert(c);
1422
6b000af4 1423 return c->syscall_allow_list ||
8cfa775f 1424 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1425}
1426
9df2cdd8
TM
1427static bool context_has_syscall_logs(const ExecContext *c) {
1428 assert(c);
1429
1430 return c->syscall_log_allow_list ||
1431 !hashmap_isempty(c->syscall_log);
1432}
1433
469830d1
LP
1434static bool context_has_no_new_privileges(const ExecContext *c) {
1435 assert(c);
1436
1437 if (c->no_new_privileges)
1438 return true;
1439
26c45a6c 1440 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
469830d1
LP
1441 return false;
1442
1443 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1444 return c->lock_personality ||
469830d1 1445 c->memory_deny_write_execute ||
0538d2a8 1446 c->private_devices ||
fc64760d 1447 c->protect_clock ||
0538d2a8 1448 c->protect_hostname ||
469830d1
LP
1449 c->protect_kernel_tunables ||
1450 c->protect_kernel_modules ||
84703040 1451 c->protect_kernel_logs ||
0538d2a8
YW
1452 context_has_address_families(c) ||
1453 exec_context_restrict_namespaces_set(c) ||
1454 c->restrict_realtime ||
1455 c->restrict_suid_sgid ||
78e864e5 1456 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1457 context_has_syscall_filters(c) ||
1458 context_has_syscall_logs(c);
469830d1
LP
1459}
1460
bb0c0d6f
LP
1461static bool exec_context_has_credentials(const ExecContext *context) {
1462
1463 assert(context);
1464
1465 return !hashmap_isempty(context->set_credentials) ||
43144be4 1466 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1467}
1468
349cc4a5 1469#if HAVE_SECCOMP
17df7223 1470
83f12b27 1471static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1472
1473 if (is_seccomp_available())
1474 return false;
1475
f673b62d 1476 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1477 return true;
83f12b27
FS
1478}
1479
165a31c0 1480static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1481 uint32_t negative_action, default_action, action;
165a31c0 1482 int r;
8351ceae 1483
469830d1 1484 assert(u);
c0467cf3 1485 assert(c);
8351ceae 1486
469830d1 1487 if (!context_has_syscall_filters(c))
83f12b27
FS
1488 return 0;
1489
469830d1
LP
1490 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1491 return 0;
e9642be2 1492
005bfaf1 1493 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1494
6b000af4 1495 if (c->syscall_allow_list) {
469830d1
LP
1496 default_action = negative_action;
1497 action = SCMP_ACT_ALLOW;
7c66bae2 1498 } else {
469830d1
LP
1499 default_action = SCMP_ACT_ALLOW;
1500 action = negative_action;
57183d11 1501 }
8351ceae 1502
165a31c0 1503 if (needs_ambient_hack) {
6b000af4 1504 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1505 if (r < 0)
1506 return r;
1507 }
1508
b54f36c6 1509 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1510}
1511
9df2cdd8
TM
1512static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1513#ifdef SCMP_ACT_LOG
1514 uint32_t default_action, action;
1515#endif
1516
1517 assert(u);
1518 assert(c);
1519
1520 if (!context_has_syscall_logs(c))
1521 return 0;
1522
1523#ifdef SCMP_ACT_LOG
1524 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1525 return 0;
1526
1527 if (c->syscall_log_allow_list) {
1528 /* Log nothing but the ones listed */
1529 default_action = SCMP_ACT_ALLOW;
1530 action = SCMP_ACT_LOG;
1531 } else {
1532 /* Log everything but the ones listed */
1533 default_action = SCMP_ACT_LOG;
1534 action = SCMP_ACT_ALLOW;
1535 }
1536
1537 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1538#else
1539 /* old libseccomp */
1540 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1541 return 0;
1542#endif
1543}
1544
469830d1
LP
1545static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1546 assert(u);
4298d0b5
LP
1547 assert(c);
1548
469830d1 1549 if (set_isempty(c->syscall_archs))
83f12b27
FS
1550 return 0;
1551
469830d1
LP
1552 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1553 return 0;
4298d0b5 1554
469830d1
LP
1555 return seccomp_restrict_archs(c->syscall_archs);
1556}
4298d0b5 1557
469830d1
LP
1558static int apply_address_families(const Unit* u, const ExecContext *c) {
1559 assert(u);
1560 assert(c);
4298d0b5 1561
469830d1
LP
1562 if (!context_has_address_families(c))
1563 return 0;
4298d0b5 1564
469830d1
LP
1565 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1566 return 0;
4298d0b5 1567
6b000af4 1568 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1569}
4298d0b5 1570
83f12b27 1571static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1572 assert(u);
f3e43635
TM
1573 assert(c);
1574
469830d1 1575 if (!c->memory_deny_write_execute)
83f12b27
FS
1576 return 0;
1577
469830d1
LP
1578 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1579 return 0;
f3e43635 1580
469830d1 1581 return seccomp_memory_deny_write_execute();
f3e43635
TM
1582}
1583
83f12b27 1584static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1585 assert(u);
f4170c67
LP
1586 assert(c);
1587
469830d1 1588 if (!c->restrict_realtime)
83f12b27
FS
1589 return 0;
1590
469830d1
LP
1591 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1592 return 0;
f4170c67 1593
469830d1 1594 return seccomp_restrict_realtime();
f4170c67
LP
1595}
1596
f69567cb
LP
1597static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1598 assert(u);
1599 assert(c);
1600
1601 if (!c->restrict_suid_sgid)
1602 return 0;
1603
1604 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1605 return 0;
1606
1607 return seccomp_restrict_suid_sgid();
1608}
1609
59e856c7 1610static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1611 assert(u);
59eeb84b
LP
1612 assert(c);
1613
1614 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1615 * let's protect even those systems where this is left on in the kernel. */
1616
469830d1 1617 if (!c->protect_kernel_tunables)
59eeb84b
LP
1618 return 0;
1619
469830d1
LP
1620 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1621 return 0;
59eeb84b 1622
469830d1 1623 return seccomp_protect_sysctl();
59eeb84b
LP
1624}
1625
59e856c7 1626static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1627 assert(u);
502d704e
DH
1628 assert(c);
1629
25a8d8a0 1630 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1631
469830d1
LP
1632 if (!c->protect_kernel_modules)
1633 return 0;
1634
502d704e
DH
1635 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1636 return 0;
1637
b54f36c6 1638 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1639}
1640
84703040
KK
1641static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1642 assert(u);
1643 assert(c);
1644
1645 if (!c->protect_kernel_logs)
1646 return 0;
1647
1648 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1649 return 0;
1650
1651 return seccomp_protect_syslog();
1652}
1653
daf8f72b 1654static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1655 assert(u);
1656 assert(c);
1657
1658 if (!c->protect_clock)
1659 return 0;
1660
1661 if (skip_seccomp_unavailable(u, "ProtectClock="))
1662 return 0;
1663
1664 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1665}
1666
59e856c7 1667static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1668 assert(u);
ba128bb8
LP
1669 assert(c);
1670
8f81a5f6 1671 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1672
469830d1
LP
1673 if (!c->private_devices)
1674 return 0;
1675
ba128bb8
LP
1676 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1677 return 0;
1678
b54f36c6 1679 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1680}
1681
34cf6c43 1682static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1683 assert(u);
add00535
LP
1684 assert(c);
1685
1686 if (!exec_context_restrict_namespaces_set(c))
1687 return 0;
1688
1689 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1690 return 0;
1691
1692 return seccomp_restrict_namespaces(c->restrict_namespaces);
1693}
1694
78e864e5 1695static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1696 unsigned long personality;
1697 int r;
78e864e5
TM
1698
1699 assert(u);
1700 assert(c);
1701
1702 if (!c->lock_personality)
1703 return 0;
1704
1705 if (skip_seccomp_unavailable(u, "LockPersonality="))
1706 return 0;
1707
e8132d63
LP
1708 personality = c->personality;
1709
1710 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1711 if (personality == PERSONALITY_INVALID) {
1712
1713 r = opinionated_personality(&personality);
1714 if (r < 0)
1715 return r;
1716 }
78e864e5
TM
1717
1718 return seccomp_lock_personality(personality);
1719}
1720
c0467cf3 1721#endif
8351ceae 1722
7a8288f6 1723#if HAVE_LIBBPF
7a8288f6
DM
1724static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1725 assert(u);
1726 assert(c);
1727
1728 if (!exec_context_restrict_filesystems_set(c))
1729 return 0;
1730
46004616
ZJS
1731 if (!u->manager->restrict_fs) {
1732 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1733 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1734 return 0;
46004616 1735 }
7a8288f6
DM
1736
1737 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1738}
1739#endif
1740
daf8f72b 1741static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1742 assert(u);
1743 assert(c);
1744
1745 if (!c->protect_hostname)
1746 return 0;
1747
1748 if (ns_type_supported(NAMESPACE_UTS)) {
1749 if (unshare(CLONE_NEWUTS) < 0) {
1750 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1751 *ret_exit_status = EXIT_NAMESPACE;
1752 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1753 }
1754
1755 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1756 }
1757 } else
1758 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1759
1760#if HAVE_SECCOMP
8f3e342f
ZJS
1761 int r;
1762
daf8f72b
LP
1763 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1764 return 0;
1765
1766 r = seccomp_protect_hostname();
1767 if (r < 0) {
1768 *ret_exit_status = EXIT_SECCOMP;
1769 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1770 }
1771#endif
1772
1773 return 0;
1774}
1775
3042bbeb 1776static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1777 assert(idle_pipe);
1778
54eb2300
LP
1779 idle_pipe[1] = safe_close(idle_pipe[1]);
1780 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1781
1782 if (idle_pipe[0] >= 0) {
1783 int r;
1784
1785 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1786
1787 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1788 ssize_t n;
1789
31a7eb86 1790 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1791 n = write(idle_pipe[3], "x", 1);
1792 if (n > 0)
cd972d69 1793 /* Wait for systemd to react to the signal above. */
54756dce 1794 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1795 }
1796
54eb2300 1797 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1798
1799 }
1800
54eb2300 1801 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1802}
1803
fb2042dd
YW
1804static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1805
7cae38c4 1806static int build_environment(
34cf6c43 1807 const Unit *u,
9fa95f85 1808 const ExecContext *c,
1e22b5cd 1809 const ExecParameters *p,
da6053d0 1810 size_t n_fds,
cd48e23f 1811 char **fdnames,
7cae38c4
LP
1812 const char *home,
1813 const char *username,
1814 const char *shell,
7bce046b
LP
1815 dev_t journal_stream_dev,
1816 ino_t journal_stream_ino,
7cae38c4
LP
1817 char ***ret) {
1818
1819 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1820 size_t n_env = 0;
7cae38c4
LP
1821 char *x;
1822
4b58153d 1823 assert(u);
7cae38c4 1824 assert(c);
7c1cb6f1 1825 assert(p);
7cae38c4
LP
1826 assert(ret);
1827
dc4e2940 1828#define N_ENV_VARS 17
8d5bb13d 1829 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1830 if (!our_env)
1831 return -ENOMEM;
1832
1833 if (n_fds > 0) {
8dd4c05b
LP
1834 _cleanup_free_ char *joined = NULL;
1835
df0ff127 1836 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1837 return -ENOMEM;
1838 our_env[n_env++] = x;
1839
da6053d0 1840 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1841 return -ENOMEM;
1842 our_env[n_env++] = x;
8dd4c05b 1843
cd48e23f 1844 joined = strv_join(fdnames, ":");
8dd4c05b
LP
1845 if (!joined)
1846 return -ENOMEM;
1847
605405c6 1848 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1849 if (!x)
1850 return -ENOMEM;
1851 our_env[n_env++] = x;
7cae38c4
LP
1852 }
1853
b08af3b1 1854 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1855 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1856 return -ENOMEM;
1857 our_env[n_env++] = x;
1858
1e22b5cd 1859 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1860 return -ENOMEM;
1861 our_env[n_env++] = x;
1862 }
1863
de90700f
LP
1864 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1865 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1866 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1867 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1868 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1869 if (!x)
1870 return -ENOMEM;
1871 our_env[n_env++] = x;
1872 }
1873
7cae38c4 1874 if (home) {
b910cc72 1875 x = strjoin("HOME=", home);
7cae38c4
LP
1876 if (!x)
1877 return -ENOMEM;
7bbead1d 1878
4ff361cc 1879 path_simplify(x + 5);
7cae38c4
LP
1880 our_env[n_env++] = x;
1881 }
1882
1883 if (username) {
b910cc72 1884 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1885 if (!x)
1886 return -ENOMEM;
1887 our_env[n_env++] = x;
1888
b910cc72 1889 x = strjoin("USER=", username);
7cae38c4
LP
1890 if (!x)
1891 return -ENOMEM;
1892 our_env[n_env++] = x;
1893 }
1894
1895 if (shell) {
b910cc72 1896 x = strjoin("SHELL=", shell);
7cae38c4
LP
1897 if (!x)
1898 return -ENOMEM;
7bbead1d 1899
4ff361cc 1900 path_simplify(x + 6);
7cae38c4
LP
1901 our_env[n_env++] = x;
1902 }
1903
4b58153d
LP
1904 if (!sd_id128_is_null(u->invocation_id)) {
1905 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1906 return -ENOMEM;
1907
1908 our_env[n_env++] = x;
1909 }
1910
6af760f3
LP
1911 if (exec_context_needs_term(c)) {
1912 const char *tty_path, *term = NULL;
1913
1914 tty_path = exec_context_tty_path(c);
1915
e8cf09b2
LP
1916 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1917 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1918 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1919
e8cf09b2 1920 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1921 term = getenv("TERM");
e8cf09b2 1922
6af760f3
LP
1923 if (!term)
1924 term = default_term_for_tty(tty_path);
7cae38c4 1925
b910cc72 1926 x = strjoin("TERM=", term);
7cae38c4
LP
1927 if (!x)
1928 return -ENOMEM;
1929 our_env[n_env++] = x;
1930 }
1931
7bce046b
LP
1932 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1933 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1934 return -ENOMEM;
1935
1936 our_env[n_env++] = x;
1937 }
1938
91dd5f7c
LP
1939 if (c->log_namespace) {
1940 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1941 if (!x)
1942 return -ENOMEM;
1943
1944 our_env[n_env++] = x;
1945 }
1946
5b10116e 1947 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1948 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1949 const char *n;
1950
1951 if (!p->prefix[t])
1952 continue;
1953
211a3d87 1954 if (c->directories[t].n_items == 0)
fb2042dd
YW
1955 continue;
1956
1957 n = exec_directory_env_name_to_string(t);
1958 if (!n)
1959 continue;
1960
211a3d87
LB
1961 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1962 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1963
211a3d87
LB
1964 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1965 if (!prefixed)
1966 return -ENOMEM;
1967
1968 if (!strextend_with_separator(&joined, ":", prefixed))
1969 return -ENOMEM;
1970 }
fb2042dd
YW
1971
1972 x = strjoin(n, "=", joined);
1973 if (!x)
1974 return -ENOMEM;
1975
1976 our_env[n_env++] = x;
1977 }
1978
bb0c0d6f
LP
1979 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1980 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1981 if (!x)
1982 return -ENOMEM;
1983
1984 our_env[n_env++] = x;
1985 }
1986
dc4e2940
YW
1987 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1988 return -ENOMEM;
1989
1990 our_env[n_env++] = x;
1991
7cae38c4 1992 our_env[n_env++] = NULL;
8d5bb13d
LP
1993 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1994#undef N_ENV_VARS
7cae38c4 1995
ae2a15bc 1996 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1997
1998 return 0;
1999}
2000
b4c14404
FB
2001static int build_pass_environment(const ExecContext *c, char ***ret) {
2002 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2003 size_t n_env = 0;
b4c14404
FB
2004
2005 STRV_FOREACH(i, c->pass_environment) {
2006 _cleanup_free_ char *x = NULL;
2007 char *v;
2008
2009 v = getenv(*i);
2010 if (!v)
2011 continue;
605405c6 2012 x = strjoin(*i, "=", v);
b4c14404
FB
2013 if (!x)
2014 return -ENOMEM;
00819cc1 2015
319a4f4b 2016 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2017 return -ENOMEM;
00819cc1 2018
1cc6c93a 2019 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2020 pass_env[n_env] = NULL;
b4c14404
FB
2021 }
2022
ae2a15bc 2023 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2024
2025 return 0;
2026}
2027
5e8deb94 2028bool exec_needs_mount_namespace(
8b44a3d2
LP
2029 const ExecContext *context,
2030 const ExecParameters *params,
4657abb5 2031 const ExecRuntime *runtime) {
8b44a3d2
LP
2032
2033 assert(context);
8b44a3d2 2034
915e6d16
LP
2035 if (context->root_image)
2036 return true;
2037
2a624c36
AP
2038 if (!strv_isempty(context->read_write_paths) ||
2039 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2040 !strv_isempty(context->inaccessible_paths) ||
2041 !strv_isempty(context->exec_paths) ||
2042 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2043 return true;
2044
42b1d8e0 2045 if (context->n_bind_mounts > 0)
d2d6c096
LP
2046 return true;
2047
2abd4e38
YW
2048 if (context->n_temporary_filesystems > 0)
2049 return true;
2050
b3d13314
LB
2051 if (context->n_mount_images > 0)
2052 return true;
2053
93f59701
LB
2054 if (context->n_extension_images > 0)
2055 return true;
2056
a07b9926
LB
2057 if (!strv_isempty(context->extension_directories))
2058 return true;
2059
37ed15d7 2060 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2061 return true;
2062
2063 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2064 return true;
2065
8b44a3d2 2066 if (context->private_devices ||
228af36f 2067 context->private_mounts ||
8b44a3d2 2068 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2069 context->protect_home != PROTECT_HOME_NO ||
2070 context->protect_kernel_tunables ||
c575770b 2071 context->protect_kernel_modules ||
94a7b275 2072 context->protect_kernel_logs ||
4e399953
LP
2073 context->protect_control_groups ||
2074 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2075 context->proc_subset != PROC_SUBSET_ALL ||
2076 context->private_ipc ||
2077 context->ipc_namespace_path)
8b44a3d2
LP
2078 return true;
2079
37c56f89 2080 if (context->root_directory) {
5e98086d 2081 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2082 return true;
2083
5b10116e 2084 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2085 if (params && !params->prefix[t])
37c56f89
YW
2086 continue;
2087
211a3d87 2088 if (context->directories[t].n_items > 0)
37c56f89
YW
2089 return true;
2090 }
2091 }
5d997827 2092
42b1d8e0 2093 if (context->dynamic_user &&
211a3d87
LB
2094 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2095 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2096 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2097 return true;
2098
91dd5f7c
LP
2099 if (context->log_namespace)
2100 return true;
2101
8b44a3d2
LP
2102 return false;
2103}
2104
5749f855 2105static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d 2106 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
19ee48a6 2107 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
254d1313 2108 _cleanup_close_ int unshare_ready_fd = -EBADF;
d251207d
LP
2109 _cleanup_(sigkill_waitp) pid_t pid = 0;
2110 uint64_t c = 1;
d251207d
LP
2111 ssize_t n;
2112 int r;
2113
5749f855
AZ
2114 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2115 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2116 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2117 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2118 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2119 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2120 * continues execution normally.
2121 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2122 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2123
5749f855 2124 /* Can only set up multiple mappings with CAP_SETUID. */
26c45a6c 2125 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
587ab01b 2126 r = asprintf(&uid_map,
5749f855 2127 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2128 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2129 ouid, ouid, uid, uid);
2130 else
2131 r = asprintf(&uid_map,
2132 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2133 ouid, ouid);
d251207d 2134
5749f855
AZ
2135 if (r < 0)
2136 return -ENOMEM;
2137
2138 /* Can only set up multiple mappings with CAP_SETGID. */
26c45a6c 2139 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
587ab01b 2140 r = asprintf(&gid_map,
5749f855 2141 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2142 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2143 ogid, ogid, gid, gid);
2144 else
2145 r = asprintf(&gid_map,
2146 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2147 ogid, ogid);
2148
2149 if (r < 0)
2150 return -ENOMEM;
d251207d
LP
2151
2152 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2153 * namespace. */
2154 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2155 if (unshare_ready_fd < 0)
2156 return -errno;
2157
2158 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2159 * failed. */
2160 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2161 return -errno;
2162
4c253ed1
LP
2163 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2164 if (r < 0)
2165 return r;
2166 if (r == 0) {
254d1313 2167 _cleanup_close_ int fd = -EBADF;
d251207d
LP
2168 const char *a;
2169 pid_t ppid;
2170
2171 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2172 * here, after the parent opened its own user namespace. */
2173
2174 ppid = getppid();
2175 errno_pipe[0] = safe_close(errno_pipe[0]);
2176
2177 /* Wait until the parent unshared the user namespace */
2178 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2179 r = -errno;
2180 goto child_fail;
2181 }
2182
2183 /* Disable the setgroups() system call in the child user namespace, for good. */
2184 a = procfs_file_alloca(ppid, "setgroups");
2185 fd = open(a, O_WRONLY|O_CLOEXEC);
2186 if (fd < 0) {
2187 if (errno != ENOENT) {
2188 r = -errno;
2189 goto child_fail;
2190 }
2191
2192 /* If the file is missing the kernel is too old, let's continue anyway. */
2193 } else {
2194 if (write(fd, "deny\n", 5) < 0) {
2195 r = -errno;
2196 goto child_fail;
2197 }
2198
2199 fd = safe_close(fd);
2200 }
2201
2202 /* First write the GID map */
2203 a = procfs_file_alloca(ppid, "gid_map");
2204 fd = open(a, O_WRONLY|O_CLOEXEC);
2205 if (fd < 0) {
2206 r = -errno;
2207 goto child_fail;
2208 }
2209 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2210 r = -errno;
2211 goto child_fail;
2212 }
2213 fd = safe_close(fd);
2214
2215 /* The write the UID map */
2216 a = procfs_file_alloca(ppid, "uid_map");
2217 fd = open(a, O_WRONLY|O_CLOEXEC);
2218 if (fd < 0) {
2219 r = -errno;
2220 goto child_fail;
2221 }
2222 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2223 r = -errno;
2224 goto child_fail;
2225 }
2226
2227 _exit(EXIT_SUCCESS);
2228
2229 child_fail:
2230 (void) write(errno_pipe[1], &r, sizeof(r));
2231 _exit(EXIT_FAILURE);
2232 }
2233
2234 errno_pipe[1] = safe_close(errno_pipe[1]);
2235
2236 if (unshare(CLONE_NEWUSER) < 0)
2237 return -errno;
2238
2239 /* Let the child know that the namespace is ready now */
2240 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2241 return -errno;
2242
2243 /* Try to read an error code from the child */
2244 n = read(errno_pipe[0], &r, sizeof(r));
2245 if (n < 0)
2246 return -errno;
2247 if (n == sizeof(r)) { /* an error code was sent to us */
2248 if (r < 0)
2249 return r;
2250 return -EIO;
2251 }
2252 if (n != 0) /* on success we should have read 0 bytes */
2253 return -EIO;
2254
8f03de53 2255 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2256 if (r < 0)
2257 return r;
2e87a1fd 2258 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2259 return -EIO;
2260
2261 return 0;
2262}
2263
494d0247
YW
2264static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2265 if (!context->dynamic_user)
2266 return false;
2267
2268 if (type == EXEC_DIRECTORY_CONFIGURATION)
2269 return false;
2270
2271 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2272 return false;
2273
2274 return true;
2275}
2276
211a3d87
LB
2277static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2278 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2279 int r;
2280
2281 assert(source);
2282
2283 src_abs = path_join(root, source);
2284 if (!src_abs)
2285 return -ENOMEM;
2286
2287 STRV_FOREACH(dst, symlinks) {
2288 _cleanup_free_ char *dst_abs = NULL;
2289
2290 dst_abs = path_join(root, *dst);
2291 if (!dst_abs)
2292 return -ENOMEM;
2293
2294 r = mkdir_parents_label(dst_abs, 0755);
2295 if (r < 0)
2296 return r;
2297
2298 r = symlink_idempotent(src_abs, dst_abs, true);
2299 if (r < 0)
2300 return r;
2301 }
2302
2303 return 0;
2304}
2305
3536f49e 2306static int setup_exec_directory(
07689d5d
LP
2307 const ExecContext *context,
2308 const ExecParameters *params,
2309 uid_t uid,
3536f49e 2310 gid_t gid,
3536f49e 2311 ExecDirectoryType type,
211a3d87 2312 bool needs_mount_namespace,
3536f49e 2313 int *exit_status) {
07689d5d 2314
72fd1768 2315 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2316 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2317 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2318 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2319 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2320 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2321 };
07689d5d
LP
2322 int r;
2323
2324 assert(context);
2325 assert(params);
72fd1768 2326 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2327 assert(exit_status);
07689d5d 2328
3536f49e
YW
2329 if (!params->prefix[type])
2330 return 0;
2331
8679efde 2332 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2333 if (!uid_is_valid(uid))
2334 uid = 0;
2335 if (!gid_is_valid(gid))
2336 gid = 0;
2337 }
2338
211a3d87 2339 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2340 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2341
211a3d87 2342 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2343 if (!p) {
2344 r = -ENOMEM;
2345 goto fail;
2346 }
07689d5d 2347
23a7448e
YW
2348 r = mkdir_parents_label(p, 0755);
2349 if (r < 0)
3536f49e 2350 goto fail;
23a7448e 2351
494d0247 2352 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2353 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2354 * case we want to avoid leaving a directory around fully accessible that is owned by
2355 * a dynamic user whose UID is later on reused. To lock this down we use the same
2356 * trick used by container managers to prohibit host users to get access to files of
2357 * the same UID in containers: we place everything inside a directory that has an
2358 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2359 * for unprivileged host code. We then use fs namespacing to make this directory
2360 * permeable for the service itself.
6c47cd7d 2361 *
3f5b1508
LP
2362 * Specifically: for a service which wants a special directory "foo/" we first create
2363 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2364 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2365 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2366 * unprivileged host users can't look into it. Inside of the namespace of the unit
2367 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2368 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2369 * for the service and making sure it only gets access to the dirs it needs but no
2370 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2371 *
3f5b1508
LP
2372 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2373 * to be owned by the service itself.
2374 *
2375 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2376 * for sharing files or sockets with other services. */
6c47cd7d 2377
4ede9802
LP
2378 pp = path_join(params->prefix[type], "private");
2379 if (!pp) {
6c47cd7d
LP
2380 r = -ENOMEM;
2381 goto fail;
2382 }
2383
2384 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2385 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2386 if (r < 0)
2387 goto fail;
2388
211a3d87 2389 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2390 r = -ENOMEM;
2391 goto fail;
2392 }
2393
2394 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2395 r = mkdir_parents_label(pp, 0755);
2396 if (r < 0)
2397 goto fail;
2398
949befd3
LP
2399 if (is_dir(p, false) > 0 &&
2400 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2401
2402 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2403 * it over. Most likely the service has been upgraded from one that didn't use
2404 * DynamicUser=1, to one that does. */
2405
cf52c45d
LP
2406 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2407 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2408 exec_directory_type_to_string(type), p, pp);
2409
949befd3
LP
2410 if (rename(p, pp) < 0) {
2411 r = -errno;
2412 goto fail;
2413 }
2414 } else {
2415 /* Otherwise, create the actual directory for the service */
2416
2417 r = mkdir_label(pp, context->directories[type].mode);
2418 if (r < 0 && r != -EEXIST)
2419 goto fail;
2420 }
6c47cd7d 2421
a2ab603c
YW
2422 if (!context->directories[type].items[i].only_create) {
2423 /* And link it up from the original place.
2424 * Notes
2425 * 1) If a mount namespace is going to be used, then this symlink remains on
2426 * the host, and a new one for the child namespace will be created later.
2427 * 2) It is not necessary to create this symlink when one of its parent
2428 * directories is specified and already created. E.g.
2429 * StateDirectory=foo foo/bar
2430 * In that case, the inode points to pp and p for "foo/bar" are the same:
2431 * pp = "/var/lib/private/foo/bar"
2432 * p = "/var/lib/foo/bar"
2433 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2434 * we do not need to create the symlink, but we cannot create the symlink.
2435 * See issue #24783. */
2436 r = symlink_idempotent(pp, p, true);
2437 if (r < 0)
2438 goto fail;
2439 }
6c47cd7d 2440
6c47cd7d 2441 } else {
5c6d40d1
LP
2442 _cleanup_free_ char *target = NULL;
2443
2444 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2445 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2446 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2447
2448 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2449 * by DynamicUser=1 (see above)?
2450 *
2451 * We do this for all directory types except for ConfigurationDirectory=,
2452 * since they all support the private/ symlink logic at least in some
2453 * configurations, see above. */
5c6d40d1 2454
578dc69f
YW
2455 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2456 if (r < 0)
2457 goto fail;
2458
211a3d87 2459 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2460 if (!q) {
2461 r = -ENOMEM;
2462 goto fail;
2463 }
2464
578dc69f
YW
2465 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2466 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2467 if (r < 0)
2468 goto fail;
2469
2470 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2471
2472 /* Hmm, apparently DynamicUser= was once turned on for this service,
2473 * but is no longer. Let's move the directory back up. */
2474
cf52c45d
LP
2475 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2476 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2477 exec_directory_type_to_string(type), q, p);
2478
5c6d40d1
LP
2479 if (unlink(p) < 0) {
2480 r = -errno;
2481 goto fail;
2482 }
2483
2484 if (rename(q, p) < 0) {
2485 r = -errno;
2486 goto fail;
2487 }
2488 }
2489 }
2490
6c47cd7d 2491 r = mkdir_label(p, context->directories[type].mode);
d484580c 2492 if (r < 0) {
d484580c
LP
2493 if (r != -EEXIST)
2494 goto fail;
2495
206e9864
LP
2496 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2497 struct stat st;
2498
2499 /* Don't change the owner/access mode of the configuration directory,
2500 * as in the common case it is not written to by a service, and shall
2501 * not be writable. */
2502
2503 if (stat(p, &st) < 0) {
2504 r = -errno;
2505 goto fail;
2506 }
2507
2508 /* Still complain if the access mode doesn't match */
2509 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2510 log_warning("%s \'%s\' already exists but the mode is different. "
2511 "(File system: %o %sMode: %o)",
211a3d87 2512 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2513 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2514
6cff72eb 2515 continue;
206e9864 2516 }
6cff72eb 2517 }
a1164ae3 2518 }
07689d5d 2519
206e9864 2520 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2521 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2522 * current UID/GID ownership.) */
2523 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2524 if (r < 0)
2525 goto fail;
c71b2eb7 2526
607b358e
LP
2527 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2528 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2529 * assignments to exist. */
607b358e 2530 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2531 if (r < 0)
3536f49e 2532 goto fail;
07689d5d
LP
2533 }
2534
211a3d87
LB
2535 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2536 * they are set up later, to allow configuring empty var/run/etc. */
2537 if (!needs_mount_namespace)
2538 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2539 r = create_many_symlinks(params->prefix[type],
2540 context->directories[type].items[i].path,
2541 context->directories[type].items[i].symlinks);
2542 if (r < 0)
2543 goto fail;
2544 }
2545
07689d5d 2546 return 0;
3536f49e
YW
2547
2548fail:
2549 *exit_status = exit_status_table[type];
3536f49e 2550 return r;
07689d5d
LP
2551}
2552
bb0c0d6f
LP
2553static int write_credential(
2554 int dfd,
2555 const char *id,
2556 const void *data,
2557 size_t size,
2558 uid_t uid,
2559 bool ownership_ok) {
2560
2561 _cleanup_(unlink_and_freep) char *tmp = NULL;
254d1313 2562 _cleanup_close_ int fd = -EBADF;
bb0c0d6f
LP
2563 int r;
2564
2565 r = tempfn_random_child("", "cred", &tmp);
2566 if (r < 0)
2567 return r;
2568
2569 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2570 if (fd < 0) {
2571 tmp = mfree(tmp);
2572 return -errno;
2573 }
2574
43144be4 2575 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2576 if (r < 0)
2577 return r;
2578
2579 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2580 return -errno;
2581
2582 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2583 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2584 if (r < 0) {
2585 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2586 return r;
2587
2588 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2589 * to express: that the user gets read access and nothing
2590 * else. But if the backing fs can't support that (e.g. ramfs)
2591 * then we can use file ownership instead. But that's only safe if
2592 * we can then re-mount the whole thing read-only, so that the
2593 * user can no longer chmod() the file to gain write access. */
2594 return r;
2595
f5fbe71d 2596 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2597 return -errno;
2598 }
2599 }
2600
2601 if (renameat(dfd, tmp, dfd, id) < 0)
2602 return -errno;
2603
2604 tmp = mfree(tmp);
2605 return 0;
2606}
2607
2ad591a3
LP
2608static char **credential_search_path(
2609 const ExecParameters *params,
2610 bool encrypted) {
2611
2612 _cleanup_strv_free_ char **l = NULL;
2613
2614 assert(params);
2615
2616 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2617 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2618 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2619
2620 if (encrypted) {
2621 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2622 return NULL;
2623
2624 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2625 return NULL;
2626 }
2627
2628 if (params->received_credentials_directory)
2629 if (strv_extend(&l, params->received_credentials_directory) < 0)
2630 return NULL;
2631
2632 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2633 return NULL;
2634
2635 if (DEBUG_LOGGING) {
2636 _cleanup_free_ char *t = strv_join(l, ":");
2637
2638 log_debug("Credential search path is: %s", t);
2639 }
2640
2641 return TAKE_PTR(l);
2642}
2643
3989bdc1
AB
2644static int load_credential(
2645 const ExecContext *context,
2646 const ExecParameters *params,
10b44e1d
LP
2647 const char *id,
2648 const char *path,
2649 bool encrypted,
3989bdc1
AB
2650 const char *unit,
2651 int read_dfd,
2652 int write_dfd,
2653 uid_t uid,
2654 bool ownership_ok,
2655 uint64_t *left) {
2656
3989bdc1 2657 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2ad591a3 2658 _cleanup_strv_free_ char **search_path = NULL;
3989bdc1 2659 _cleanup_(erase_and_freep) char *data = NULL;
2ad591a3
LP
2660 _cleanup_free_ char *bindname = NULL;
2661 const char *source = NULL;
3989bdc1 2662 bool missing_ok = true;
2ad591a3 2663 size_t size, add, maxsz;
3989bdc1
AB
2664 int r;
2665
10b44e1d
LP
2666 assert(context);
2667 assert(params);
2668 assert(id);
2669 assert(path);
2670 assert(unit);
661e4251 2671 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
10b44e1d
LP
2672 assert(write_dfd >= 0);
2673 assert(left);
2674
2ad591a3
LP
2675 if (read_dfd >= 0) {
2676 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2677 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2678 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2679 * open it. */
2680
2681 if (!filename_is_valid(path)) /* safety check */
2682 return -EINVAL;
2683
2684 missing_ok = true;
10b44e1d 2685 source = path;
2ad591a3
LP
2686
2687 } else if (path_is_absolute(path)) {
2688 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2689 * sockets */
2690
2691 if (!path_is_valid(path)) /* safety check */
2692 return -EINVAL;
2693
3989bdc1
AB
2694 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2695
2696 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2697 * via the source socket address in case we read off an AF_UNIX socket. */
10b44e1d 2698 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3989bdc1
AB
2699 return -ENOMEM;
2700
2701 missing_ok = false;
2ad591a3 2702 source = path;
3989bdc1 2703
2ad591a3
LP
2704 } else if (credential_name_valid(path)) {
2705 /* If this is a relative path, take it as credential name relative to the credentials
2706 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2707 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2708
2709 search_path = credential_search_path(params, encrypted);
2710 if (!search_path)
3989bdc1
AB
2711 return -ENOMEM;
2712
2ad591a3 2713 missing_ok = true;
3989bdc1
AB
2714 } else
2715 source = NULL;
2716
2ad591a3
LP
2717 if (encrypted)
2718 flags |= READ_FULL_FILE_UNBASE64;
2719
2720 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2721
2722 if (search_path) {
2723 STRV_FOREACH(d, search_path) {
2724 _cleanup_free_ char *j = NULL;
2725
2726 j = path_join(*d, path);
2727 if (!j)
2728 return -ENOMEM;
2729
2730 r = read_full_file_full(
2731 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2732 UINT64_MAX,
2733 maxsz,
2734 flags,
2735 NULL,
2736 &data, &size);
2737 if (r != -ENOENT)
2738 break;
2739 }
2740 } else if (source)
3989bdc1
AB
2741 r = read_full_file_full(
2742 read_dfd, source,
2743 UINT64_MAX,
2ad591a3
LP
2744 maxsz,
2745 flags,
3989bdc1
AB
2746 bindname,
2747 &data, &size);
2748 else
2749 r = -ENOENT;
2750
10b44e1d 2751 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3989bdc1
AB
2752 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2753 * will get clear errors if we don't pass such a missing credential on as they
2754 * themselves will get ENOENT when trying to read them, which should not be much
2755 * worse than when we handle the error here and make it fatal.
2756 *
2757 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2758 * we are fine, too. */
10b44e1d 2759 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3989bdc1
AB
2760 return 0;
2761 }
2762 if (r < 0)
10b44e1d 2763 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3989bdc1 2764
10b44e1d 2765 if (encrypted) {
3989bdc1
AB
2766 _cleanup_free_ void *plaintext = NULL;
2767 size_t plaintext_size = 0;
2768
6a0779cb 2769 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
3989bdc1
AB
2770 if (r < 0)
2771 return r;
2772
2773 free_and_replace(data, plaintext);
2774 size = plaintext_size;
2775 }
2776
10b44e1d 2777 add = strlen(id) + size;
3989bdc1
AB
2778 if (add > *left)
2779 return -E2BIG;
2780
10b44e1d 2781 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
3989bdc1 2782 if (r < 0)
94602bff 2783 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
3989bdc1
AB
2784
2785 *left -= add;
2786 return 0;
2787}
2788
2789struct load_cred_args {
3989bdc1
AB
2790 const ExecContext *context;
2791 const ExecParameters *params;
461345a1 2792 bool encrypted;
3989bdc1
AB
2793 const char *unit;
2794 int dfd;
2795 uid_t uid;
2796 bool ownership_ok;
2797 uint64_t *left;
2798};
2799
2800static int load_cred_recurse_dir_cb(
2801 RecurseDirEvent event,
2802 const char *path,
2803 int dir_fd,
2804 int inode_fd,
2805 const struct dirent *de,
2806 const struct statx *sx,
2807 void *userdata) {
2808
6394e5cd 2809 struct load_cred_args *args = ASSERT_PTR(userdata);
11348386 2810 _cleanup_free_ char *sub_id = NULL;
3989bdc1
AB
2811 int r;
2812
2813 if (event != RECURSE_DIR_ENTRY)
2814 return RECURSE_DIR_CONTINUE;
2815
2816 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2817 return RECURSE_DIR_CONTINUE;
2818
11348386 2819 sub_id = strreplace(path, "/", "_");
3989bdc1
AB
2820 if (!sub_id)
2821 return -ENOMEM;
2822
2823 if (!credential_name_valid(sub_id))
1451435c 2824 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3989bdc1 2825
5bec447a 2826 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3989bdc1
AB
2827 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2828 return RECURSE_DIR_CONTINUE;
2829 }
5bec447a
LP
2830 if (errno != ENOENT)
2831 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3989bdc1 2832
10b44e1d
LP
2833 r = load_credential(
2834 args->context,
2835 args->params,
2836 sub_id,
2837 de->d_name,
461345a1 2838 args->encrypted,
10b44e1d
LP
2839 args->unit,
2840 dir_fd,
2841 args->dfd,
2842 args->uid,
2843 args->ownership_ok,
2844 args->left);
3989bdc1
AB
2845 if (r < 0)
2846 return r;
2847
2848 return RECURSE_DIR_CONTINUE;
2849}
2850
bb0c0d6f
LP
2851static int acquire_credentials(
2852 const ExecContext *context,
2853 const ExecParameters *params,
d3dcf4e3 2854 const char *unit,
bb0c0d6f
LP
2855 const char *p,
2856 uid_t uid,
2857 bool ownership_ok) {
2858
43144be4 2859 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
254d1313 2860 _cleanup_close_ int dfd = -EBADF;
43144be4 2861 ExecLoadCredential *lc;
bb0c0d6f 2862 ExecSetCredential *sc;
bb0c0d6f
LP
2863 int r;
2864
2865 assert(context);
2866 assert(p);
2867
2868 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2869 if (dfd < 0)
2870 return -errno;
2871
43144be4
LP
2872 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2873 HASHMAP_FOREACH(lc, context->load_credentials) {
254d1313 2874 _cleanup_close_ int sub_fd = -EBADF;
d3dcf4e3 2875
f344f7fd
LP
2876 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2877 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2878 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2879 * propagate a credential passed to us from further up. */
43144be4 2880
f344f7fd
LP
2881 if (path_is_absolute(lc->path)) {
2882 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
1d68a2e1
LP
2883 if (sub_fd < 0 && !IN_SET(errno,
2884 ENOTDIR, /* Not a directory */
2885 ENOENT)) /* Doesn't exist? */
2886 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
f344f7fd 2887 }
43144be4 2888
61c5a49e 2889 if (sub_fd < 0)
f344f7fd 2890 /* Regular file (incl. a credential passed in from higher up) */
10b44e1d
LP
2891 r = load_credential(
2892 context,
2893 params,
2894 lc->id,
2895 lc->path,
2896 lc->encrypted,
2897 unit,
661e4251 2898 AT_FDCWD,
10b44e1d
LP
2899 dfd,
2900 uid,
2901 ownership_ok,
2902 &left);
61c5a49e 2903 else
10b44e1d 2904 /* Directory */
3989bdc1
AB
2905 r = recurse_dir(
2906 sub_fd,
11348386 2907 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3989bdc1
AB
2908 /* statx_mask= */ 0,
2909 /* n_depth_max= */ UINT_MAX,
9883cbb2 2910 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3989bdc1
AB
2911 load_cred_recurse_dir_cb,
2912 &(struct load_cred_args) {
3989bdc1
AB
2913 .context = context,
2914 .params = params,
461345a1 2915 .encrypted = lc->encrypted,
3989bdc1
AB
2916 .unit = unit,
2917 .dfd = dfd,
2918 .uid = uid,
2919 .ownership_ok = ownership_ok,
2920 .left = &left,
2921 });
61c5a49e
LP
2922 if (r < 0)
2923 return r;
bb0c0d6f
LP
2924 }
2925
9e6e9d61
LP
2926 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2927 * them, so that they can act as a "default" if the same credential is specified multiple times. */
43144be4
LP
2928 HASHMAP_FOREACH(sc, context->set_credentials) {
2929 _cleanup_(erase_and_freep) void *plaintext = NULL;
2930 const char *data;
2931 size_t size, add;
2932
9e6e9d61
LP
2933 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2934 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2935 * slow and involved, hence it's nice to be able to skip that if the credential already
2936 * exists anyway. */
43144be4
LP
2937 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2938 continue;
2939 if (errno != ENOENT)
2940 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2941
2942 if (sc->encrypted) {
6a0779cb 2943 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
43144be4
LP
2944 if (r < 0)
2945 return r;
2946
2947 data = plaintext;
2948 } else {
2949 data = sc->data;
2950 size = sc->size;
2951 }
2952
2953 add = strlen(sc->id) + size;
2954 if (add > left)
2955 return -E2BIG;
2956
2957 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2958 if (r < 0)
2959 return r;
2960
43144be4
LP
2961 left -= add;
2962 }
2963
bb0c0d6f
LP
2964 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2965 return -errno;
2966
2967 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2968 * accessible */
2969
2970 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2971 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2972 if (r < 0) {
2973 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2974 return r;
2975
2976 if (!ownership_ok)
2977 return r;
2978
f5fbe71d 2979 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2980 return -errno;
2981 }
2982 }
2983
2984 return 0;
2985}
2986
2987static int setup_credentials_internal(
2988 const ExecContext *context,
2989 const ExecParameters *params,
d3dcf4e3 2990 const char *unit,
bb0c0d6f
LP
2991 const char *final, /* This is where the credential store shall eventually end up at */
2992 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2993 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2994 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2995 uid_t uid) {
2996
2997 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2998 * if we mounted something; false if we definitely can't mount anything */
2999 bool final_mounted;
3000 const char *where;
3001
3002 assert(context);
3003 assert(final);
3004 assert(workspace);
3005
3006 if (reuse_workspace) {
3007 r = path_is_mount_point(workspace, NULL, 0);
3008 if (r < 0)
3009 return r;
3010 if (r > 0)
3011 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3012 else
3013 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3014 } else
3015 workspace_mounted = -1; /* ditto */
3016
3017 r = path_is_mount_point(final, NULL, 0);
3018 if (r < 0)
3019 return r;
3020 if (r > 0) {
3021 /* If the final place already has something mounted, we use that. If the workspace also has
3022 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3023 * different). */
3024 final_mounted = true;
3025
3026 if (workspace_mounted < 0) {
f0353cf2 3027 /* If the final place is mounted, but the workspace isn't, then let's bind mount
bb0c0d6f
LP
3028 * the final version to the workspace, and make it writable, so that we can make
3029 * changes */
3030
21935150
LP
3031 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3032 if (r < 0)
3033 return r;
bb0c0d6f 3034
21935150
LP
3035 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3036 if (r < 0)
3037 return r;
bb0c0d6f
LP
3038
3039 workspace_mounted = true;
3040 }
3041 } else
3042 final_mounted = false;
3043
3044 if (workspace_mounted < 0) {
3045 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3046 for (int try = 0;; try++) {
3047
3048 if (try == 0) {
3049 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
3050 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3051 if (r >= 0) {
bb0c0d6f
LP
3052 workspace_mounted = true;
3053 break;
3054 }
3055
3056 } else if (try == 1) {
3057 _cleanup_free_ char *opts = NULL;
3058
43144be4 3059 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
3060 return -ENOMEM;
3061
3062 /* Fall back to "tmpfs" otherwise */
21935150
LP
3063 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3064 if (r >= 0) {
bb0c0d6f
LP
3065 workspace_mounted = true;
3066 break;
3067 }
3068
3069 } else {
3070 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
3071 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3072 if (r < 0) {
3073 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3074 return r;
bb0c0d6f
LP
3075
3076 if (must_mount) /* If we it's not OK to use the plain directory
3077 * fallback, propagate all errors too */
21935150 3078 return r;
bb0c0d6f
LP
3079
3080 /* If we lack privileges to bind mount stuff, then let's gracefully
3081 * proceed for compat with container envs, and just use the final dir
3082 * as is. */
3083
3084 workspace_mounted = false;
3085 break;
3086 }
3087
3088 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
3089 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3090 if (r < 0)
3091 return r;
bb0c0d6f
LP
3092
3093 workspace_mounted = true;
3094 break;
3095 }
3096 }
3097 }
3098
3099 assert(!must_mount || workspace_mounted > 0);
3100 where = workspace_mounted ? workspace : final;
3101
03bc11d1 3102 (void) label_fix_full(AT_FDCWD, where, final, 0);
e3a0a862 3103
d3dcf4e3 3104 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
3105 if (r < 0)
3106 return r;
3107
3108 if (workspace_mounted) {
3109 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
3110 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3111 if (r < 0)
3112 return r;
bb0c0d6f
LP
3113
3114 /* And mount it to the final place, read-only */
21935150
LP
3115 if (final_mounted)
3116 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3117 else
3118 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3119 if (r < 0)
3120 return r;
bb0c0d6f
LP
3121 } else {
3122 _cleanup_free_ char *parent = NULL;
3123
3124 /* If we do not have our own mount put used the plain directory fallback, then we need to
3125 * open access to the top-level credential directory and the per-service directory now */
3126
45519d13
LP
3127 r = path_extract_directory(final, &parent);
3128 if (r < 0)
3129 return r;
bb0c0d6f
LP
3130 if (chmod(parent, 0755) < 0)
3131 return -errno;
3132 }
3133
3134 return 0;
3135}
3136
3137static int setup_credentials(
3138 const ExecContext *context,
3139 const ExecParameters *params,
3140 const char *unit,
3141 uid_t uid) {
3142
3143 _cleanup_free_ char *p = NULL, *q = NULL;
bb0c0d6f
LP
3144 int r;
3145
3146 assert(context);
3147 assert(params);
3148
3149 if (!exec_context_has_credentials(context))
3150 return 0;
3151
3152 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3153 return -EINVAL;
3154
3155 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3156 * and the subdir we mount over with a read-only file system readable by the service's user */
3157 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3158 if (!q)
3159 return -ENOMEM;
3160
3161 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3162 if (r < 0 && r != -EEXIST)
3163 return r;
3164
3165 p = path_join(q, unit);
3166 if (!p)
3167 return -ENOMEM;
3168
3169 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3170 if (r < 0 && r != -EEXIST)
3171 return r;
3172
3173 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3174 if (r < 0) {
3175 _cleanup_free_ char *t = NULL, *u = NULL;
3176
3177 /* If this is not a privilege or support issue then propagate the error */
3178 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3179 return r;
3180
3181 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3182 * it into place, so that users can't access half-initialized credential stores. */
3183 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3184 if (!t)
3185 return -ENOMEM;
3186
3187 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3188 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3189 * after it is fully set up */
3190 u = path_join(t, unit);
3191 if (!u)
3192 return -ENOMEM;
3193
3194 FOREACH_STRING(i, t, u) {
3195 r = mkdir_label(i, 0700);
3196 if (r < 0 && r != -EEXIST)
3197 return r;
3198 }
3199
3200 r = setup_credentials_internal(
3201 context,
3202 params,
d3dcf4e3 3203 unit,
bb0c0d6f
LP
3204 p, /* final mount point */
3205 u, /* temporary workspace to overmount */
3206 true, /* reuse the workspace if it is already a mount */
3207 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3208 uid);
3209
3210 (void) rmdir(u); /* remove the workspace again if we can. */
3211
3212 if (r < 0)
3213 return r;
3214
3215 } else if (r == 0) {
3216
3217 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3218 * we can use the same directory for all cases, after turning off propagation. Question
3219 * though is: where do we turn off propagation exactly, and where do we place the workspace
3220 * directory? We need some place that is guaranteed to be a mount point in the host, and
3221 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3222 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3223 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3224 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3225 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3226 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3227 * propagation on the former, and then overmount the latter.
3228 *
3229 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3230 * for this purpose, but there are few other candidates that work equally well for us, and
3231 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3232 * that no one else sees this should be OK to do. */
bb0c0d6f 3233
21935150
LP
3234 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3235 if (r < 0)
bb0c0d6f
LP
3236 goto child_fail;
3237
3238 r = setup_credentials_internal(
3239 context,
3240 params,
d3dcf4e3 3241 unit,
bb0c0d6f
LP
3242 p, /* final mount point */
3243 "/dev/shm", /* temporary workspace to overmount */
3244 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3245 true, /* insist that something is mounted, do not allow fallback to plain directory */
3246 uid);
3247 if (r < 0)
3248 goto child_fail;
3249
3250 _exit(EXIT_SUCCESS);
3251
3252 child_fail:
3253 _exit(EXIT_FAILURE);
3254 }
3255
3256 return 0;
3257}
3258
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * An explicit label from the exec context wins. Otherwise, if the manager has a default process label,
 * the SMACK_ATTR_EXEC label read from the executable is used, with the manager's default as fallback
 * when none was read. With neither configured nothing is done.
 *
 * Returns 0 on success, negative on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

        if (!manager->default_smack_process_label)
                return 0;

        /* A missing xattr on the executable is not an error, we then use the manager's default. */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                return r;

        r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
        if (r < 0)
                return r;

        return 0;
}
#endif
cefc33ae 3288
6c47cd7d
LP
3289static int compile_bind_mounts(
3290 const ExecContext *context,
3291 const ExecParameters *params,
3292 BindMount **ret_bind_mounts,
da6053d0 3293 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3294 char ***ret_empty_directories) {
3295
3296 _cleanup_strv_free_ char **empty_directories = NULL;
3297 BindMount *bind_mounts;
5b10116e 3298 size_t n, h = 0;
6c47cd7d
LP
3299 int r;
3300
3301 assert(context);
3302 assert(params);
3303 assert(ret_bind_mounts);
3304 assert(ret_n_bind_mounts);
3305 assert(ret_empty_directories);
3306
3307 n = context->n_bind_mounts;
5b10116e 3308 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3309 if (!params->prefix[t])
3310 continue;
3311
a2ab603c
YW
3312 for (size_t i = 0; i < context->directories[t].n_items; i++)
3313 n += !context->directories[t].items[i].only_create;
6c47cd7d
LP
3314 }
3315
3316 if (n <= 0) {
3317 *ret_bind_mounts = NULL;
3318 *ret_n_bind_mounts = 0;
3319 *ret_empty_directories = NULL;
3320 return 0;
3321 }
3322
3323 bind_mounts = new(BindMount, n);
3324 if (!bind_mounts)
3325 return -ENOMEM;
3326
5b10116e 3327 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3328 BindMount *item = context->bind_mounts + i;
3329 char *s, *d;
3330
3331 s = strdup(item->source);
3332 if (!s) {
3333 r = -ENOMEM;
3334 goto finish;
3335 }
3336
3337 d = strdup(item->destination);
3338 if (!d) {
3339 free(s);
3340 r = -ENOMEM;
3341 goto finish;
3342 }
3343
3344 bind_mounts[h++] = (BindMount) {
3345 .source = s,
3346 .destination = d,
3347 .read_only = item->read_only,
3348 .recursive = item->recursive,
3349 .ignore_enoent = item->ignore_enoent,
3350 };
3351 }
3352
5b10116e 3353 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3354 if (!params->prefix[t])
3355 continue;
3356
211a3d87 3357 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3358 continue;
3359
494d0247 3360 if (exec_directory_is_private(context, t) &&
74e12520 3361 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3362 char *private_root;
3363
3364 /* So this is for a dynamic user, and we need to make sure the process can access its own
3365 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3366 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3367
657ee2d8 3368 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3369 if (!private_root) {
3370 r = -ENOMEM;
3371 goto finish;
3372 }
3373
3374 r = strv_consume(&empty_directories, private_root);
a635a7ae 3375 if (r < 0)
6c47cd7d 3376 goto finish;
6c47cd7d
LP
3377 }
3378
211a3d87 3379 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3380 char *s, *d;
3381
a2ab603c
YW
3382 /* When one of the parent directories is in the list, we cannot create the symlink
3383 * for the child directory. See also the comments in setup_exec_directory(). */
3384 if (context->directories[t].items[i].only_create)
3385 continue;
3386
494d0247 3387 if (exec_directory_is_private(context, t))
211a3d87 3388 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3389 else
211a3d87 3390 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3391 if (!s) {
3392 r = -ENOMEM;
3393 goto finish;
3394 }
3395
494d0247 3396 if (exec_directory_is_private(context, t) &&
74e12520 3397 exec_context_with_rootfs(context))
5609f688
YW
3398 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3399 * directory is not created on the root directory. So, let's bind-mount the directory
3400 * on the 'non-private' place. */
211a3d87 3401 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3402 else
3403 d = strdup(s);
6c47cd7d
LP
3404 if (!d) {
3405 free(s);
3406 r = -ENOMEM;
3407 goto finish;
3408 }
3409
3410 bind_mounts[h++] = (BindMount) {
3411 .source = s,
3412 .destination = d,
3413 .read_only = false,
9ce4e4b0 3414 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3415 .recursive = true,
3416 .ignore_enoent = false,
3417 };
3418 }
3419 }
3420
3421 assert(h == n);
3422
3423 *ret_bind_mounts = bind_mounts;
3424 *ret_n_bind_mounts = n;
ae2a15bc 3425 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3426
3427 return (int) n;
3428
3429finish:
3430 bind_mount_free_many(bind_mounts, h);
3431 return r;
3432}
3433
df61e79a
LB
3434/* ret_symlinks will contain a list of pairs src:dest that describes
3435 * the symlinks to create later on. For example, the symlinks needed
3436 * to safely give private directories to DynamicUser=1 users. */
3437static int compile_symlinks(
3438 const ExecContext *context,
3439 const ExecParameters *params,
3440 char ***ret_symlinks) {
3441
3442 _cleanup_strv_free_ char **symlinks = NULL;
3443 int r;
3444
3445 assert(context);
3446 assert(params);
3447 assert(ret_symlinks);
3448
3449 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3450 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3451 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 3452
211a3d87
LB
3453 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3454 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3455
211a3d87
LB
3456 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3457 dst_abs = path_join(params->prefix[dt], *symlink);
3458 if (!src_abs || !dst_abs)
3459 return -ENOMEM;
df61e79a 3460
211a3d87
LB
3461 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3462 if (r < 0)
3463 return r;
3464 }
3465
a2ab603c
YW
3466 if (!exec_directory_is_private(context, dt) ||
3467 exec_context_with_rootfs(context) ||
3468 context->directories[dt].items[i].only_create)
211a3d87
LB
3469 continue;
3470
3471 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3472 if (!private_path)
3473 return -ENOMEM;
3474
211a3d87 3475 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3476 if (!path)
3477 return -ENOMEM;
3478
3479 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3480 if (r < 0)
3481 return r;
3482 }
3483 }
3484
3485 *ret_symlinks = TAKE_PTR(symlinks);
3486
3487 return 0;
3488}
3489
4e677599
LP
3490static bool insist_on_sandboxing(
3491 const ExecContext *context,
3492 const char *root_dir,
3493 const char *root_image,
3494 const BindMount *bind_mounts,
3495 size_t n_bind_mounts) {
3496
4e677599
LP
3497 assert(context);
3498 assert(n_bind_mounts == 0 || bind_mounts);
3499
3500 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3501 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3502 * rearrange stuff in a way we cannot ignore gracefully. */
3503
3504 if (context->n_temporary_filesystems > 0)
3505 return true;
3506
3507 if (root_dir || root_image)
3508 return true;
3509
b3d13314
LB
3510 if (context->n_mount_images > 0)
3511 return true;
3512
4e677599
LP
3513 if (context->dynamic_user)
3514 return true;
3515
4355c04f
LB
3516 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3517 return true;
3518
4e677599
LP
3519 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3520 * essential. */
5b10116e 3521 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3522 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3523 return true;
3524
91dd5f7c
LP
3525 if (context->log_namespace)
3526 return true;
3527
4e677599
LP
3528 return false;
3529}
3530
6818c54c 3531static int apply_mount_namespace(
34cf6c43 3532 const Unit *u,
9f71ba8d 3533 ExecCommandFlags command_flags,
6818c54c
LP
3534 const ExecContext *context,
3535 const ExecParameters *params,
7cc5ef5f
ZJS
3536 const ExecRuntime *runtime,
3537 char **error_path) {
6818c54c 3538
df61e79a 3539 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3540 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3541 const char *root_dir = NULL, *root_image = NULL;
24759d8f
LB
3542 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3543 *extension_dir = NULL;
228af36f 3544 NamespaceInfo ns_info;
165a31c0 3545 bool needs_sandboxing;
6c47cd7d 3546 BindMount *bind_mounts = NULL;
da6053d0 3547 size_t n_bind_mounts = 0;
6818c54c 3548 int r;
93c6bb51 3549
2b3c1b9e
DH
3550 assert(context);
3551
915e6d16
LP
3552 if (params->flags & EXEC_APPLY_CHROOT) {
3553 root_image = context->root_image;
3554
3555 if (!root_image)
3556 root_dir = context->root_directory;
3557 }
93c6bb51 3558
6c47cd7d
LP
3559 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3560 if (r < 0)
3561 return r;
3562
211a3d87 3563 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3564 r = compile_symlinks(context, params, &symlinks);
3565 if (r < 0)
41abd7f6 3566 goto finalize;
df61e79a 3567
9f71ba8d 3568 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3569 if (needs_sandboxing) {
3570 /* The runtime struct only contains the parent of the private /tmp,
3571 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3572 * that is sticky, and that's the one we want to use here.
3573 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3574
3575 if (context->private_tmp && runtime) {
56a13a49
ZJS
3576 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3577 tmp_dir = runtime->tmp_dir;
3578 else if (runtime->tmp_dir)
3579 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3580
3581 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3582 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3583 else if (runtime->var_tmp_dir)
56a13a49 3584 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3585 }
3586
b5a33299
YW
3587 ns_info = (NamespaceInfo) {
3588 .ignore_protect_paths = false,
3589 .private_dev = context->private_devices,
3590 .protect_control_groups = context->protect_control_groups,
3591 .protect_kernel_tunables = context->protect_kernel_tunables,
3592 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3593 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3594 .protect_hostname = context->protect_hostname,
5e98086d 3595 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3596 .private_mounts = context->private_mounts,
52b3d652
LP
3597 .protect_home = context->protect_home,
3598 .protect_system = context->protect_system,
4e399953
LP
3599 .protect_proc = context->protect_proc,
3600 .proc_subset = context->proc_subset,
80271a44 3601 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3602 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3603 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3604 };
ecf63c91 3605 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3606 /*
3607 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3608 * sandbox info, otherwise enforce it, don't ignore protected paths and
3609 * fail if we are enable to apply the sandbox inside the mount namespace.
3610 */
3611 ns_info = (NamespaceInfo) {
3612 .ignore_protect_paths = true,
3613 };
3614 else
3615 ns_info = (NamespaceInfo) {};
b5a33299 3616
37ed15d7
FB
3617 if (context->mount_flags == MS_SHARED)
3618 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3619
a631cbfa
LP
3620 if (exec_context_has_credentials(context) &&
3621 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3622 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3623 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3624 if (!creds_path) {
3625 r = -ENOMEM;
3626 goto finalize;
3627 }
bbb4e7f3
LP
3628 }
3629
5e8deb94
LB
3630 if (MANAGER_IS_SYSTEM(u->manager)) {
3631 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3632 if (!propagate_dir) {
3633 r = -ENOMEM;
3634 goto finalize;
3635 }
3636
5e8deb94 3637 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3638 if (!incoming_dir) {
3639 r = -ENOMEM;
3640 goto finalize;
3641 }
24759d8f
LB
3642
3643 extension_dir = strdup("/run/systemd/unit-extensions");
3644 if (!extension_dir) {
3645 r = -ENOMEM;
3646 goto finalize;
3647 }
3648 } else
3649 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3650 r = -ENOMEM;
3651 goto finalize;
3652 }
5e8deb94 3653
18d73705 3654 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3655 &ns_info, context->read_write_paths,
165a31c0
LP
3656 needs_sandboxing ? context->read_only_paths : NULL,
3657 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3658 needs_sandboxing ? context->exec_paths : NULL,
3659 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3660 empty_directories,
df61e79a 3661 symlinks,
6c47cd7d
LP
3662 bind_mounts,
3663 n_bind_mounts,
2abd4e38
YW
3664 context->temporary_filesystems,
3665 context->n_temporary_filesystems,
b3d13314
LB
3666 context->mount_images,
3667 context->n_mount_images,
56a13a49
ZJS
3668 tmp_dir,
3669 var_tmp_dir,
bbb4e7f3 3670 creds_path,
91dd5f7c 3671 context->log_namespace,
915e6d16 3672 context->mount_flags,
d4d55b0d
LB
3673 context->root_hash, context->root_hash_size, context->root_hash_path,
3674 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3675 context->root_verity,
93f59701
LB
3676 context->extension_images,
3677 context->n_extension_images,
a07b9926 3678 context->extension_directories,
5e8deb94
LB
3679 propagate_dir,
3680 incoming_dir,
24759d8f 3681 extension_dir,
3bdc25a4 3682 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3683 error_path);
93c6bb51 3684
1beab8b0 3685 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3686 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3687 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3688 * completely different execution environment. */
aca835ed 3689 if (r == -ENOANO) {
4e677599
LP
3690 if (insist_on_sandboxing(
3691 context,
3692 root_dir, root_image,
3693 bind_mounts,
3694 n_bind_mounts)) {
3695 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3696 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3697 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3698
3699 r = -EOPNOTSUPP;
3700 } else {
aca835ed 3701 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3702 r = 0;
aca835ed 3703 }
93c6bb51
DH
3704 }
3705
8062e643 3706finalize:
4e677599 3707 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3708 return r;
3709}
3710
915e6d16
LP
3711static int apply_working_directory(
3712 const ExecContext *context,
3713 const ExecParameters *params,
3714 const char *home,
376fecf6 3715 int *exit_status) {
915e6d16 3716
6732edab 3717 const char *d, *wd;
2b3c1b9e
DH
3718
3719 assert(context);
376fecf6 3720 assert(exit_status);
2b3c1b9e 3721
6732edab
LP
3722 if (context->working_directory_home) {
3723
376fecf6
LP
3724 if (!home) {
3725 *exit_status = EXIT_CHDIR;
6732edab 3726 return -ENXIO;
376fecf6 3727 }
6732edab 3728
2b3c1b9e 3729 wd = home;
6732edab 3730
14eb3285
LP
3731 } else
3732 wd = empty_to_root(context->working_directory);
e7f1e7c6 3733
fa97f630 3734 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3735 d = wd;
fa97f630 3736 else
3b0e5bb5 3737 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3738
376fecf6
LP
3739 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3740 *exit_status = EXIT_CHDIR;
2b3c1b9e 3741 return -errno;
376fecf6 3742 }
e7f1e7c6
DH
3743
3744 return 0;
3745}
3746
fa97f630
JB
3747static int apply_root_directory(
3748 const ExecContext *context,
3749 const ExecParameters *params,
3750 const bool needs_mount_ns,
3751 int *exit_status) {
3752
3753 assert(context);
3754 assert(exit_status);
3755
5b10116e 3756 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3757 if (!needs_mount_ns && context->root_directory)
3758 if (chroot(context->root_directory) < 0) {
3759 *exit_status = EXIT_CHROOT;
3760 return -errno;
3761 }
fa97f630
JB
3762
3763 return 0;
3764}
3765
b1edf445 3766static int setup_keyring(
34cf6c43 3767 const Unit *u,
b1edf445
LP
3768 const ExecContext *context,
3769 const ExecParameters *p,
3770 uid_t uid, gid_t gid) {
3771
74dd6b51 3772 key_serial_t keyring;
e64c2d0b
DJL
3773 int r = 0;
3774 uid_t saved_uid;
3775 gid_t saved_gid;
74dd6b51
LP
3776
3777 assert(u);
b1edf445 3778 assert(context);
74dd6b51
LP
3779 assert(p);
3780
3781 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3782 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3783 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3784 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3785 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3786 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3787
b1edf445
LP
3788 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3789 return 0;
3790
e64c2d0b
DJL
3791 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3792 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3793 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3794 * & group is just as nasty as acquiring a reference to the user keyring. */
3795
3796 saved_uid = getuid();
3797 saved_gid = getgid();
3798
3799 if (gid_is_valid(gid) && gid != saved_gid) {
3800 if (setregid(gid, -1) < 0)
3801 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3802 }
3803
3804 if (uid_is_valid(uid) && uid != saved_uid) {
3805 if (setreuid(uid, -1) < 0) {
3806 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3807 goto out;
3808 }
3809 }
3810
74dd6b51
LP
3811 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3812 if (keyring == -1) {
3813 if (errno == ENOSYS)
8002fb97 3814 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3815 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3816 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3817 else if (errno == EDQUOT)
8002fb97 3818 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3819 else
e64c2d0b 3820 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3821
e64c2d0b 3822 goto out;
74dd6b51
LP
3823 }
3824
e64c2d0b
DJL
3825 /* When requested link the user keyring into the session keyring. */
3826 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3827
3828 if (keyctl(KEYCTL_LINK,
3829 KEY_SPEC_USER_KEYRING,
3830 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3831 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3832 goto out;
3833 }
3834 }
3835
3836 /* Restore uid/gid back */
3837 if (uid_is_valid(uid) && uid != saved_uid) {
3838 if (setreuid(saved_uid, -1) < 0) {
3839 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3840 goto out;
3841 }
3842 }
3843
3844 if (gid_is_valid(gid) && gid != saved_gid) {
3845 if (setregid(saved_gid, -1) < 0)
3846 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3847 }
3848
3849 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3850 if (!sd_id128_is_null(u->invocation_id)) {
3851 key_serial_t key;
3852
3853 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3854 if (key == -1)
8002fb97 3855 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3856 else {
3857 if (keyctl(KEYCTL_SETPERM, key,
3858 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3859 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3860 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3861 }
3862 }
3863
e64c2d0b 3864out:
37b22b3b 3865 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3866 /* no extra logging, as only the first already reported error matters */
3867 if (getuid() != saved_uid)
3868 (void) setreuid(saved_uid, -1);
b1edf445 3869
e64c2d0b
DJL
3870 if (getgid() != saved_gid)
3871 (void) setregid(saved_gid, -1);
b1edf445 3872
e64c2d0b 3873 return r;
74dd6b51
LP
3874}
3875
3042bbeb 3876static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
29206d46
LP
3877 assert(array);
3878 assert(n);
2caa38e9 3879 assert(pair);
29206d46
LP
3880
3881 if (pair[0] >= 0)
3882 array[(*n)++] = pair[0];
3883 if (pair[1] >= 0)
3884 array[(*n)++] = pair[1];
3885}
3886
a34ceba6
LP
3887static int close_remaining_fds(
3888 const ExecParameters *params,
34cf6c43
YW
3889 const ExecRuntime *runtime,
3890 const DynamicCreds *dcreds,
00d9ef85 3891 int user_lookup_fd,
a34ceba6 3892 int socket_fd,
5b8d1f6b 3893 const int *fds, size_t n_fds) {
a34ceba6 3894
da6053d0 3895 size_t n_dont_close = 0;
00d9ef85 3896 int dont_close[n_fds + 12];
a34ceba6
LP
3897
3898 assert(params);
3899
3900 if (params->stdin_fd >= 0)
3901 dont_close[n_dont_close++] = params->stdin_fd;
3902 if (params->stdout_fd >= 0)
3903 dont_close[n_dont_close++] = params->stdout_fd;
3904 if (params->stderr_fd >= 0)
3905 dont_close[n_dont_close++] = params->stderr_fd;
3906
3907 if (socket_fd >= 0)
3908 dont_close[n_dont_close++] = socket_fd;
3909 if (n_fds > 0) {
3910 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3911 n_dont_close += n_fds;
3912 }
3913
a70581ff 3914 if (runtime) {
29206d46 3915 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3916 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3917 }
29206d46
LP
3918
3919 if (dcreds) {
3920 if (dcreds->user)
3921 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3922 if (dcreds->group)
3923 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3924 }
3925
00d9ef85
LP
3926 if (user_lookup_fd >= 0)
3927 dont_close[n_dont_close++] = user_lookup_fd;
3928
a34ceba6
LP
3929 return close_all_fds(dont_close, n_dont_close);
3930}
3931
00d9ef85
LP
3932static int send_user_lookup(
3933 Unit *unit,
3934 int user_lookup_fd,
3935 uid_t uid,
3936 gid_t gid) {
3937
3938 assert(unit);
3939
3940 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3941 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3942 * specified. */
3943
3944 if (user_lookup_fd < 0)
3945 return 0;
3946
3947 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3948 return 0;
3949
3950 if (writev(user_lookup_fd,
3951 (struct iovec[]) {
e6a7ec4b
LP
3952 IOVEC_INIT(&uid, sizeof(uid)),
3953 IOVEC_INIT(&gid, sizeof(gid)),
3954 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3955 return -errno;
3956
3957 return 0;
3958}
3959
6732edab
LP
3960static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3961 int r;
3962
3963 assert(c);
3964 assert(home);
3965 assert(buf);
3966
3967 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3968
3969 if (*home)
3970 return 0;
3971
3972 if (!c->working_directory_home)
3973 return 0;
3974
6732edab
LP
3975 r = get_home_dir(buf);
3976 if (r < 0)
3977 return r;
3978
3979 *home = *buf;
3980 return 1;
3981}
3982
da50b85a
LP
3983static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3984 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3985 int r;
3986
3987 assert(c);
3988 assert(p);
3989 assert(ret);
3990
3991 assert(c->dynamic_user);
3992
3993 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3994 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3995 * directories. */
3996
5b10116e 3997 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3998 if (t == EXEC_DIRECTORY_CONFIGURATION)
3999 continue;
4000
4001 if (!p->prefix[t])
4002 continue;
4003
211a3d87 4004 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
4005 char *e;
4006
494d0247 4007 if (exec_directory_is_private(c, t))
211a3d87 4008 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 4009 else
211a3d87 4010 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
4011 if (!e)
4012 return -ENOMEM;
4013
4014 r = strv_consume(&list, e);
4015 if (r < 0)
4016 return r;
4017 }
4018 }
4019
ae2a15bc 4020 *ret = TAKE_PTR(list);
da50b85a
LP
4021
4022 return 0;
4023}
4024
78f93209
LP
4025static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4026 bool using_subcgroup;
4027 char *p;
4028
4029 assert(params);
4030 assert(ret);
4031
4032 if (!params->cgroup_path)
4033 return -EINVAL;
4034
4035 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4036 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4037 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4038 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4039 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4040 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4041 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4042 * flag, which is only passed for the former statements, not for the latter. */
4043
4044 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4045 if (using_subcgroup)
657ee2d8 4046 p = path_join(params->cgroup_path, ".control");
78f93209
LP
4047 else
4048 p = strdup(params->cgroup_path);
4049 if (!p)
4050 return -ENOMEM;
4051
4052 *ret = p;
4053 return using_subcgroup;
4054}
4055
e2b2fb7f
MS
4056static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4057 _cleanup_(cpu_set_reset) CPUSet s = {};
4058 int r;
4059
4060 assert(c);
4061 assert(ret);
4062
4063 if (!c->numa_policy.nodes.set) {
4064 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4065 return 0;
4066 }
4067
4068 r = numa_to_cpu_set(&c->numa_policy, &s);
4069 if (r < 0)
4070 return r;
4071
4072 cpu_set_reset(ret);
4073
4074 return cpu_set_add_all(ret, &s);
4075}
4076
4077bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4078 assert(c);
4079
4080 return c->cpu_affinity_from_numa;
4081}
4082
1da37e58
ZJS
4083static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4084 int r;
4085
4086 assert(fds);
4087 assert(n_fds);
4088 assert(*n_fds < fds_size);
4089 assert(ret_fd);
4090
4091 if (fd < 0) {
254d1313 4092 *ret_fd = -EBADF;
1da37e58
ZJS
4093 return 0;
4094 }
4095
4096 if (fd < 3 + (int) *n_fds) {
4097 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4098 * the fds we pass to the process (or which are closed only during execve). */
4099
4100 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4101 if (r < 0)
4102 return -errno;
4103
ee3455cf 4104 close_and_replace(fd, r);
1da37e58
ZJS
4105 }
4106
4107 *ret_fd = fds[*n_fds] = fd;
4108 (*n_fds) ++;
4109 return 1;
4110}
4111
cd48e23f
RP
4112static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4113 union sockaddr_union addr = {
4114 .un.sun_family = AF_UNIX,
4115 };
4116 socklen_t sa_len;
4117 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4118 int r;
4119
4120 assert(u);
4121 assert(of);
4122 assert(ofd >= 0);
4123
4124 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4125 if (r < 0)
4126 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4127
4128 sa_len = r;
4129
4130 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4131 _cleanup_close_ int fd = -EBADF;
4132
4133 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4134 if (fd < 0)
4135 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4136
4137 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4138 if (r == -EPROTOTYPE)
4139 continue;
4140 if (r < 0)
4141 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4142
4143 return TAKE_FD(fd);
4144 }
4145
4146 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4147}
4148
4149static int get_open_file_fd(Unit *u, const OpenFile *of) {
4150 struct stat st;
4151 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4152
4153 assert(u);
4154 assert(of);
4155
4156 ofd = open(of->path, O_PATH | O_CLOEXEC);
4157 if (ofd < 0)
4158 return log_error_errno(errno, "Could not open \"%s\": %m", of->path);
4159 if (fstat(ofd, &st) < 0)
4160 return log_error_errno(errno, "Failed to stat %s: %m", of->path);
4161
4162 if (S_ISSOCK(st.st_mode)) {
4163 fd = connect_unix_harder(u, of, ofd);
4164 if (fd < 0)
4165 return fd;
4166
4167 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4168 return log_error_errno(errno, "Failed to shutdown send for socket %s: %m", of->path);
4169
4170 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4171 } else {
4172 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4173 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4174 flags |= O_APPEND;
4175 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4176 flags |= O_TRUNC;
4177
4178 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4179 if (fd < 0)
4180 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4181
4182 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4183 }
4184
4185 return TAKE_FD(fd);
4186}
4187
4188static int collect_open_file_fds(
4189 Unit *u,
4190 OpenFile* open_files,
4191 int **fds,
4192 char ***fdnames,
4193 size_t *n_fds) {
4194 int r;
4195
4196 assert(u);
4197 assert(fds);
4198 assert(fdnames);
4199 assert(n_fds);
4200
4201 LIST_FOREACH(open_files, of, open_files) {
4202 _cleanup_close_ int fd = -EBADF;
4203
4204 fd = get_open_file_fd(u, of);
4205 if (fd < 0) {
4206 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4207 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4208 continue;
4209 }
4210
4211 return fd;
4212 }
4213
4214 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4215 return -ENOMEM;
4216
4217 r = strv_extend(fdnames, of->fdname);
4218 if (r < 0)
4219 return r;
4220
4221 (*fds)[*n_fds] = TAKE_FD(fd);
4222
4223 (*n_fds)++;
4224 }
4225
4226 return 0;
4227}
4228
ff0af2a1 4229static int exec_child(
f2341e0a 4230 Unit *unit,
34cf6c43 4231 const ExecCommand *command,
ff0af2a1
LP
4232 const ExecContext *context,
4233 const ExecParameters *params,
4234 ExecRuntime *runtime,
29206d46 4235 DynamicCreds *dcreds,
ff0af2a1 4236 int socket_fd,
2caa38e9 4237 const int named_iofds[static 3],
cd48e23f 4238 int *params_fds,
da6053d0 4239 size_t n_socket_fds,
25b583d7 4240 size_t n_storage_fds,
ff0af2a1 4241 char **files_env,
00d9ef85 4242 int user_lookup_fd,
12145637 4243 int *exit_status) {
d35fbf6b 4244
8c35c10d 4245 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 4246 int r, ngids = 0, exec_fd;
4d885bd3
DH
4247 _cleanup_free_ gid_t *supplementary_gids = NULL;
4248 const char *username = NULL, *groupname = NULL;
5686391b 4249 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 4250 const char *home = NULL, *shell = NULL;
7ca69792 4251 char **final_argv = NULL;
7bce046b
LP
4252 dev_t journal_stream_dev = 0;
4253 ino_t journal_stream_ino = 0;
5749f855 4254 bool userns_set_up = false;
165a31c0
LP
4255 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4256 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4257 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4258 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 4259#if HAVE_SELINUX
7f59dd35 4260 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 4261 bool use_selinux = false;
ecfbc84f 4262#endif
f9fa32f0 4263#if ENABLE_SMACK
43b1f709 4264 bool use_smack = false;
ecfbc84f 4265#endif
349cc4a5 4266#if HAVE_APPARMOR
43b1f709 4267 bool use_apparmor = false;
ecfbc84f 4268#endif
5749f855
AZ
4269 uid_t saved_uid = getuid();
4270 gid_t saved_gid = getgid();
fed1e721
LP
4271 uid_t uid = UID_INVALID;
4272 gid_t gid = GID_INVALID;
1da37e58
ZJS
4273 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4274 n_keep_fds; /* total number of fds not to close */
165a31c0 4275 int secure_bits;
afb11bf1
DG
4276 _cleanup_free_ gid_t *gids_after_pam = NULL;
4277 int ngids_after_pam = 0;
cd48e23f
RP
4278 _cleanup_free_ int *fds = NULL;
4279 _cleanup_strv_free_ char **fdnames = NULL;
034c6ed7 4280
f2341e0a 4281 assert(unit);
5cb5a6ff
LP
4282 assert(command);
4283 assert(context);
d35fbf6b 4284 assert(params);
ff0af2a1 4285 assert(exit_status);
d35fbf6b 4286
69339ae9
LP
4287 /* Explicitly test for CVE-2021-4034 inspired invocations */
4288 assert(command->path);
4289 assert(!strv_isempty(command->argv));
4290
d35fbf6b
DM
4291 rename_process_from_path(command->path);
4292
9c274488
LP
4293 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4294 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4295 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4296 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4297 SIGNALS_IGNORE);
d35fbf6b
DM
4298
4299 if (context->ignore_sigpipe)
9c274488 4300 (void) ignore_signals(SIGPIPE);
d35fbf6b 4301
ff0af2a1
LP
4302 r = reset_signal_mask();
4303 if (r < 0) {
4304 *exit_status = EXIT_SIGNAL_MASK;
12145637 4305 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4306 }
034c6ed7 4307
d35fbf6b
DM
4308 if (params->idle_pipe)
4309 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4310
2c027c62
LP
4311 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4312 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4313 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4314 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4315
d35fbf6b 4316 log_forget_fds();
2c027c62 4317 log_set_open_when_needed(true);
4f2d528d 4318
40a80078
LP
4319 /* In case anything used libc syslog(), close this here, too */
4320 closelog();
4321
cd48e23f
RP
4322 fds = newdup(int, params_fds, n_fds);
4323 if (!fds) {
4324 *exit_status = EXIT_MEMORY;
4325 return log_oom();
4326 }
4327
4328 fdnames = strv_copy((char**) params->fd_names);
4329 if (!fdnames) {
4330 *exit_status = EXIT_MEMORY;
4331 return log_oom();
4332 }
4333
4334 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4335 if (r < 0) {
4336 *exit_status = EXIT_FDS;
4337 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4338 }
4339
b1994387 4340 int keep_fds[n_fds + 3];
1da37e58
ZJS
4341 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4342 n_keep_fds = n_fds;
4343
4344 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4345 if (r < 0) {
4346 *exit_status = EXIT_FDS;
4347 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4348 }
4349
b1994387 4350#if HAVE_LIBBPF
46004616
ZJS
4351 if (unit->manager->restrict_fs) {
4352 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4353 if (bpf_map_fd < 0) {
4354 *exit_status = EXIT_FDS;
46004616 4355 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4356 }
4357
4358 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4359 if (r < 0) {
4360 *exit_status = EXIT_FDS;
4361 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4362 }
4363 }
4364#endif
4365
1da37e58 4366 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4367 if (r < 0) {
4368 *exit_status = EXIT_FDS;
12145637 4369 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4370 }
4371
0af07108
ZJS
4372 if (!context->same_pgrp &&
4373 setsid() < 0) {
4374 *exit_status = EXIT_SETSID;
4375 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4376 }
9e2f7c11 4377
1e22b5cd 4378 exec_context_tty_reset(context, params);
d35fbf6b 4379
c891efaf 4380 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4381 _cleanup_free_ char *cmdline = NULL;
4382
4ef15008 4383 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4384 if (!cmdline) {
0460aa5c 4385 *exit_status = EXIT_MEMORY;
12145637 4386 return log_oom();
3b20f877 4387 }
d35fbf6b 4388
4ef15008 4389 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4390 if (r != CONFIRM_EXECUTE) {
4391 if (r == CONFIRM_PRETEND_SUCCESS) {
4392 *exit_status = EXIT_SUCCESS;
4393 return 0;
4394 }
ff0af2a1 4395 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4396 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4397 "Execution cancelled by the user");
d35fbf6b
DM
4398 }
4399 }
1a63a750 4400
d521916d
LP
4401 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4402 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4403 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4404 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4405 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4406 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4407 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4408 *exit_status = EXIT_MEMORY;
4409 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4410 }
4411
29206d46 4412 if (context->dynamic_user && dcreds) {
da50b85a 4413 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4414
d521916d 4415 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4416 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4417 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4418 *exit_status = EXIT_USER;
12145637 4419 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4420 }
4421
da50b85a
LP
4422 r = compile_suggested_paths(context, params, &suggested_paths);
4423 if (r < 0) {
4424 *exit_status = EXIT_MEMORY;
4425 return log_oom();
4426 }
4427
4428 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4429 if (r < 0) {
4430 *exit_status = EXIT_USER;
d85ff944
YW
4431 if (r == -EILSEQ)
4432 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4433 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4434 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4435 }
524daa8c 4436
70dd455c 4437 if (!uid_is_valid(uid)) {
29206d46 4438 *exit_status = EXIT_USER;
d85ff944 4439 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4440 }
4441
4442 if (!gid_is_valid(gid)) {
4443 *exit_status = EXIT_USER;
d85ff944 4444 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4445 }
5bc7452b 4446
29206d46
LP
4447 if (dcreds->user)
4448 username = dcreds->user->name;
4449
4450 } else {
4d885bd3
DH
4451 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4452 if (r < 0) {
4453 *exit_status = EXIT_USER;
12145637 4454 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4455 }
5bc7452b 4456
4d885bd3
DH
4457 r = get_fixed_group(context, &groupname, &gid);
4458 if (r < 0) {
4459 *exit_status = EXIT_GROUP;
12145637 4460 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4461 }
cdc5d5c5 4462 }
29206d46 4463
cdc5d5c5
DH
4464 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4465 r = get_supplementary_groups(context, username, groupname, gid,
4466 &supplementary_gids, &ngids);
4467 if (r < 0) {
4468 *exit_status = EXIT_GROUP;
12145637 4469 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4470 }
5bc7452b 4471
00d9ef85
LP
4472 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4473 if (r < 0) {
4474 *exit_status = EXIT_USER;
12145637 4475 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4476 }
4477
4478 user_lookup_fd = safe_close(user_lookup_fd);
4479
6732edab
LP
4480 r = acquire_home(context, uid, &home, &home_buffer);
4481 if (r < 0) {
4482 *exit_status = EXIT_CHDIR;
12145637 4483 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4484 }
4485
d35fbf6b
DM
4486 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4487 * must be sure to drop O_NONBLOCK */
4488 if (socket_fd >= 0)
a34ceba6 4489 (void) fd_nonblock(socket_fd, false);
acbb0225 4490
4c70a4a7
MS
4491 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4492 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4493 if (params->cgroup_path) {
4494 _cleanup_free_ char *p = NULL;
4495
4496 r = exec_parameters_get_cgroup_path(params, &p);
4497 if (r < 0) {
4498 *exit_status = EXIT_CGROUP;
4499 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4500 }
4501
4502 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4503 if (r == -EUCLEAN) {
4504 *exit_status = EXIT_CGROUP;
4505 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4506 "because the cgroup or one of its parents or "
4507 "siblings is in the threaded mode: %m", p);
4508 }
4c70a4a7
MS
4509 if (r < 0) {
4510 *exit_status = EXIT_CGROUP;
4511 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4512 }
4513 }
4514
a8d08f39 4515 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4516 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4517 if (r < 0) {
4518 *exit_status = EXIT_NETWORK;
4519 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4520 }
4521 }
4522
a70581ff
XR
4523 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4524 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4525 if (r < 0) {
4526 *exit_status = EXIT_NAMESPACE;
4527 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4528 }
4529 }
4530
52c239d7 4531 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4532 if (r < 0) {
4533 *exit_status = EXIT_STDIN;
12145637 4534 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4535 }
034c6ed7 4536
52c239d7 4537 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4538 if (r < 0) {
4539 *exit_status = EXIT_STDOUT;
12145637 4540 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4541 }
4542
52c239d7 4543 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4544 if (r < 0) {
4545 *exit_status = EXIT_STDERR;
12145637 4546 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4547 }
4548
d35fbf6b 4549 if (context->oom_score_adjust_set) {
9f8168eb
LP
4550 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4551 * prohibit write access to this file, and we shouldn't trip up over that. */
4552 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4553 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4554 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4555 else if (r < 0) {
ff0af2a1 4556 *exit_status = EXIT_OOM_ADJUST;
12145637 4557 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4558 }
d35fbf6b
DM
4559 }
4560
ad21e542
ZJS
4561 if (context->coredump_filter_set) {
4562 r = set_coredump_filter(context->coredump_filter);
4563 if (ERRNO_IS_PRIVILEGE(r))
4564 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4565 else if (r < 0)
4566 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4567 }
4568
39090201
DJL
4569 if (context->nice_set) {
4570 r = setpriority_closest(context->nice);
4571 if (r < 0)
4572 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4573 }
613b411c 4574
d35fbf6b
DM
4575 if (context->cpu_sched_set) {
4576 struct sched_param param = {
4577 .sched_priority = context->cpu_sched_priority,
4578 };
4579
ff0af2a1
LP
4580 r = sched_setscheduler(0,
4581 context->cpu_sched_policy |
4582 (context->cpu_sched_reset_on_fork ?
4583 SCHED_RESET_ON_FORK : 0),
4584 &param);
4585 if (r < 0) {
4586 *exit_status = EXIT_SETSCHEDULER;
12145637 4587 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4588 }
d35fbf6b 4589 }
fc9b2a84 4590
e2b2fb7f
MS
4591 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4592 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4593 const CPUSet *cpu_set;
4594
4595 if (context->cpu_affinity_from_numa) {
4596 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4597 if (r < 0) {
4598 *exit_status = EXIT_CPUAFFINITY;
4599 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4600 }
4601
4602 cpu_set = &converted_cpu_set;
4603 } else
4604 cpu_set = &context->cpu_set;
4605
4606 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4607 *exit_status = EXIT_CPUAFFINITY;
12145637 4608 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4609 }
e2b2fb7f 4610 }
034c6ed7 4611
b070c7c0
MS
4612 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4613 r = apply_numa_policy(&context->numa_policy);
4614 if (r == -EOPNOTSUPP)
33fe9e3f 4615 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4616 else if (r < 0) {
4617 *exit_status = EXIT_NUMA_POLICY;
4618 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4619 }
4620 }
4621
d35fbf6b
DM
4622 if (context->ioprio_set)
4623 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4624 *exit_status = EXIT_IOPRIO;
12145637 4625 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4626 }
da726a4d 4627
d35fbf6b
DM
4628 if (context->timer_slack_nsec != NSEC_INFINITY)
4629 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4630 *exit_status = EXIT_TIMERSLACK;
12145637 4631 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4632 }
9eba9da4 4633
21022b9d
LP
4634 if (context->personality != PERSONALITY_INVALID) {
4635 r = safe_personality(context->personality);
4636 if (r < 0) {
ff0af2a1 4637 *exit_status = EXIT_PERSONALITY;
12145637 4638 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4639 }
21022b9d 4640 }
94f04347 4641
33331d11
VB
4642 if (context->utmp_id) {
4643 const char *line = context->tty_path ?
4644 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4645 NULL;
df0ff127 4646 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4647 line,
023a4f67
LP
4648 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4649 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4650 USER_PROCESS,
6a93917d 4651 username);
33331d11 4652 }
d35fbf6b 4653
08f67696 4654 if (uid_is_valid(uid)) {
ff0af2a1
LP
4655 r = chown_terminal(STDIN_FILENO, uid);
4656 if (r < 0) {
4657 *exit_status = EXIT_STDIN;
12145637 4658 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4659 }
d35fbf6b 4660 }
8e274523 4661
4e1dfa45 4662 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4663 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4664 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4665 * touch a single hierarchy too. */
584b8688 4666 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4667 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4668 if (r < 0) {
4669 *exit_status = EXIT_CGROUP;
12145637 4670 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4671 }
d35fbf6b 4672 }
034c6ed7 4673
211a3d87
LB
4674 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4675
5b10116e 4676 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4677 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4678 if (r < 0)
4679 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4680 }
94f04347 4681
bb0c0d6f
LP
4682 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4683 r = setup_credentials(context, params, unit->id, uid);
4684 if (r < 0) {
4685 *exit_status = EXIT_CREDENTIALS;
4686 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4687 }
4688 }
4689
7bce046b 4690 r = build_environment(
fd63e712 4691 unit,
7bce046b
LP
4692 context,
4693 params,
4694 n_fds,
cd48e23f 4695 fdnames,
7bce046b
LP
4696 home,
4697 username,
4698 shell,
4699 journal_stream_dev,
4700 journal_stream_ino,
4701 &our_env);
2065ca69
JW
4702 if (r < 0) {
4703 *exit_status = EXIT_MEMORY;
12145637 4704 return log_oom();
2065ca69
JW
4705 }
4706
4707 r = build_pass_environment(context, &pass_env);
4708 if (r < 0) {
4709 *exit_status = EXIT_MEMORY;
12145637 4710 return log_oom();
2065ca69
JW
4711 }
4712
adf769b0
ZJS
4713 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4714 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4715 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4716 if (!strv_isempty(context->exec_search_path)) {
4717 _cleanup_free_ char *joined = NULL;
4718
4719 joined = strv_join(context->exec_search_path, ":");
4720 if (!joined) {
4721 *exit_status = EXIT_MEMORY;
4722 return log_oom();
4723 }
4724
4725 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4726 if (r < 0) {
4727 *exit_status = EXIT_MEMORY;
4728 return log_oom();
4729 }
4730 }
4731
4ab3d29f 4732 accum_env = strv_env_merge(params->environment,
2065ca69 4733 our_env,
8c35c10d 4734 joined_exec_search_path,
2065ca69
JW
4735 pass_env,
4736 context->environment,
44e5d006 4737 files_env);
2065ca69
JW
4738 if (!accum_env) {
4739 *exit_status = EXIT_MEMORY;
12145637 4740 return log_oom();
2065ca69 4741 }
1280503b 4742 accum_env = strv_env_clean(accum_env);
2065ca69 4743
096424d1 4744 (void) umask(context->umask);
b213e1c1 4745
b1edf445 4746 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4747 if (r < 0) {
4748 *exit_status = EXIT_KEYRING;
12145637 4749 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4750 }
4751
adf769b0
ZJS
4752 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4753 * from it. */
1703fa41 4754 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4755
adf769b0
ZJS
4756 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4757 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4758 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4759
adf769b0
ZJS
4760 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4761 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4762 * desired. */
165a31c0
LP
4763 if (needs_ambient_hack)
4764 needs_setuid = false;
4765 else
4766 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4767
4768 if (needs_sandboxing) {
adf769b0
ZJS
4769 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4770 * /sys being present. The actual MAC context application will happen later, as late as
4771 * possible, to avoid impacting our own code paths. */
7f18ef0a 4772
349cc4a5 4773#if HAVE_SELINUX
43b1f709 4774 use_selinux = mac_selinux_use();
7f18ef0a 4775#endif
f9fa32f0 4776#if ENABLE_SMACK
43b1f709 4777 use_smack = mac_smack_use();
7f18ef0a 4778#endif
349cc4a5 4779#if HAVE_APPARMOR
43b1f709 4780 use_apparmor = mac_apparmor_use();
7f18ef0a 4781#endif
165a31c0 4782 }
7f18ef0a 4783
ce932d2d
LP
4784 if (needs_sandboxing) {
4785 int which_failed;
4786
4787 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4788 * is set here. (See below.) */
4789
4790 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4791 if (r < 0) {
4792 *exit_status = EXIT_LIMITS;
4793 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4794 }
4795 }
4796
0af07108 4797 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4798 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4799 * wins here. (See above.) */
4800
1da37e58 4801 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4802 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4803 if (r < 0) {
4804 *exit_status = EXIT_PAM;
4805 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4806 }
ac45f971 4807
0af07108
ZJS
4808 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4809 if (ngids_after_pam < 0) {
4810 *exit_status = EXIT_MEMORY;
4811 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4812 }
b213e1c1 4813 }
5749f855 4814
26c45a6c 4815 if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
5749f855
AZ
4816 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4817 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4818 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4819
4820 userns_set_up = true;
4821 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4822 if (r < 0) {
4823 *exit_status = EXIT_USER;
4824 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4825 }
4826 }
4827
a8d08f39
LP
4828 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4829
6e2d7c4f 4830 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4831 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4832 if (r == -EPERM)
4833 log_unit_warning_errno(unit, r,
4834 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4835 else if (r < 0) {
6e2d7c4f
MS
4836 *exit_status = EXIT_NETWORK;
4837 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4838 }
a8d08f39
LP
4839 } else if (context->network_namespace_path) {
4840 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4841 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4842 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4843 } else
4844 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4845 }
169c1bda 4846
a70581ff
XR
4847 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4848
4849 if (ns_type_supported(NAMESPACE_IPC)) {
4850 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4851 if (r == -EPERM)
4852 log_unit_warning_errno(unit, r,
4853 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4854 else if (r < 0) {
4855 *exit_status = EXIT_NAMESPACE;
4856 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4857 }
4858 } else if (context->ipc_namespace_path) {
4859 *exit_status = EXIT_NAMESPACE;
4860 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4861 "IPCNamespacePath= is not supported, refusing.");
4862 } else
4863 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4864 }
4865
ee818b89 4866 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4867 _cleanup_free_ char *error_path = NULL;
4868
9f71ba8d 4869 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4870 if (r < 0) {
4871 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4872 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4873 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4874 }
d35fbf6b 4875 }
81a2b7ce 4876
daf8f72b
LP
4877 if (needs_sandboxing) {
4878 r = apply_protect_hostname(unit, context, exit_status);
4879 if (r < 0)
4880 return r;
aecd5ac6
TM
4881 }
4882
5749f855
AZ
4883 /* Drop groups as early as possible.
4884 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4885 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4886 if (needs_setuid) {
afb11bf1
DG
4887 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4888 int ngids_to_enforce = 0;
4889
4890 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4891 ngids,
4892 gids_after_pam,
4893 ngids_after_pam,
4894 &gids_to_enforce);
4895 if (ngids_to_enforce < 0) {
4896 *exit_status = EXIT_MEMORY;
4897 return log_unit_error_errno(unit,
4898 ngids_to_enforce,
4899 "Failed to merge group lists. Group membership might be incorrect: %m");
4900 }
4901
4902 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4903 if (r < 0) {
4904 *exit_status = EXIT_GROUP;
12145637 4905 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4906 }
165a31c0 4907 }
096424d1 4908
5749f855
AZ
4909 /* If the user namespace was not set up above, try to do it now.
4910 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
d09df6b9 4911 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5749f855
AZ
4912 * case of mount namespaces being less privileged when the mount point list is copied from a
4913 * different user namespace). */
9008e1ac 4914
5749f855
AZ
4915 if (needs_sandboxing && context->private_users && !userns_set_up) {
4916 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4917 if (r < 0) {
4918 *exit_status = EXIT_USER;
4919 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4920 }
4921 }
4922
9f71ba8d
ZJS
4923 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4924 * shall execute. */
4925
4926 _cleanup_free_ char *executable = NULL;
254d1313 4927 _cleanup_close_ int executable_fd = -EBADF;
8c35c10d 4928 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4929 if (r < 0) {
4930 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4931 log_unit_struct_errno(unit, LOG_INFO, r,
4932 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4933 LOG_UNIT_INVOCATION_ID(unit),
4934 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4935 command->path),
4936 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4937 return 0;
4938 }
4939
4940 *exit_status = EXIT_EXEC;
c2503e35
RH
4941
4942 return log_unit_struct_errno(unit, LOG_INFO, r,
4943 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4944 LOG_UNIT_INVOCATION_ID(unit),
4945 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4946 command->path),
4947 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4948 }
4949
b83d5050
ZJS
4950 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4951 if (r < 0) {
4952 *exit_status = EXIT_FDS;
4953 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4954 }
4955
9f71ba8d 4956#if HAVE_SELINUX
49590d67 4957 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
254d1313 4958 int fd = -EBADF;
49590d67
MS
4959
4960 if (socket_fd >= 0)
4961 fd = socket_fd;
4962 else if (params->n_socket_fds == 1)
4963 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4964 * use context from that fd to compute the label. */
4965 fd = params->fds[0];
4966
4967 if (fd >= 0) {
4968 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4969 if (r < 0) {
4970 if (!context->selinux_context_ignore) {
4971 *exit_status = EXIT_SELINUX_CONTEXT;
4972 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4973 }
4974 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4975 }
9f71ba8d
ZJS
4976 }
4977 }
4978#endif
4979
165a31c0 4980 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4981 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4982 * however if we have it as we want to keep it open until the final execve(). */
4983
1da37e58 4984 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4985 if (r >= 0)
4986 r = shift_fds(fds, n_fds);
4987 if (r >= 0)
cd48e23f 4988 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
ff0af2a1
LP
4989 if (r < 0) {
4990 *exit_status = EXIT_FDS;
12145637 4991 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4992 }
e66cf1a3 4993
5686391b
LP
4994 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4995 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4996 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4997 * came this far. */
4998
165a31c0 4999 secure_bits = context->secure_bits;
e66cf1a3 5000
165a31c0
LP
5001 if (needs_sandboxing) {
5002 uint64_t bset;
e66cf1a3 5003
ce932d2d
LP
5004 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
5005 * requested. (Note this is placed after the general resource limit initialization, see
5006 * above, in order to take precedence.) */
f4170c67
LP
5007 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5008 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5009 *exit_status = EXIT_LIMITS;
12145637 5010 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
5011 }
5012 }
5013
37ac2744
JB
5014#if ENABLE_SMACK
5015 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5016 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5017 if (use_smack) {
aa5ae971 5018 r = setup_smack(unit->manager, context, executable_fd);
29ff6247 5019 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
5020 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5021 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5022 }
5023 }
5024#endif
5025
165a31c0
LP
5026 bset = context->capability_bounding_set;
5027 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5028 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5029 * instead of us doing that */
5030 if (needs_ambient_hack)
5031 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5032 (UINT64_C(1) << CAP_SETUID) |
5033 (UINT64_C(1) << CAP_SETGID);
5034
5035 if (!cap_test_all(bset)) {
5036 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
5037 if (r < 0) {
5038 *exit_status = EXIT_CAPABILITIES;
12145637 5039 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 5040 }
4c2630eb 5041 }
3b8bddde 5042
16fcb191
TK
5043 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5044 * keep-caps set.
5045 * To be able to raise the ambient capabilities after setresuid() they have to be
5046 * added to the inherited set and keep caps has to be set (done in enforce_user()).
5047 * After setresuid() the ambient capabilities can be raised as they are present in
5048 * the permitted and inheritable set. However it is possible that someone wants to
5049 * set ambient capabilities without changing the user, so we also set the ambient
5050 * capabilities here.
5051 * The requested ambient capabilities are raised in the inheritable set if the
5052 * second argument is true. */
943800f4 5053 if (!needs_ambient_hack) {
755d4b67
IP
5054 r = capability_ambient_set_apply(context->capability_ambient_set, true);
5055 if (r < 0) {
5056 *exit_status = EXIT_CAPABILITIES;
12145637 5057 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 5058 }
755d4b67 5059 }
165a31c0 5060 }
755d4b67 5061
fa97f630
JB
5062 /* chroot to root directory first, before we lose the ability to chroot */
5063 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
5064 if (r < 0)
5065 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5066
165a31c0 5067 if (needs_setuid) {
08f67696 5068 if (uid_is_valid(uid)) {
ff0af2a1
LP
5069 r = enforce_user(context, uid);
5070 if (r < 0) {
5071 *exit_status = EXIT_USER;
12145637 5072 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 5073 }
165a31c0
LP
5074
5075 if (!needs_ambient_hack &&
5076 context->capability_ambient_set != 0) {
755d4b67 5077
16fcb191 5078 /* Raise the ambient capabilities after user change. */
755d4b67
IP
5079 r = capability_ambient_set_apply(context->capability_ambient_set, false);
5080 if (r < 0) {
5081 *exit_status = EXIT_CAPABILITIES;
12145637 5082 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 5083 }
755d4b67 5084 }
5b6319dc 5085 }
165a31c0 5086 }
d35fbf6b 5087
56ef8db9
JB
5088 /* Apply working directory here, because the working directory might be on NFS and only the user running
5089 * this service might have the correct privilege to change to the working directory */
fa97f630 5090 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
5091 if (r < 0)
5092 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5093
165a31c0 5094 if (needs_sandboxing) {
37ac2744 5095 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
5096 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5097 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5098 * are restricted. */
5099
349cc4a5 5100#if HAVE_SELINUX
43b1f709 5101 if (use_selinux) {
5cd9cd35
LP
5102 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5103
5104 if (exec_context) {
5105 r = setexeccon(exec_context);
006d1864
TM
5106 if (r < 0) {
5107 if (!context->selinux_context_ignore) {
5108 *exit_status = EXIT_SELINUX_CONTEXT;
5109 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5110 }
5111 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
5112 }
5113 }
5114 }
5115#endif
5116
349cc4a5 5117#if HAVE_APPARMOR
43b1f709 5118 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
5119 r = aa_change_onexec(context->apparmor_profile);
5120 if (r < 0 && !context->apparmor_profile_ignore) {
5121 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 5122 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
5123 }
5124 }
5125#endif
5126
165a31c0 5127 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
5128 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
5129 * CAP_SETPCAP. */
5130 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 5131 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
5132 * effective set here.
5133 * The effective set is overwritten during execve with the following values:
5134 * - ambient set (for non-root processes)
5135 * - (inheritable | bounding) set for root processes)
5136 *
5137 * Hence there is no security impact to raise it in the effective set before execve
5138 */
5139 r = capability_gain_cap_setpcap(NULL);
5140 if (r < 0) {
5141 *exit_status = EXIT_CAPABILITIES;
5142 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5143 }
755d4b67 5144 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 5145 *exit_status = EXIT_SECUREBITS;
12145637 5146 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 5147 }
dbdc4098 5148 }
5b6319dc 5149
59eeb84b 5150 if (context_has_no_new_privileges(context))
d35fbf6b 5151 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 5152 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 5153 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
5154 }
5155
349cc4a5 5156#if HAVE_SECCOMP
469830d1
LP
5157 r = apply_address_families(unit, context);
5158 if (r < 0) {
5159 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 5160 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 5161 }
04aa0cb9 5162
469830d1
LP
5163 r = apply_memory_deny_write_execute(unit, context);
5164 if (r < 0) {
5165 *exit_status = EXIT_SECCOMP;
12145637 5166 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5167 }
f4170c67 5168
469830d1
LP
5169 r = apply_restrict_realtime(unit, context);
5170 if (r < 0) {
5171 *exit_status = EXIT_SECCOMP;
12145637 5172 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5173 }
5174
f69567cb
LP
5175 r = apply_restrict_suid_sgid(unit, context);
5176 if (r < 0) {
5177 *exit_status = EXIT_SECCOMP;
5178 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5179 }
5180
add00535
LP
5181 r = apply_restrict_namespaces(unit, context);
5182 if (r < 0) {
5183 *exit_status = EXIT_SECCOMP;
12145637 5184 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5185 }
5186
469830d1
LP
5187 r = apply_protect_sysctl(unit, context);
5188 if (r < 0) {
5189 *exit_status = EXIT_SECCOMP;
12145637 5190 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5191 }
5192
469830d1
LP
5193 r = apply_protect_kernel_modules(unit, context);
5194 if (r < 0) {
5195 *exit_status = EXIT_SECCOMP;
12145637 5196 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5197 }
5198
84703040
KK
5199 r = apply_protect_kernel_logs(unit, context);
5200 if (r < 0) {
5201 *exit_status = EXIT_SECCOMP;
5202 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5203 }
5204
fc64760d
KK
5205 r = apply_protect_clock(unit, context);
5206 if (r < 0) {
5207 *exit_status = EXIT_SECCOMP;
5208 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5209 }
5210
469830d1
LP
5211 r = apply_private_devices(unit, context);
5212 if (r < 0) {
5213 *exit_status = EXIT_SECCOMP;
12145637 5214 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5215 }
5216
5217 r = apply_syscall_archs(unit, context);
5218 if (r < 0) {
5219 *exit_status = EXIT_SECCOMP;
12145637 5220 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5221 }
5222
78e864e5
TM
5223 r = apply_lock_personality(unit, context);
5224 if (r < 0) {
5225 *exit_status = EXIT_SECCOMP;
12145637 5226 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5227 }
5228
9df2cdd8
TM
5229 r = apply_syscall_log(unit, context);
5230 if (r < 0) {
5231 *exit_status = EXIT_SECCOMP;
5232 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5233 }
5234
5cd9cd35
LP
5235 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5236 * by the filter as little as possible. */
165a31c0 5237 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5238 if (r < 0) {
5239 *exit_status = EXIT_SECCOMP;
12145637 5240 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5241 }
5242#endif
b1994387
ILG
5243
5244#if HAVE_LIBBPF
5245 r = apply_restrict_filesystems(unit, context);
5246 if (r < 0) {
5247 *exit_status = EXIT_BPF;
5248 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5249 }
5250#endif
5251
d35fbf6b 5252 }
034c6ed7 5253
00819cc1
LP
5254 if (!strv_isempty(context->unset_environment)) {
5255 char **ee = NULL;
5256
5257 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5258 if (!ee) {
5259 *exit_status = EXIT_MEMORY;
12145637 5260 return log_oom();
00819cc1
LP
5261 }
5262
130d3d22 5263 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5264 }
5265
7ca69792
AZ
5266 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5267 replaced_argv = replace_env_argv(command->argv, accum_env);
5268 if (!replaced_argv) {
5269 *exit_status = EXIT_MEMORY;
5270 return log_oom();
5271 }
5272 final_argv = replaced_argv;
5273 } else
5274 final_argv = command->argv;
034c6ed7 5275
f1d34068 5276 if (DEBUG_LOGGING) {
c2b2df60 5277 _cleanup_free_ char *line = NULL;
81a2b7ce 5278
4ef15008 5279 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
5280 if (!line) {
5281 *exit_status = EXIT_MEMORY;
5282 return log_oom();
5283 }
5284
5285 log_unit_struct(unit, LOG_DEBUG,
5286 "EXECUTABLE=%s", executable,
5287 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 5288 }
dd305ec9 5289
5686391b
LP
5290 if (exec_fd >= 0) {
5291 uint8_t hot = 1;
5292
5293 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5294 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5295
5296 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5297 *exit_status = EXIT_EXEC;
5298 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5299 }
5300 }
5301
a6d9111c 5302 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5303
5304 if (exec_fd >= 0) {
5305 uint8_t hot = 0;
5306
5307 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5308 * that POLLHUP on it no longer means execve() succeeded. */
5309
5310 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5311 *exit_status = EXIT_EXEC;
5312 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5313 }
5314 }
12145637 5315
ff0af2a1 5316 *exit_status = EXIT_EXEC;
9f71ba8d 5317 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5318}
81a2b7ce 5319
34cf6c43 5320static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5321static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5322
f2341e0a
LP
5323int exec_spawn(Unit *unit,
5324 ExecCommand *command,
d35fbf6b
DM
5325 const ExecContext *context,
5326 const ExecParameters *params,
5327 ExecRuntime *runtime,
29206d46 5328 DynamicCreds *dcreds,
d35fbf6b 5329 pid_t *ret) {
8351ceae 5330
ee39ca20 5331 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5332 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5333 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5334 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 5335 _cleanup_free_ char *line = NULL;
d35fbf6b 5336 pid_t pid;
8351ceae 5337
f2341e0a 5338 assert(unit);
d35fbf6b
DM
5339 assert(command);
5340 assert(context);
5341 assert(ret);
5342 assert(params);
25b583d7 5343 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5344
d35fbf6b
DM
5345 if (context->std_input == EXEC_INPUT_SOCKET ||
5346 context->std_output == EXEC_OUTPUT_SOCKET ||
5347 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5348
d85ff944
YW
5349 if (params->n_socket_fds > 1)
5350 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5351
d85ff944
YW
5352 if (params->n_socket_fds == 0)
5353 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5354
d35fbf6b
DM
5355 socket_fd = params->fds[0];
5356 } else {
254d1313 5357 socket_fd = -EBADF;
d35fbf6b 5358 fds = params->fds;
9b141911 5359 n_socket_fds = params->n_socket_fds;
25b583d7 5360 n_storage_fds = params->n_storage_fds;
d35fbf6b 5361 }
94f04347 5362
34cf6c43 5363 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5364 if (r < 0)
5365 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5366
f2341e0a 5367 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5368 if (r < 0)
f2341e0a 5369 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5370
4ef15008 5371 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
5372 if (!line)
5373 return log_oom();
fab56fc5 5374
9f71ba8d
ZJS
5375 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5376 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5377 mac_selinux_maybe_reload();
5378
c2503e35
RH
5379 log_unit_struct(unit, LOG_DEBUG,
5380 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5381 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5382 the mount namespace in the child, but we want to log
5383 from the parent, so we need to use the (possibly
5384 inaccurate) path here. */
5385 LOG_UNIT_INVOCATION_ID(unit));
12145637 5386
78f93209
LP
5387 if (params->cgroup_path) {
5388 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5389 if (r < 0)
5390 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5391 if (r > 0) { /* We are using a child cgroup */
5392 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5393 if (r < 0)
5394 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa 5395
523ea123 5396 /* Normally we would not propagate the xattrs to children but since we created this
4e806bfa
AZ
5397 * sub-cgroup internally we should do it. */
5398 cgroup_oomd_xattr_apply(unit, subcgroup_path);
523ea123 5399 cgroup_log_xattr_apply(unit, subcgroup_path);
78f93209
LP
5400 }
5401 }
5402
d35fbf6b
DM
5403 pid = fork();
5404 if (pid < 0)
74129a12 5405 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5406
5407 if (pid == 0) {
12145637 5408 int exit_status = EXIT_SUCCESS;
ff0af2a1 5409
f2341e0a
LP
5410 r = exec_child(unit,
5411 command,
ff0af2a1
LP
5412 context,
5413 params,
5414 runtime,
29206d46 5415 dcreds,
ff0af2a1 5416 socket_fd,
52c239d7 5417 named_iofds,
4c47affc 5418 fds,
9b141911 5419 n_socket_fds,
25b583d7 5420 n_storage_fds,
ff0af2a1 5421 files_env,
00d9ef85 5422 unit->manager->user_lookup_fds[1],
12145637
LP
5423 &exit_status);
5424
e1714f02
ZJS
5425 if (r < 0) {
5426 const char *status =
5427 exit_status_to_string(exit_status,
e04ed6db 5428 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5429
c2503e35
RH
5430 log_unit_struct_errno(unit, LOG_ERR, r,
5431 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5432 LOG_UNIT_INVOCATION_ID(unit),
5433 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5434 status, command->path),
5435 "EXECUTABLE=%s", command->path);
e1714f02 5436 }
4c2630eb 5437
ff0af2a1 5438 _exit(exit_status);
034c6ed7
LP
5439 }
5440
f2341e0a 5441 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5442
78f93209
LP
5443 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5444 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5445 * process will be killed too). */
5446 if (subcgroup_path)
5447 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5448
b58b4116 5449 exec_status_start(&command->exec_status, pid);
9fb86720 5450
034c6ed7 5451 *ret = pid;
5cb5a6ff
LP
5452 return 0;
5453}
5454
034c6ed7
LP
5455void exec_context_init(ExecContext *c) {
5456 assert(c);
5457
4c12626c 5458 c->umask = 0022;
0692548c 5459 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5460 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5461 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5462 c->syslog_level_prefix = true;
353e12c2 5463 c->ignore_sigpipe = true;
3a43da28 5464 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5465 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5466 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5467 c->directories[t].mode = 0755;
12213aed 5468 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5469 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
5470 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5471 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5472 c->log_level_max = -1;
005bfaf1
TM
5473#if HAVE_SECCOMP
5474 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5475#endif
51462135
DDM
5476 c->tty_rows = UINT_MAX;
5477 c->tty_cols = UINT_MAX;
b070c7c0 5478 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5479}
5480
613b411c 5481void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5482 assert(c);
5483
6796073e
LP
5484 c->environment = strv_free(c->environment);
5485 c->environment_files = strv_free(c->environment_files);
b4c14404 5486 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5487 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5488
31ce987c 5489 rlimit_free_all(c->rlimit);
034c6ed7 5490
5b10116e 5491 for (size_t l = 0; l < 3; l++) {
52c239d7 5492 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5493 c->stdio_file[l] = mfree(c->stdio_file[l]);
5494 }
52c239d7 5495
a1e58e8e
LP
5496 c->working_directory = mfree(c->working_directory);
5497 c->root_directory = mfree(c->root_directory);
915e6d16 5498 c->root_image = mfree(c->root_image);
18d73705 5499 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5500 c->root_hash = mfree(c->root_hash);
5501 c->root_hash_size = 0;
5502 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5503 c->root_hash_sig = mfree(c->root_hash_sig);
5504 c->root_hash_sig_size = 0;
5505 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5506 c->root_verity = mfree(c->root_verity);
93f59701 5507 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5508 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5509 c->tty_path = mfree(c->tty_path);
5510 c->syslog_identifier = mfree(c->syslog_identifier);
5511 c->user = mfree(c->user);
5512 c->group = mfree(c->group);
034c6ed7 5513
6796073e 5514 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5515
a1e58e8e 5516 c->pam_name = mfree(c->pam_name);
5b6319dc 5517
2a624c36
AP
5518 c->read_only_paths = strv_free(c->read_only_paths);
5519 c->read_write_paths = strv_free(c->read_write_paths);
5520 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5521 c->exec_paths = strv_free(c->exec_paths);
5522 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5523 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5524
d2d6c096 5525 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5526 c->bind_mounts = NULL;
5527 c->n_bind_mounts = 0;
2abd4e38
YW
5528 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5529 c->temporary_filesystems = NULL;
5530 c->n_temporary_filesystems = 0;
b3d13314 5531 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5532
0985c7c4 5533 cpu_set_reset(&c->cpu_set);
b070c7c0 5534 numa_policy_reset(&c->numa_policy);
86a3475b 5535
a1e58e8e
LP
5536 c->utmp_id = mfree(c->utmp_id);
5537 c->selinux_context = mfree(c->selinux_context);
5538 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5539 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5540
b1994387
ILG
5541 c->restrict_filesystems = set_free(c->restrict_filesystems);
5542
8cfa775f 5543 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5544 c->syscall_archs = set_free(c->syscall_archs);
5545 c->address_families = set_free(c->address_families);
e66cf1a3 5546
5b10116e 5547 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5548 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5549
5550 c->log_level_max = -1;
5551
5552 exec_context_free_log_extra_fields(c);
523ea123
QD
5553 c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5554 c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
08f3be7a 5555
5ac1530e
ZJS
5556 c->log_ratelimit_interval_usec = 0;
5557 c->log_ratelimit_burst = 0;
90fc172e 5558
08f3be7a
LP
5559 c->stdin_data = mfree(c->stdin_data);
5560 c->stdin_data_size = 0;
a8d08f39
LP
5561
5562 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5563 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5564
5565 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5566
43144be4 5567 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5568 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5569}
5570
34cf6c43 5571int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5572 assert(c);
5573
5574 if (!runtime_prefix)
5575 return 0;
5576
211a3d87 5577 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5578 _cleanup_free_ char *p = NULL;
e66cf1a3 5579
494d0247 5580 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5581 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5582 else
211a3d87 5583 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5584 if (!p)
5585 return -ENOMEM;
5586
7bc4bf4a
LP
5587 /* We execute this synchronously, since we need to be sure this is gone when we start the
5588 * service next. */
c6878637 5589 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5590
211a3d87
LB
5591 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5592 _cleanup_free_ char *symlink_abs = NULL;
5593
5594 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5595 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5596 else
5597 symlink_abs = path_join(runtime_prefix, *symlink);
5598 if (!symlink_abs)
5599 return -ENOMEM;
5600
5601 (void) unlink(symlink_abs);
5602 }
e66cf1a3
LP
5603 }
5604
5605 return 0;
5cb5a6ff
LP
5606}
5607
bb0c0d6f
LP
5608int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5609 _cleanup_free_ char *p = NULL;
5610
5611 assert(c);
5612
5613 if (!runtime_prefix || !unit)
5614 return 0;
5615
5616 p = path_join(runtime_prefix, "credentials", unit);
5617 if (!p)
5618 return -ENOMEM;
5619
5620 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5621 * unmount it, and afterwards remove the mount point */
5622 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5623 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5624
5625 return 0;
5626}
5627
b9f976fb
MK
5628int exec_context_destroy_mount_ns_dir(Unit *u) {
5629 _cleanup_free_ char *p = NULL;
5630
5631 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5632 return 0;
5633
5634 p = path_join("/run/systemd/propagate/", u->id);
5635 if (!p)
5636 return -ENOMEM;
5637
5638 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5639 if (rmdir(p) < 0 && errno != ENOENT)
5640 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5641
5642 return 0;
5643}
5644
34cf6c43 5645static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5646 assert(c);
5647
a1e58e8e 5648 c->path = mfree(c->path);
6796073e 5649 c->argv = strv_free(c->argv);
43d0fcbd
LP
5650}
5651
da6053d0 5652void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5653 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5654 exec_command_done(c+i);
5655}
5656
f1acf85a 5657ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5658 ExecCommand *i;
5659
5660 while ((i = c)) {
71fda00f 5661 LIST_REMOVE(command, c, i);
43d0fcbd 5662 exec_command_done(i);
5cb5a6ff
LP
5663 free(i);
5664 }
f1acf85a
ZJS
5665
5666 return NULL;
5cb5a6ff
LP
5667}
5668
da6053d0 5669void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5670 for (size_t i = 0; i < n; i++)
f1acf85a 5671 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5672}
5673
6a1d4d9f 5674void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5675 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5676 exec_status_reset(&c[i].exec_status);
5677}
5678
5679void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5680 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5681 LIST_FOREACH(command, z, c[i])
5682 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5683}
5684
039f0e70 5685typedef struct InvalidEnvInfo {
34cf6c43 5686 const Unit *unit;
039f0e70
LP
5687 const char *path;
5688} InvalidEnvInfo;
5689
5690static void invalid_env(const char *p, void *userdata) {
5691 InvalidEnvInfo *info = userdata;
5692
f2341e0a 5693 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5694}
5695
52c239d7
LB
5696const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5697 assert(c);
5698
5699 switch (fd_index) {
5073ff6b 5700
52c239d7
LB
5701 case STDIN_FILENO:
5702 if (c->std_input != EXEC_INPUT_NAMED_FD)
5703 return NULL;
5073ff6b 5704
52c239d7 5705 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5706
52c239d7
LB
5707 case STDOUT_FILENO:
5708 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5709 return NULL;
5073ff6b 5710
52c239d7 5711 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5712
52c239d7
LB
5713 case STDERR_FILENO:
5714 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5715 return NULL;
5073ff6b 5716
52c239d7 5717 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5718
52c239d7
LB
5719 default:
5720 return NULL;
5721 }
5722}
5723
2caa38e9
LP
5724static int exec_context_named_iofds(
5725 const ExecContext *c,
5726 const ExecParameters *p,
5727 int named_iofds[static 3]) {
5728
5b10116e 5729 size_t targets;
56fbd561 5730 const char* stdio_fdname[3];
da6053d0 5731 size_t n_fds;
52c239d7
LB
5732
5733 assert(c);
5734 assert(p);
2caa38e9 5735 assert(named_iofds);
52c239d7
LB
5736
5737 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5738 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5739 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5740
5b10116e 5741 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5742 stdio_fdname[i] = exec_context_fdname(c, i);
5743
4c47affc
FB
5744 n_fds = p->n_storage_fds + p->n_socket_fds;
5745
5b10116e 5746 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5747 if (named_iofds[STDIN_FILENO] < 0 &&
5748 c->std_input == EXEC_INPUT_NAMED_FD &&
5749 stdio_fdname[STDIN_FILENO] &&
5750 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5751
52c239d7
LB
5752 named_iofds[STDIN_FILENO] = p->fds[i];
5753 targets--;
56fbd561
ZJS
5754
5755 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5756 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5757 stdio_fdname[STDOUT_FILENO] &&
5758 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5759
52c239d7
LB
5760 named_iofds[STDOUT_FILENO] = p->fds[i];
5761 targets--;
56fbd561
ZJS
5762
5763 } else if (named_iofds[STDERR_FILENO] < 0 &&
5764 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5765 stdio_fdname[STDERR_FILENO] &&
5766 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5767
52c239d7
LB
5768 named_iofds[STDERR_FILENO] = p->fds[i];
5769 targets--;
5770 }
5771
56fbd561 5772 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5773}
5774
398a5009
ZJS
5775static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5776 _cleanup_strv_free_ char **v = NULL;
398a5009 5777 int r;
8c7be95e
LP
5778
5779 assert(c);
398a5009 5780 assert(ret);
8c7be95e
LP
5781
5782 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5783 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5784 bool ignore = false;
5785 char *fn = *i;
8c7be95e
LP
5786
5787 if (fn[0] == '-') {
5788 ignore = true;
313cefa1 5789 fn++;
8c7be95e
LP
5790 }
5791
5792 if (!path_is_absolute(fn)) {
8c7be95e
LP
5793 if (ignore)
5794 continue;
8c7be95e
LP
5795 return -EINVAL;
5796 }
5797
2bef10ab 5798 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5799 r = safe_glob(fn, 0, &pglob);
5800 if (r < 0) {
2bef10ab
PL
5801 if (ignore)
5802 continue;
398a5009 5803 return r;
2bef10ab 5804 }
8c7be95e 5805
d8c92e8b
ZJS
5806 /* When we don't match anything, -ENOENT should be returned */
5807 assert(pglob.gl_pathc > 0);
5808
5b10116e 5809 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5810 _cleanup_strv_free_ char **p = NULL;
5811
5812 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5813 if (r < 0) {
2bef10ab
PL
5814 if (ignore)
5815 continue;
398a5009 5816 return r;
e9c1ea9d 5817 }
398a5009 5818
ebc05a09 5819 /* Log invalid environment variables with filename */
039f0e70
LP
5820 if (p) {
5821 InvalidEnvInfo info = {
f2341e0a 5822 .unit = unit,
039f0e70
LP
5823 .path = pglob.gl_pathv[n]
5824 };
5825
5826 p = strv_env_clean_with_callback(p, invalid_env, &info);
5827 }
8c7be95e 5828
398a5009
ZJS
5829 if (!v)
5830 v = TAKE_PTR(p);
2bef10ab 5831 else {
398a5009 5832 char **m = strv_env_merge(v, p);
c84a9488 5833 if (!m)
2bef10ab 5834 return -ENOMEM;
2bef10ab 5835
398a5009 5836 strv_free_and_replace(v, m);
2bef10ab 5837 }
8c7be95e
LP
5838 }
5839 }
5840
398a5009 5841 *ret = TAKE_PTR(v);
8c7be95e
LP
5842
5843 return 0;
5844}
5845
6ac8fdc9 5846static bool tty_may_match_dev_console(const char *tty) {
7b912648 5847 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5848
1e22b5cd
LP
5849 if (!tty)
5850 return true;
5851
a119ec7c 5852 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5853
5854 /* trivial identity? */
5855 if (streq(tty, "console"))
5856 return true;
5857
7b912648
LP
5858 if (resolve_dev_console(&resolved) < 0)
5859 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5860
5861 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5862 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5863}
5864
6c0ae739
LP
5865static bool exec_context_may_touch_tty(const ExecContext *ec) {
5866 assert(ec);
1e22b5cd 5867
6c0ae739 5868 return ec->tty_reset ||
1e22b5cd
LP
5869 ec->tty_vhangup ||
5870 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5871 is_terminal_input(ec->std_input) ||
5872 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5873 is_terminal_output(ec->std_error);
5874}
5875
5876bool exec_context_may_touch_console(const ExecContext *ec) {
5877
5878 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5879 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5880}
5881
15ae422b 5882static void strv_fprintf(FILE *f, char **l) {
15ae422b
LP
5883 assert(f);
5884
5885 STRV_FOREACH(g, l)
5886 fprintf(f, " %s", *g);
5887}
5888
ddc155b2
TM
/* Prints "<prefix><name>: <item> <item> ...\n" to 'f'. Nothing is written if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5900
34cf6c43 5901void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5902 int r;
9eba9da4 5903
5cb5a6ff
LP
5904 assert(c);
5905 assert(f);
5906
4ad49000 5907 prefix = strempty(prefix);
5cb5a6ff
LP
5908
5909 fprintf(f,
94f04347
LP
5910 "%sUMask: %04o\n"
5911 "%sWorkingDirectory: %s\n"
451a074f 5912 "%sRootDirectory: %s\n"
15ae422b 5913 "%sNonBlocking: %s\n"
64747e2d 5914 "%sPrivateTmp: %s\n"
7f112f50 5915 "%sPrivateDevices: %s\n"
59eeb84b 5916 "%sProtectKernelTunables: %s\n"
e66a2f65 5917 "%sProtectKernelModules: %s\n"
84703040 5918 "%sProtectKernelLogs: %s\n"
fc64760d 5919 "%sProtectClock: %s\n"
59eeb84b 5920 "%sProtectControlGroups: %s\n"
d251207d
LP
5921 "%sPrivateNetwork: %s\n"
5922 "%sPrivateUsers: %s\n"
1b8689f9
LP
5923 "%sProtectHome: %s\n"
5924 "%sProtectSystem: %s\n"
5d997827 5925 "%sMountAPIVFS: %s\n"
f3e43635 5926 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5927 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5928 "%sRestrictRealtime: %s\n"
f69567cb 5929 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5930 "%sKeyringMode: %s\n"
4e399953
LP
5931 "%sProtectHostname: %s\n"
5932 "%sProtectProc: %s\n"
5933 "%sProcSubset: %s\n",
5cb5a6ff 5934 prefix, c->umask,
14eb3285
LP
5935 prefix, empty_to_root(c->working_directory),
5936 prefix, empty_to_root(c->root_directory),
15ae422b 5937 prefix, yes_no(c->non_blocking),
64747e2d 5938 prefix, yes_no(c->private_tmp),
7f112f50 5939 prefix, yes_no(c->private_devices),
59eeb84b 5940 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5941 prefix, yes_no(c->protect_kernel_modules),
84703040 5942 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5943 prefix, yes_no(c->protect_clock),
59eeb84b 5944 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5945 prefix, yes_no(c->private_network),
5946 prefix, yes_no(c->private_users),
1b8689f9
LP
5947 prefix, protect_home_to_string(c->protect_home),
5948 prefix, protect_system_to_string(c->protect_system),
5e98086d 5949 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5950 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5951 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5952 prefix, yes_no(c->restrict_realtime),
f69567cb 5953 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5954 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5955 prefix, yes_no(c->protect_hostname),
5956 prefix, protect_proc_to_string(c->protect_proc),
5957 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5958
915e6d16
LP
5959 if (c->root_image)
5960 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5961
18d73705 5962 if (c->root_image_options) {
18d73705
LB
5963 fprintf(f, "%sRootImageOptions:", prefix);
5964 LIST_FOREACH(mount_options, o, c->root_image_options)
5965 if (!isempty(o->options))
9ece6444
LB
5966 fprintf(f, " %s:%s",
5967 partition_designator_to_string(o->partition_designator),
5968 o->options);
18d73705
LB
5969 fprintf(f, "\n");
5970 }
5971
0389f4fa
LB
5972 if (c->root_hash) {
5973 _cleanup_free_ char *encoded = NULL;
5974 encoded = hexmem(c->root_hash, c->root_hash_size);
5975 if (encoded)
5976 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5977 }
5978
5979 if (c->root_hash_path)
5980 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5981
d4d55b0d
LB
5982 if (c->root_hash_sig) {
5983 _cleanup_free_ char *encoded = NULL;
5984 ssize_t len;
5985 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5986 if (len)
5987 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5988 }
5989
5990 if (c->root_hash_sig_path)
5991 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5992
0389f4fa
LB
5993 if (c->root_verity)
5994 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5995
8c7be95e
LP
5996 STRV_FOREACH(e, c->environment)
5997 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5998
5999 STRV_FOREACH(e, c->environment_files)
6000 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 6001
b4c14404
FB
6002 STRV_FOREACH(e, c->pass_environment)
6003 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6004
00819cc1
LP
6005 STRV_FOREACH(e, c->unset_environment)
6006 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6007
53f47dfc
YW
6008 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6009
5b10116e 6010 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
6011 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6012
211a3d87
LB
6013 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6014 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6015
6016 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6017 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6018 }
3536f49e 6019 }
c2bbd90b 6020
5291f26d 6021 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 6022
fb33a393 6023 if (c->nice_set)
5291f26d 6024 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 6025
dd6c17b1 6026 if (c->oom_score_adjust_set)
5291f26d 6027 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 6028
ad21e542 6029 if (c->coredump_filter_set)
5291f26d 6030 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 6031
5b10116e 6032 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 6033 if (c->rlimit[i]) {
4c3a2b84 6034 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 6035 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 6036 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
6037 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6038 }
94f04347 6039
f8b69d1d 6040 if (c->ioprio_set) {
1756a011 6041 _cleanup_free_ char *class_str = NULL;
f8b69d1d 6042
5bead76e 6043 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
6044 if (r >= 0)
6045 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6046
5bead76e 6047 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 6048 }
94f04347 6049
f8b69d1d 6050 if (c->cpu_sched_set) {
1756a011 6051 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 6052
837df140
YW
6053 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6054 if (r >= 0)
6055 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6056
94f04347 6057 fprintf(f,
38b48754
LP
6058 "%sCPUSchedulingPriority: %i\n"
6059 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
6060 prefix, c->cpu_sched_priority,
6061 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 6062 }
94f04347 6063
0985c7c4 6064 if (c->cpu_set.set) {
e7fca352
MS
6065 _cleanup_free_ char *affinity = NULL;
6066
6067 affinity = cpu_set_to_range_string(&c->cpu_set);
6068 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
6069 }
6070
b070c7c0
MS
6071 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6072 _cleanup_free_ char *nodes = NULL;
6073
6074 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6075 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6076 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6077 }
6078
3a43da28 6079 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 6080 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
6081
6082 fprintf(f,
80876c20
LP
6083 "%sStandardInput: %s\n"
6084 "%sStandardOutput: %s\n"
6085 "%sStandardError: %s\n",
6086 prefix, exec_input_to_string(c->std_input),
6087 prefix, exec_output_to_string(c->std_output),
6088 prefix, exec_output_to_string(c->std_error));
6089
befc4a80
LP
6090 if (c->std_input == EXEC_INPUT_NAMED_FD)
6091 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6092 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6093 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6094 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6095 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6096
6097 if (c->std_input == EXEC_INPUT_FILE)
6098 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6099 if (c->std_output == EXEC_OUTPUT_FILE)
6100 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
6101 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6102 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
6103 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6104 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
6105 if (c->std_error == EXEC_OUTPUT_FILE)
6106 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
6107 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6108 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
6109 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6110 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 6111
80876c20
LP
6112 if (c->tty_path)
6113 fprintf(f,
6ea832a2
LP
6114 "%sTTYPath: %s\n"
6115 "%sTTYReset: %s\n"
6116 "%sTTYVHangup: %s\n"
51462135
DDM
6117 "%sTTYVTDisallocate: %s\n"
6118 "%sTTYRows: %u\n"
6119 "%sTTYColumns: %u\n",
6ea832a2
LP
6120 prefix, c->tty_path,
6121 prefix, yes_no(c->tty_reset),
6122 prefix, yes_no(c->tty_vhangup),
51462135
DDM
6123 prefix, yes_no(c->tty_vt_disallocate),
6124 prefix, c->tty_rows,
6125 prefix, c->tty_cols);
94f04347 6126
9f6444eb 6127 if (IN_SET(c->std_output,
9f6444eb
LP
6128 EXEC_OUTPUT_KMSG,
6129 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
6130 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6131 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6132 IN_SET(c->std_error,
9f6444eb
LP
6133 EXEC_OUTPUT_KMSG,
6134 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
6135 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6136 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 6137
5ce70e5b 6138 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 6139
837df140
YW
6140 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6141 if (r >= 0)
6142 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 6143
837df140
YW
6144 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6145 if (r >= 0)
6146 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 6147 }
94f04347 6148
d3070fbd
LP
6149 if (c->log_level_max >= 0) {
6150 _cleanup_free_ char *t = NULL;
6151
6152 (void) log_level_to_string_alloc(c->log_level_max, &t);
6153
6154 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6155 }
6156
5291f26d 6157 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
6158 fprintf(f,
6159 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 6160 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 6161
5ac1530e
ZJS
6162 if (c->log_ratelimit_burst > 0)
6163 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 6164
523ea123
QD
6165 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6166 fprintf(f, "%sLogFilterPatterns:", prefix);
6167
6168 char *pattern;
6169 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6170 fprintf(f, " %s", pattern);
6171 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6172 fprintf(f, " ~%s", pattern);
6173 fputc('\n', f);
6174 }
6175
5b10116e
ZJS
6176 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6177 fprintf(f, "%sLogExtraFields: ", prefix);
6178 fwrite(c->log_extra_fields[j].iov_base,
6179 1, c->log_extra_fields[j].iov_len,
6180 f);
6181 fputc('\n', f);
d3070fbd
LP
6182 }
6183
91dd5f7c
LP
6184 if (c->log_namespace)
6185 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6186
07d46372
YW
6187 if (c->secure_bits) {
6188 _cleanup_free_ char *str = NULL;
6189
6190 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6191 if (r >= 0)
6192 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6193 }
94f04347 6194
a103496c 6195 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 6196 _cleanup_free_ char *str = NULL;
94f04347 6197
8142d735 6198 r = capability_set_to_string(c->capability_bounding_set, &str);
dd1f5bd0
YW
6199 if (r >= 0)
6200 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6201 }
6202
6203 if (c->capability_ambient_set != 0) {
dd1f5bd0 6204 _cleanup_free_ char *str = NULL;
755d4b67 6205
8142d735 6206 r = capability_set_to_string(c->capability_ambient_set, &str);
dd1f5bd0
YW
6207 if (r >= 0)
6208 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6209 }
6210
6211 if (c->user)
f2d3769a 6212 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6213 if (c->group)
f2d3769a 6214 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6215
29206d46
LP
6216 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6217
ddc155b2 6218 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6219
5b6319dc 6220 if (c->pam_name)
f2d3769a 6221 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6222
ddc155b2
TM
6223 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6224 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6225 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6226 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6227 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6228 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6229
5b10116e
ZJS
6230 for (size_t i = 0; i < c->n_bind_mounts; i++)
6231 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6232 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6233 c->bind_mounts[i].ignore_enoent ? "-": "",
6234 c->bind_mounts[i].source,
6235 c->bind_mounts[i].destination,
6236 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6237
5b10116e
ZJS
6238 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6239 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6240
5b10116e
ZJS
6241 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6242 t->path,
6243 isempty(t->options) ? "" : ":",
6244 strempty(t->options));
6245 }
2abd4e38 6246
169c1bda
LP
6247 if (c->utmp_id)
6248 fprintf(f,
6249 "%sUtmpIdentifier: %s\n",
6250 prefix, c->utmp_id);
7b52a628
MS
6251
6252 if (c->selinux_context)
6253 fprintf(f,
5f8640fb
LP
6254 "%sSELinuxContext: %s%s\n",
6255 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6256
80c21aea
WC
6257 if (c->apparmor_profile)
6258 fprintf(f,
6259 "%sAppArmorProfile: %s%s\n",
6260 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6261
6262 if (c->smack_process_label)
6263 fprintf(f,
6264 "%sSmackProcessLabel: %s%s\n",
6265 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6266
050f7277 6267 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6268 fprintf(f,
6269 "%sPersonality: %s\n",
6270 prefix, strna(personality_to_string(c->personality)));
6271
78e864e5
TM
6272 fprintf(f,
6273 "%sLockPersonality: %s\n",
6274 prefix, yes_no(c->lock_personality));
6275
17df7223 6276 if (c->syscall_filter) {
17df7223 6277 fprintf(f,
57183d11 6278 "%sSystemCallFilter: ",
17df7223
LP
6279 prefix);
6280
6b000af4 6281 if (!c->syscall_allow_list)
17df7223
LP
6282 fputc('~', f);
6283
349cc4a5 6284#if HAVE_SECCOMP
d5a99b7c
JJ
6285 void *id, *val;
6286 bool first = true;
90e74a66 6287 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6288 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6289 const char *errno_name = NULL;
6290 int num = PTR_TO_INT(val);
17df7223
LP
6291
6292 if (first)
6293 first = false;
6294 else
6295 fputc(' ', f);
6296
57183d11 6297 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6298 fputs(strna(name), f);
8cfa775f
YW
6299
6300 if (num >= 0) {
005bfaf1 6301 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6302 if (errno_name)
6303 fprintf(f, ":%s", errno_name);
6304 else
6305 fprintf(f, ":%d", num);
6306 }
17df7223 6307 }
351a19b1 6308#endif
17df7223
LP
6309
6310 fputc('\n', f);
6311 }
6312
57183d11 6313 if (c->syscall_archs) {
57183d11
LP
6314 fprintf(f,
6315 "%sSystemCallArchitectures:",
6316 prefix);
6317
349cc4a5 6318#if HAVE_SECCOMP
d5a99b7c 6319 void *id;
90e74a66 6320 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6321 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6322#endif
6323 fputc('\n', f);
6324 }
6325
add00535
LP
6326 if (exec_context_restrict_namespaces_set(c)) {
6327 _cleanup_free_ char *s = NULL;
6328
86c2a9f1 6329 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6330 if (r >= 0)
6331 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6332 prefix, strna(s));
add00535
LP
6333 }
6334
b1994387 6335#if HAVE_LIBBPF
8fe84dc8
YW
6336 if (exec_context_restrict_filesystems_set(c)) {
6337 char *fs;
6338 SET_FOREACH(fs, c->restrict_filesystems)
6339 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6340 }
b1994387
ILG
6341#endif
6342
a8d08f39
LP
6343 if (c->network_namespace_path)
6344 fprintf(f,
6345 "%sNetworkNamespacePath: %s\n",
6346 prefix, c->network_namespace_path);
6347
3df90f24 6348 if (c->syscall_errno > 0) {
3df90f24
YW
6349 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6350
005bfaf1 6351#if HAVE_SECCOMP
d5a99b7c 6352 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6353 if (errno_name)
005bfaf1 6354 fputs(errno_name, f);
3df90f24 6355 else
005bfaf1
TM
6356 fprintf(f, "%d", c->syscall_errno);
6357#endif
6358 fputc('\n', f);
3df90f24 6359 }
b3d13314 6360
5b10116e 6361 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6362 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6363 c->mount_images[i].ignore_enoent ? "-": "",
6364 c->mount_images[i].source,
79e20ceb 6365 c->mount_images[i].destination);
427353f6 6366 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6367 fprintf(f, ":%s:%s",
427353f6 6368 partition_designator_to_string(o->partition_designator),
79e20ceb 6369 strempty(o->options));
427353f6
LB
6370 fprintf(f, "\n");
6371 }
93f59701
LB
6372
6373 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6374 fprintf(f, "%sExtensionImages: %s%s", prefix,
6375 c->extension_images[i].ignore_enoent ? "-": "",
6376 c->extension_images[i].source);
6377 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6378 fprintf(f, ":%s:%s",
6379 partition_designator_to_string(o->partition_designator),
6380 strempty(o->options));
6381 fprintf(f, "\n");
6382 }
a07b9926
LB
6383
6384 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6385}
6386
34cf6c43 6387bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6388 assert(c);
6389
61233823 6390 /* Returns true if the process forked off would run under
a931ad47
LP
6391 * an unchanged UID or as root. */
6392
6393 if (!c->user)
6394 return true;
6395
6396 if (streq(c->user, "root") || streq(c->user, "0"))
6397 return true;
6398
6399 return false;
6400}
6401
34cf6c43 6402int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6403 int p;
6404
6405 assert(c);
6406
6407 if (c->ioprio_set)
6408 return c->ioprio;
6409
6410 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6411 if (p < 0)
0692548c 6412 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6413
8b330d7d 6414 return ioprio_normalize(p);
7f452159
LP
6415}
6416
5e98086d
ZJS
6417bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6418 assert(c);
6419
61198784 6420 /* Explicit setting wins */
5e98086d
ZJS
6421 if (c->mount_apivfs_set)
6422 return c->mount_apivfs;
6423
61198784 6424 /* Default to "yes" if root directory or image are specified */
74e12520 6425 if (exec_context_with_rootfs(c))
61198784
ZJS
6426 return true;
6427
5e98086d
ZJS
6428 return false;
6429}
6430
d3070fbd 6431void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6432 assert(c);
6433
5b10116e 6434 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6435 free(c->log_extra_fields[l].iov_base);
6436 c->log_extra_fields = mfree(c->log_extra_fields);
6437 c->n_log_extra_fields = 0;
6438}
6439
6f765baf 6440void exec_context_revert_tty(ExecContext *c) {
254d1313 6441 _cleanup_close_ int fd = -EBADF;
0ba976e8
LP
6442 const char *path;
6443 struct stat st;
6f765baf
LP
6444 int r;
6445
6446 assert(c);
6447
6448 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6449 exec_context_tty_reset(c, NULL);
6450
6451 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6452 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6453 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6454 if (!exec_context_may_touch_tty(c))
6455 return;
6f765baf 6456
0ba976e8
LP
6457 path = exec_context_tty_path(c);
6458 if (!path)
6459 return;
6f765baf 6460
0ba976e8
LP
6461 fd = open(path, O_PATH|O_CLOEXEC);
6462 if (fd < 0)
6463 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6464 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6465 path);
6466
6467 if (fstat(fd, &st) < 0)
6468 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6469
6470 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6471 * if things are a character device, since a proper check either means we'd have to open the TTY and
6472 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6473 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6474 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6475 if (!S_ISCHR(st.st_mode))
6476 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6477
6478 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6479 if (r < 0)
6480 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6481}
6482
4c2f5842
LP
6483int exec_context_get_clean_directories(
6484 ExecContext *c,
6485 char **prefix,
6486 ExecCleanMask mask,
6487 char ***ret) {
6488
6489 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6490 int r;
6491
6492 assert(c);
6493 assert(prefix);
6494 assert(ret);
6495
5b10116e 6496 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6497 if (!FLAGS_SET(mask, 1U << t))
6498 continue;
6499
6500 if (!prefix[t])
6501 continue;
6502
211a3d87 6503 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6504 char *j;
6505
211a3d87 6506 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6507 if (!j)
6508 return -ENOMEM;
6509
6510 r = strv_consume(&l, j);
6511 if (r < 0)
6512 return r;
7f622a19
YW
6513
6514 /* Also remove private directories unconditionally. */
6515 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6516 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6517 if (!j)
6518 return -ENOMEM;
6519
6520 r = strv_consume(&l, j);
6521 if (r < 0)
6522 return r;
6523 }
6524
211a3d87
LB
6525 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6526 j = path_join(prefix[t], *symlink);
7f622a19
YW
6527 if (!j)
6528 return -ENOMEM;
6529
6530 r = strv_consume(&l, j);
6531 if (r < 0)
6532 return r;
6533 }
4c2f5842
LP
6534 }
6535 }
6536
6537 *ret = TAKE_PTR(l);
6538 return 0;
6539}
6540
6541int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6542 ExecCleanMask mask = 0;
6543
6544 assert(c);
6545 assert(ret);
6546
6547 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6548 if (c->directories[t].n_items > 0)
4c2f5842
LP
6549 mask |= 1U << t;
6550
6551 *ret = mask;
6552 return 0;
6553}
6554
b58b4116 6555void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6556 assert(s);
5cb5a6ff 6557
2ed26ed0
LP
6558 *s = (ExecStatus) {
6559 .pid = pid,
6560 };
6561
b58b4116
LP
6562 dual_timestamp_get(&s->start_timestamp);
6563}
6564
34cf6c43 6565void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6566 assert(s);
6567
d46b79bb 6568 if (s->pid != pid)
2ed26ed0
LP
6569 *s = (ExecStatus) {
6570 .pid = pid,
6571 };
b58b4116 6572
63983207 6573 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6574
034c6ed7
LP
6575 s->code = code;
6576 s->status = status;
169c1bda 6577
6f765baf
LP
6578 if (context && context->utmp_id)
6579 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6580}
6581
6a1d4d9f
LP
6582void exec_status_reset(ExecStatus *s) {
6583 assert(s);
6584
6585 *s = (ExecStatus) {};
6586}
6587
34cf6c43 6588void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6589 assert(s);
6590 assert(f);
6591
9fb86720
LP
6592 if (s->pid <= 0)
6593 return;
6594
4c940960
LP
6595 prefix = strempty(prefix);
6596
9fb86720 6597 fprintf(f,
ccd06097
ZJS
6598 "%sPID: "PID_FMT"\n",
6599 prefix, s->pid);
9fb86720 6600
af9d16e1 6601 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6602 fprintf(f,
6603 "%sStart Timestamp: %s\n",
04f5c018 6604 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6605
af9d16e1 6606 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6607 fprintf(f,
6608 "%sExit Timestamp: %s\n"
6609 "%sExit Code: %s\n"
6610 "%sExit Status: %i\n",
04f5c018 6611 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6612 prefix, sigchld_code_to_string(s->code),
6613 prefix, s->status);
5cb5a6ff 6614}
44d8db9e 6615
34cf6c43 6616static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6617 _cleanup_free_ char *cmd = NULL;
4c940960 6618 const char *prefix2;
44d8db9e
LP
6619
6620 assert(c);
6621 assert(f);
6622
4c940960 6623 prefix = strempty(prefix);
63c372cb 6624 prefix2 = strjoina(prefix, "\t");
44d8db9e 6625
4ef15008 6626 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
38553034 6627
44d8db9e
LP
6628 fprintf(f,
6629 "%sCommand Line: %s\n",
38553034 6630 prefix, strnull(cmd));
44d8db9e 6631
9fb86720 6632 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6633}
6634
6635void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6636 assert(f);
6637
4c940960 6638 prefix = strempty(prefix);
44d8db9e 6639
03677889
YW
6640 LIST_FOREACH(command, i, c)
6641 exec_command_dump(i, f, prefix);
44d8db9e 6642}
94f04347 6643
a6a80b4f
LP
6644void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6645 ExecCommand *end;
6646
6647 assert(l);
6648 assert(e);
6649
6650 if (*l) {
35b8ca3a 6651 /* It's kind of important, that we keep the order here */
cc232fa0 6652 end = LIST_FIND_TAIL(command, *l);
71fda00f 6653 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6654 } else
6655 *l = e;
6656}
6657
26fd040d
LP
6658int exec_command_set(ExecCommand *c, const char *path, ...) {
6659 va_list ap;
6660 char **l, *p;
6661
6662 assert(c);
6663 assert(path);
6664
6665 va_start(ap, path);
6666 l = strv_new_ap(path, ap);
6667 va_end(ap);
6668
6669 if (!l)
6670 return -ENOMEM;
6671
250a918d
LP
6672 p = strdup(path);
6673 if (!p) {
26fd040d
LP
6674 strv_free(l);
6675 return -ENOMEM;
6676 }
6677
6897dfe8 6678 free_and_replace(c->path, p);
26fd040d 6679
130d3d22 6680 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6681}
6682
86b23b07 6683int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6684 _cleanup_strv_free_ char **l = NULL;
86b23b07 6685 va_list ap;
86b23b07
JS
6686 int r;
6687
6688 assert(c);
6689 assert(path);
6690
6691 va_start(ap, path);
6692 l = strv_new_ap(path, ap);
6693 va_end(ap);
6694
6695 if (!l)
6696 return -ENOMEM;
6697
e287086b 6698 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6699 if (r < 0)
86b23b07 6700 return r;
86b23b07
JS
6701
6702 return 0;
6703}
6704
e8a565cb
YW
6705static void *remove_tmpdir_thread(void *p) {
6706 _cleanup_free_ char *path = p;
86b23b07 6707
e8a565cb
YW
6708 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6709 return NULL;
6710}
6711
6712static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6713 int r;
6714
6715 if (!rt)
6716 return NULL;
6717
6718 if (rt->manager)
6719 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6720
6721 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6722
6723 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6724 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6725
6726 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6727 if (r < 0)
e8a565cb 6728 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6729 else
6730 rt->tmp_dir = NULL;
e8a565cb 6731 }
613b411c 6732
56a13a49 6733 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6734 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6735
6736 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6737 if (r < 0)
e8a565cb 6738 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6739 else
6740 rt->var_tmp_dir = NULL;
e8a565cb
YW
6741 }
6742
6743 rt->id = mfree(rt->id);
6744 rt->tmp_dir = mfree(rt->tmp_dir);
6745 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6746 safe_close_pair(rt->netns_storage_socket);
a70581ff 6747 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6748 return mfree(rt);
6749}
6750
6751static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6752 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6753}
6754
56a13a49
ZJS
6755static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6756 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6757 ExecRuntime *n;
613b411c 6758
8e8009dc 6759 assert(ret);
613b411c 6760
56a13a49
ZJS
6761 id_copy = strdup(id);
6762 if (!id_copy)
6763 return -ENOMEM;
6764
8e8009dc
LP
6765 n = new(ExecRuntime, 1);
6766 if (!n)
613b411c
LP
6767 return -ENOMEM;
6768
8e8009dc 6769 *n = (ExecRuntime) {
56a13a49 6770 .id = TAKE_PTR(id_copy),
19ee48a6
YW
6771 .netns_storage_socket = PIPE_EBADF,
6772 .ipcns_storage_socket = PIPE_EBADF,
8e8009dc
LP
6773 };
6774
6775 *ret = n;
613b411c
LP
6776 return 0;
6777}
6778
e8a565cb
YW
6779static int exec_runtime_add(
6780 Manager *m,
6781 const char *id,
56a13a49
ZJS
6782 char **tmp_dir,
6783 char **var_tmp_dir,
6784 int netns_storage_socket[2],
a70581ff 6785 int ipcns_storage_socket[2],
e8a565cb
YW
6786 ExecRuntime **ret) {
6787
6788 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6789 int r;
6790
e8a565cb 6791 assert(m);
613b411c
LP
6792 assert(id);
6793
a70581ff 6794 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6795
56a13a49 6796 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6797 if (r < 0)
6798 return r;
6799
63083706 6800 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6801 if (r < 0)
6802 return r;
e8a565cb 6803
56a13a49
ZJS
6804 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6805 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6806 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6807
6808 if (netns_storage_socket) {
56a13a49
ZJS
6809 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6810 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6811 }
6812
a70581ff
XR
6813 if (ipcns_storage_socket) {
6814 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6815 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6816 }
6817
e8a565cb
YW
6818 rt->manager = m;
6819
6820 if (ret)
6821 *ret = rt;
e8a565cb 6822 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6823 TAKE_PTR(rt);
e8a565cb
YW
6824 return 0;
6825}
6826
74aaf59b
LP
6827static int exec_runtime_make(
6828 Manager *m,
6829 const ExecContext *c,
6830 const char *id,
6831 ExecRuntime **ret) {
6832
56a13a49 6833 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
19ee48a6 6834 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
e8a565cb
YW
6835 int r;
6836
6837 assert(m);
6838 assert(c);
6839 assert(id);
6840
6841 /* It is not necessary to create ExecRuntime object. */
a70581ff 6842 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6843 *ret = NULL;
e8a565cb 6844 return 0;
74aaf59b 6845 }
e8a565cb 6846
efa2f3a1
TM
6847 if (c->private_tmp &&
6848 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6849 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6850 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6851 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6852 if (r < 0)
6853 return r;
6854 }
6855
a8d08f39 6856 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6857 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6858 return -errno;
6859 }
6860
a70581ff
XR
6861 if (c->private_ipc || c->ipc_namespace_path) {
6862 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6863 return -errno;
6864 }
6865
6866 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6867 if (r < 0)
6868 return r;
6869
613b411c
LP
6870 return 1;
6871}
6872
e8a565cb
YW
6873int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6874 ExecRuntime *rt;
6875 int r;
613b411c 6876
e8a565cb
YW
6877 assert(m);
6878 assert(id);
6879 assert(ret);
6880
6881 rt = hashmap_get(m->exec_runtime_by_id, id);
6882 if (rt)
387f6955 6883 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6884 goto ref;
6885
74aaf59b
LP
6886 if (!create) {
6887 *ret = NULL;
e8a565cb 6888 return 0;
74aaf59b 6889 }
e8a565cb
YW
6890
6891 /* If not found, then create a new object. */
6892 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6893 if (r < 0)
e8a565cb 6894 return r;
74aaf59b
LP
6895 if (r == 0) {
6896 /* When r == 0, it is not necessary to create ExecRuntime object. */
6897 *ret = NULL;
6898 return 0;
6899 }
613b411c 6900
e8a565cb
YW
6901ref:
6902 /* increment reference counter. */
6903 rt->n_ref++;
6904 *ret = rt;
6905 return 1;
6906}
613b411c 6907
e8a565cb
YW
6908ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6909 if (!rt)
613b411c
LP
6910 return NULL;
6911
e8a565cb 6912 assert(rt->n_ref > 0);
613b411c 6913
e8a565cb
YW
6914 rt->n_ref--;
6915 if (rt->n_ref > 0)
f2341e0a
LP
6916 return NULL;
6917
e8a565cb 6918 return exec_runtime_free(rt, destroy);
613b411c
LP
6919}
6920
e8a565cb
YW
6921int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6922 ExecRuntime *rt;
e8a565cb
YW
6923
6924 assert(m);
613b411c
LP
6925 assert(f);
6926 assert(fds);
6927
90e74a66 6928 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6929 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6930
e8a565cb
YW
6931 if (rt->tmp_dir)
6932 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6933
e8a565cb
YW
6934 if (rt->var_tmp_dir)
6935 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6936
e8a565cb
YW
6937 if (rt->netns_storage_socket[0] >= 0) {
6938 int copy;
613b411c 6939
e8a565cb
YW
6940 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6941 if (copy < 0)
6942 return copy;
613b411c 6943
e8a565cb
YW
6944 fprintf(f, " netns-socket-0=%i", copy);
6945 }
613b411c 6946
e8a565cb
YW
6947 if (rt->netns_storage_socket[1] >= 0) {
6948 int copy;
613b411c 6949
e8a565cb
YW
6950 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6951 if (copy < 0)
6952 return copy;
613b411c 6953
e8a565cb
YW
6954 fprintf(f, " netns-socket-1=%i", copy);
6955 }
6956
a70581ff
XR
6957 if (rt->ipcns_storage_socket[0] >= 0) {
6958 int copy;
6959
6960 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6961 if (copy < 0)
6962 return copy;
6963
6964 fprintf(f, " ipcns-socket-0=%i", copy);
6965 }
6966
6967 if (rt->ipcns_storage_socket[1] >= 0) {
6968 int copy;
6969
6970 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6971 if (copy < 0)
6972 return copy;
6973
6974 fprintf(f, " ipcns-socket-1=%i", copy);
6975 }
6976
e8a565cb 6977 fputc('\n', f);
613b411c
LP
6978 }
6979
6980 return 0;
6981}
6982
e8a565cb
YW
6983int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6984 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6985 ExecRuntime *rt;
613b411c
LP
6986 int r;
6987
e8a565cb
YW
6988 /* This is for the migration from old (v237 or earlier) deserialization text.
6989 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6990 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6991 * so or not from the serialized text, then we always creates a new object owned by this. */
6992
6993 assert(u);
613b411c
LP
6994 assert(key);
6995 assert(value);
6996
e8a565cb
YW
6997 /* Manager manages ExecRuntime objects by the unit id.
6998 * So, we omit the serialized text when the unit does not have id (yet?)... */
6999 if (isempty(u->id)) {
7000 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7001 return 0;
7002 }
613b411c 7003
cbc165d1
ZJS
7004 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
7005 return log_oom();
e8a565cb
YW
7006
7007 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
7008 if (!rt) {
cbc165d1 7009 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 7010 return log_oom();
613b411c 7011
e8a565cb
YW
7012 rt = rt_create;
7013 }
7014
7015 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
7016 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7017 return -ENOMEM;
613b411c
LP
7018
7019 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
7020 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7021 return -ENOMEM;
613b411c
LP
7022
7023 } else if (streq(key, "netns-socket-0")) {
7024 int fd;
7025
e8a565cb 7026 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 7027 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 7028 return 0;
613b411c 7029 }
e8a565cb
YW
7030
7031 safe_close(rt->netns_storage_socket[0]);
7032 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7033
613b411c
LP
7034 } else if (streq(key, "netns-socket-1")) {
7035 int fd;
7036
e8a565cb 7037 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 7038 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 7039 return 0;
613b411c 7040 }
e8a565cb
YW
7041
7042 safe_close(rt->netns_storage_socket[1]);
7043 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 7044
613b411c
LP
7045 } else
7046 return 0;
7047
e8a565cb
YW
7048 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
7049 if (rt_create) {
7050 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
7051 if (r < 0) {
3fe91079 7052 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
7053 return 0;
7054 }
613b411c 7055
e8a565cb 7056 rt_create->manager = u->manager;
613b411c 7057
e8a565cb 7058 /* Avoid cleanup */
56a13a49 7059 TAKE_PTR(rt_create);
e8a565cb 7060 }
98b47d54 7061
e8a565cb
YW
7062 return 1;
7063}
613b411c 7064
56a13a49
ZJS
7065int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7066 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7067 char *id = NULL;
a70581ff 7068 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 7069 const char *p, *v = ASSERT_PTR(value);
e8a565cb 7070 size_t n;
613b411c 7071
e8a565cb 7072 assert(m);
e8a565cb 7073 assert(fds);
98b47d54 7074
e8a565cb 7075 n = strcspn(v, " ");
2f82562b 7076 id = strndupa_safe(v, n);
e8a565cb
YW
7077 if (v[n] != ' ')
7078 goto finalize;
7079 p = v + n + 1;
7080
7081 v = startswith(p, "tmp-dir=");
7082 if (v) {
7083 n = strcspn(v, " ");
56a13a49
ZJS
7084 tmp_dir = strndup(v, n);
7085 if (!tmp_dir)
7086 return log_oom();
e8a565cb
YW
7087 if (v[n] != ' ')
7088 goto finalize;
7089 p = v + n + 1;
7090 }
7091
7092 v = startswith(p, "var-tmp-dir=");
7093 if (v) {
7094 n = strcspn(v, " ");
56a13a49
ZJS
7095 var_tmp_dir = strndup(v, n);
7096 if (!var_tmp_dir)
7097 return log_oom();
e8a565cb
YW
7098 if (v[n] != ' ')
7099 goto finalize;
7100 p = v + n + 1;
7101 }
7102
7103 v = startswith(p, "netns-socket-0=");
7104 if (v) {
7105 char *buf;
7106
7107 n = strcspn(v, " ");
2f82562b 7108 buf = strndupa_safe(v, n);
c413bb28 7109
a70581ff 7110 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
7111 if (r < 0)
7112 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 7113 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 7114 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
7115 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7116 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
7117 if (v[n] != ' ')
7118 goto finalize;
7119 p = v + n + 1;
613b411c
LP
7120 }
7121
e8a565cb
YW
7122 v = startswith(p, "netns-socket-1=");
7123 if (v) {
7124 char *buf;
98b47d54 7125
e8a565cb 7126 n = strcspn(v, " ");
2f82562b 7127 buf = strndupa_safe(v, n);
a70581ff
XR
7128
7129 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
7130 if (r < 0)
7131 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
7132 if (!fdset_contains(fds, netns_fdpair[1]))
7133 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7134 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7135 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7136 if (v[n] != ' ')
7137 goto finalize;
7138 p = v + n + 1;
7139 }
7140
7141 v = startswith(p, "ipcns-socket-0=");
7142 if (v) {
7143 char *buf;
7144
7145 n = strcspn(v, " ");
2f82562b 7146 buf = strndupa_safe(v, n);
a70581ff
XR
7147
7148 r = safe_atoi(buf, &ipcns_fdpair[0]);
7149 if (r < 0)
7150 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7151 if (!fdset_contains(fds, ipcns_fdpair[0]))
7152 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7153 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7154 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7155 if (v[n] != ' ')
7156 goto finalize;
7157 p = v + n + 1;
7158 }
7159
7160 v = startswith(p, "ipcns-socket-1=");
7161 if (v) {
7162 char *buf;
7163
7164 n = strcspn(v, " ");
2f82562b 7165 buf = strndupa_safe(v, n);
a70581ff
XR
7166
7167 r = safe_atoi(buf, &ipcns_fdpair[1]);
7168 if (r < 0)
7169 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7170 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 7171 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
7172 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7173 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 7174 }
98b47d54 7175
e8a565cb 7176finalize:
a70581ff 7177 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 7178 if (r < 0)
56a13a49
ZJS
7179 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7180 return 0;
e8a565cb 7181}
613b411c 7182
e8a565cb
YW
7183void exec_runtime_vacuum(Manager *m) {
7184 ExecRuntime *rt;
e8a565cb
YW
7185
7186 assert(m);
7187
7188 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7189
90e74a66 7190 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
7191 if (rt->n_ref > 0)
7192 continue;
7193
7194 (void) exec_runtime_free(rt, false);
7195 }
613b411c
LP
7196}
7197
b9c04eaf
YW
7198void exec_params_clear(ExecParameters *p) {
7199 if (!p)
7200 return;
7201
c3f8a065
LP
7202 p->environment = strv_free(p->environment);
7203 p->fd_names = strv_free(p->fd_names);
7204 p->fds = mfree(p->fds);
7205 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7206}
7207
bb0c0d6f
LP
7208ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7209 if (!sc)
7210 return NULL;
7211
7212 free(sc->id);
7213 free(sc->data);
7214 return mfree(sc);
7215}
7216
43144be4
LP
7217ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7218 if (!lc)
7219 return NULL;
7220
7221 free(lc->id);
7222 free(lc->path);
7223 return mfree(lc);
7224}
7225
211a3d87
LB
7226void exec_directory_done(ExecDirectory *d) {
7227 if (!d)
7228 return;
7229
7230 for (size_t i = 0; i < d->n_items; i++) {
7231 free(d->items[i].path);
7232 strv_free(d->items[i].symlinks);
7233 }
7234
7235 d->items = mfree(d->items);
7236 d->n_items = 0;
7237 d->mode = 0755;
7238}
7239
564e5c98
YW
7240static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7241 assert(d);
7242 assert(path);
7243
7244 for (size_t i = 0; i < d->n_items; i++)
7245 if (path_equal(d->items[i].path, path))
7246 return &d->items[i];
7247
7248 return NULL;
7249}
7250
7251int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
211a3d87
LB
7252 _cleanup_strv_free_ char **s = NULL;
7253 _cleanup_free_ char *p = NULL;
564e5c98
YW
7254 ExecDirectoryItem *existing;
7255 int r;
211a3d87
LB
7256
7257 assert(d);
211a3d87
LB
7258 assert(path);
7259
564e5c98
YW
7260 existing = exec_directory_find(d, path);
7261 if (existing) {
7262 r = strv_extend(&existing->symlinks, symlink);
7263 if (r < 0)
7264 return r;
7265
7266 return 0; /* existing item is updated */
7267 }
7268
211a3d87
LB
7269 p = strdup(path);
7270 if (!p)
7271 return -ENOMEM;
7272
564e5c98
YW
7273 if (symlink) {
7274 s = strv_new(symlink);
211a3d87
LB
7275 if (!s)
7276 return -ENOMEM;
7277 }
7278
564e5c98 7279 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
211a3d87
LB
7280 return -ENOMEM;
7281
564e5c98 7282 d->items[d->n_items++] = (ExecDirectoryItem) {
211a3d87
LB
7283 .path = TAKE_PTR(p),
7284 .symlinks = TAKE_PTR(s),
7285 };
7286
564e5c98 7287 return 1; /* new item is added */
211a3d87
LB
7288}
7289
a2ab603c
YW
7290static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7291 assert(a);
7292 assert(b);
7293
7294 return path_compare(a->path, b->path);
7295}
7296
7297void exec_directory_sort(ExecDirectory *d) {
7298 assert(d);
7299
7300 /* Sort the exec directories to make always parent directories processed at first in
7301 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7302 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7303 * list. See also comments in setup_exec_directory() and issue #24783. */
7304
7305 if (d->n_items <= 1)
7306 return;
7307
7308 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7309
7310 for (size_t i = 1; i < d->n_items; i++)
7311 for (size_t j = 0; j < i; j++)
7312 if (path_startswith(d->items[i].path, d->items[j].path)) {
7313 d->items[i].only_create = true;
7314 break;
7315 }
211a3d87
LB
7316}
7317
bb0c0d6f 7318DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 7319DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 7320
80876c20
LP
7321static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7322 [EXEC_INPUT_NULL] = "null",
7323 [EXEC_INPUT_TTY] = "tty",
7324 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7325 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7326 [EXEC_INPUT_SOCKET] = "socket",
7327 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7328 [EXEC_INPUT_DATA] = "data",
2038c3f5 7329 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7330};
7331
8a0867d6
LP
7332DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7333
94f04347 7334static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7335 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7336 [EXEC_OUTPUT_NULL] = "null",
80876c20 7337 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7338 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7339 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7340 [EXEC_OUTPUT_JOURNAL] = "journal",
7341 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7342 [EXEC_OUTPUT_SOCKET] = "socket",
7343 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7344 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7345 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7346 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7347};
7348
7349DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7350
7351static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7352 [EXEC_UTMP_INIT] = "init",
7353 [EXEC_UTMP_LOGIN] = "login",
7354 [EXEC_UTMP_USER] = "user",
7355};
7356
7357DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7358
7359static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7360 [EXEC_PRESERVE_NO] = "no",
7361 [EXEC_PRESERVE_YES] = "yes",
7362 [EXEC_PRESERVE_RESTART] = "restart",
7363};
7364
7365DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7366
6b7b2ed9 7367/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7368static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7369 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7370 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7371 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7372 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7373 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7374};
7375
7376DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7377
211a3d87
LB
7378/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7379static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7380 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7381 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7382 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7383 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7384 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7385};
7386
7387DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7388
6b7b2ed9
LP
7389/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7390 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7391 * directories, specifically .timer units with their timestamp touch file. */
7392static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7393 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7394 [EXEC_DIRECTORY_STATE] = "state",
7395 [EXEC_DIRECTORY_CACHE] = "cache",
7396 [EXEC_DIRECTORY_LOGS] = "logs",
7397 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7398};
7399
7400DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7401
7402/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7403 * the service payload in. */
fb2042dd
YW
7404static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7405 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7406 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7407 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7408 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7409 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7410};
7411
7412DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7413
b1edf445
LP
7414static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7415 [EXEC_KEYRING_INHERIT] = "inherit",
7416 [EXEC_KEYRING_PRIVATE] = "private",
7417 [EXEC_KEYRING_SHARED] = "shared",
7418};
7419
7420DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);