]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #26563 from dtardon/fd-init
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
ee617a4e 42#include "argv-util.h"
8dd4c05b
LP
43#include "async.h"
44#include "barrier.h"
b1994387 45#include "bpf-lsm.h"
8dd4c05b 46#include "cap-list.h"
430f0182 47#include "capability-util.h"
fdb3deca 48#include "cgroup-setup.h"
f4351959 49#include "chase-symlinks.h"
bb0c0d6f 50#include "chown-recursive.h"
28db6fbf 51#include "constants.h"
da681e1b 52#include "cpu-set-util.h"
43144be4 53#include "creds-util.h"
6a818c3c 54#include "data-fd-util.h"
686d13b9 55#include "env-file.h"
4d1a6904 56#include "env-util.h"
17df7223 57#include "errno-list.h"
8a62620e 58#include "escape.h"
3ffd4af2 59#include "execute.h"
8dd4c05b 60#include "exit-status.h"
3ffd4af2 61#include "fd-util.h"
bb0c0d6f 62#include "fileio.h"
f97b34a6 63#include "format-util.h"
7d50b32a 64#include "glob-util.h"
0389f4fa 65#include "hexdecoct.h"
c004493c 66#include "io-util.h"
032b3afb 67#include "ioprio-util.h"
a1164ae3 68#include "label.h"
8dd4c05b
LP
69#include "log.h"
70#include "macro.h"
e8a565cb 71#include "manager.h"
2a341bb9 72#include "manager-dump.h"
0a970718 73#include "memory-util.h"
f5947a5e 74#include "missing_fs.h"
5bead76e 75#include "missing_ioprio.h"
35cd0ba5 76#include "mkdir-label.h"
21935150 77#include "mount-util.h"
bb0c0d6f 78#include "mountpoint-util.h"
8dd4c05b 79#include "namespace.h"
6bedfcbb 80#include "parse-util.h"
8dd4c05b 81#include "path-util.h"
0b452006 82#include "process-util.h"
d3dcf4e3 83#include "random-util.h"
3989bdc1 84#include "recurse-dir.h"
78f22b97 85#include "rlimit-util.h"
8dd4c05b 86#include "rm-rf.h"
349cc4a5 87#if HAVE_SECCOMP
3ffd4af2
LP
88#include "seccomp-util.h"
89#endif
07d46372 90#include "securebits-util.h"
8dd4c05b 91#include "selinux-util.h"
24882e06 92#include "signal-util.h"
8dd4c05b 93#include "smack-util.h"
57b7a260 94#include "socket-util.h"
a2ab603c 95#include "sort-util.h"
fd63e712 96#include "special.h"
949befd3 97#include "stat-util.h"
8b43440b 98#include "string-table.h"
07630cea 99#include "string-util.h"
8dd4c05b 100#include "strv.h"
7ccbd1ae 101#include "syslog-util.h"
8dd4c05b 102#include "terminal-util.h"
bb0c0d6f 103#include "tmpfile-util.h"
566b7d23 104#include "umask-util.h"
2d3b784d 105#include "unit-serialize.h"
b1d4f8e1 106#include "user-util.h"
8dd4c05b 107#include "utmp-wtmp.h"
5cb5a6ff 108
e056b01d 109#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 110#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 111
531dca78
LP
112#define SNDBUF_SIZE (8*1024*1024)
113
/* Moves the given file descriptors so that fds[i] ends up at number i + 3. The array is rewritten in
 * place with the new descriptor numbers. Returns 0 on success, or a negative errno if duplicating a
 * descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Keep making passes until one completes without a descriptor landing on the wrong number. */
        while (first >= 0) {
                int retry = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits at its slot. */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free number >= target. If the slot we wanted was
                         * occupied, remember the first such index and retry from there. */
                        if (retry < 0 && dup_fd != target)
                                retry = idx;
                }

                first = retry;
        }

        return 0;
}
153
cd48e23f
RP
/* Adjusts the flags of the fds that are passed on to the child: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is dropped from all n_fds entries.
 * Returns 0 on success or the first negative error from the flag helpers. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int r;

                /* O_NONBLOCK is only touched for the leading socket activation fds. */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be inherited by our children, hence FD_CLOEXEC always goes
                 * away. */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
189
1e22b5cd 190static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
191 assert(context);
192
1e22b5cd
LP
193 if (context->stdio_as_fds)
194 return NULL;
195
80876c20
LP
196 if (context->tty_path)
197 return context->tty_path;
198
199 return "/dev/console";
200}
201
1e22b5cd
LP
202static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
203 const char *path;
204
6ea832a2
LP
205 assert(context);
206
1e22b5cd 207 path = exec_context_tty_path(context);
6ea832a2 208
1e22b5cd
LP
209 if (context->tty_vhangup) {
210 if (p && p->stdin_fd >= 0)
211 (void) terminal_vhangup_fd(p->stdin_fd);
212 else if (path)
213 (void) terminal_vhangup(path);
214 }
6ea832a2 215
1e22b5cd
LP
216 if (context->tty_reset) {
217 if (p && p->stdin_fd >= 0)
218 (void) reset_terminal_fd(p->stdin_fd, true);
219 else if (path)
220 (void) reset_terminal(path);
221 }
222
51462135
DDM
223 if (p && p->stdin_fd >= 0)
224 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
225
1e22b5cd
LP
226 if (context->tty_vt_disallocate && path)
227 (void) vt_disallocate(path);
6ea832a2
LP
228}
229
6af760f3
LP
230static bool is_terminal_input(ExecInput i) {
231 return IN_SET(i,
232 EXEC_INPUT_TTY,
233 EXEC_INPUT_TTY_FORCE,
234 EXEC_INPUT_TTY_FAIL);
235}
236
3a1286b6 237static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
238 return IN_SET(o,
239 EXEC_OUTPUT_TTY,
6af760f3
LP
240 EXEC_OUTPUT_KMSG_AND_CONSOLE,
241 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
242}
243
aac8c0c3
LP
244static bool is_kmsg_output(ExecOutput o) {
245 return IN_SET(o,
246 EXEC_OUTPUT_KMSG,
247 EXEC_OUTPUT_KMSG_AND_CONSOLE);
248}
249
6af760f3
LP
250static bool exec_context_needs_term(const ExecContext *c) {
251 assert(c);
252
253 /* Return true if the execution context suggests we should set $TERM to something useful. */
254
255 if (is_terminal_input(c->std_input))
256 return true;
257
258 if (is_terminal_output(c->std_output))
259 return true;
260
261 if (is_terminal_output(c->std_error))
262 return true;
263
264 return !!c->tty_path;
3a1286b6
MS
265}
266
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the result to fd number
 * 'nfd'. Returns the result of move_fd(), or a negative errno if the open fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags | O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
278
91dd5f7c
LP
279static int connect_journal_socket(
280 int fd,
281 const char *log_namespace,
282 uid_t uid,
283 gid_t gid) {
284
524daa8c
ZJS
285 uid_t olduid = UID_INVALID;
286 gid_t oldgid = GID_INVALID;
91dd5f7c 287 const char *j;
524daa8c
ZJS
288 int r;
289
91dd5f7c
LP
290 j = log_namespace ?
291 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
292 "/run/systemd/journal/stdout";
91dd5f7c 293
cad93f29 294 if (gid_is_valid(gid)) {
524daa8c
ZJS
295 oldgid = getgid();
296
92a17af9 297 if (setegid(gid) < 0)
524daa8c
ZJS
298 return -errno;
299 }
300
cad93f29 301 if (uid_is_valid(uid)) {
524daa8c
ZJS
302 olduid = getuid();
303
92a17af9 304 if (seteuid(uid) < 0) {
524daa8c
ZJS
305 r = -errno;
306 goto restore_gid;
307 }
308 }
309
1861986a 310 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 311
1861986a
LP
312 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
313 an LSM interferes. */
524daa8c 314
cad93f29 315 if (uid_is_valid(uid))
524daa8c
ZJS
316 (void) seteuid(olduid);
317
318 restore_gid:
cad93f29 319 if (gid_is_valid(gid))
524daa8c
ZJS
320 (void) setegid(oldgid);
321
322 return r;
323}
324
fd1f9c89 325static int connect_logger_as(
34cf6c43 326 const Unit *unit,
fd1f9c89 327 const ExecContext *context,
af635cf3 328 const ExecParameters *params,
fd1f9c89
LP
329 ExecOutput output,
330 const char *ident,
fd1f9c89
LP
331 int nfd,
332 uid_t uid,
333 gid_t gid) {
334
254d1313 335 _cleanup_close_ int fd = -EBADF;
2ac1ff68 336 int r;
071830ff
LP
337
338 assert(context);
af635cf3 339 assert(params);
80876c20
LP
340 assert(output < _EXEC_OUTPUT_MAX);
341 assert(ident);
342 assert(nfd >= 0);
071830ff 343
54fe0cdb
LP
344 fd = socket(AF_UNIX, SOCK_STREAM, 0);
345 if (fd < 0)
80876c20 346 return -errno;
071830ff 347
91dd5f7c 348 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
349 if (r < 0)
350 return r;
071830ff 351
2ac1ff68 352 if (shutdown(fd, SHUT_RD) < 0)
80876c20 353 return -errno;
071830ff 354
fd1f9c89 355 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 356
2ac1ff68 357 if (dprintf(fd,
62bca2c6 358 "%s\n"
80876c20
LP
359 "%s\n"
360 "%i\n"
54fe0cdb
LP
361 "%i\n"
362 "%i\n"
363 "%i\n"
4f4a1dbf 364 "%i\n",
c867611e 365 context->syslog_identifier ?: ident,
af635cf3 366 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
367 context->syslog_priority,
368 !!context->syslog_level_prefix,
f3dc6af2 369 false,
aac8c0c3 370 is_kmsg_output(output),
2ac1ff68
EV
371 is_terminal_output(output)) < 0)
372 return -errno;
80876c20 373
2ac1ff68 374 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 375}
2ac1ff68 376
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to fd number 'nfd'.
 * Returns the result of move_fd(), or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 389
2038c3f5 390static int acquire_path(const char *path, int flags, mode_t mode) {
254d1313 391 _cleanup_close_ int fd = -EBADF;
86fca584 392 int r;
071830ff 393
80876c20 394 assert(path);
071830ff 395
2038c3f5
LP
396 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
397 flags |= O_CREAT;
398
399 fd = open(path, flags|O_NOCTTY, mode);
400 if (fd >= 0)
15a3e96f 401 return TAKE_FD(fd);
071830ff 402
2038c3f5
LP
403 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
404 return -errno;
2038c3f5
LP
405
406 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
407
408 fd = socket(AF_UNIX, SOCK_STREAM, 0);
409 if (fd < 0)
410 return -errno;
411
1861986a
LP
412 r = connect_unix_path(fd, AT_FDCWD, path);
413 if (IN_SET(r, -ENOTSOCK, -EINVAL))
414 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
415 * wasn't an AF_UNIX socket after all */
416 return -ENXIO;
417 if (r < 0)
418 return r;
071830ff 419
2038c3f5
LP
420 if ((flags & O_ACCMODE) == O_RDONLY)
421 r = shutdown(fd, SHUT_WR);
422 else if ((flags & O_ACCMODE) == O_WRONLY)
423 r = shutdown(fd, SHUT_RD);
424 else
86fca584 425 r = 0;
15a3e96f 426 if (r < 0)
2038c3f5 427 return -errno;
2038c3f5 428
15a3e96f 429 return TAKE_FD(fd);
80876c20 430}
071830ff 431
08f3be7a
LP
432static int fixup_input(
433 const ExecContext *context,
434 int socket_fd,
435 bool apply_tty_stdin) {
436
437 ExecInput std_input;
438
439 assert(context);
440
441 std_input = context->std_input;
1e3ad081
LP
442
443 if (is_terminal_input(std_input) && !apply_tty_stdin)
444 return EXEC_INPUT_NULL;
071830ff 445
03fd9c49 446 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
447 return EXEC_INPUT_NULL;
448
08f3be7a
LP
449 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
450 return EXEC_INPUT_NULL;
451
03fd9c49 452 return std_input;
4f2d528d
LP
453}
454
7966a916 455static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 456
7966a916 457 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
458 return EXEC_OUTPUT_INHERIT;
459
7966a916 460 return output;
4f2d528d
LP
461}
462
a34ceba6
LP
463static int setup_input(
464 const ExecContext *context,
465 const ExecParameters *params,
52c239d7 466 int socket_fd,
2caa38e9 467 const int named_iofds[static 3]) {
a34ceba6 468
4f2d528d 469 ExecInput i;
51462135 470 int r;
4f2d528d
LP
471
472 assert(context);
a34ceba6 473 assert(params);
2caa38e9 474 assert(named_iofds);
a34ceba6
LP
475
476 if (params->stdin_fd >= 0) {
477 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
478 return -errno;
479
480 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
481 if (isatty(STDIN_FILENO)) {
482 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
483 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 484 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 485 }
a34ceba6
LP
486
487 return STDIN_FILENO;
488 }
4f2d528d 489
08f3be7a 490 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
491
492 switch (i) {
071830ff 493
80876c20
LP
494 case EXEC_INPUT_NULL:
495 return open_null_as(O_RDONLY, STDIN_FILENO);
496
497 case EXEC_INPUT_TTY:
498 case EXEC_INPUT_TTY_FORCE:
499 case EXEC_INPUT_TTY_FAIL: {
046a82c1 500 int fd;
071830ff 501
1e22b5cd 502 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
503 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
504 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
505 ACQUIRE_TERMINAL_WAIT,
3a43da28 506 USEC_INFINITY);
970edce6 507 if (fd < 0)
80876c20
LP
508 return fd;
509
51462135
DDM
510 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
511 if (r < 0)
512 return r;
513
046a82c1 514 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
515 }
516
4f2d528d 517 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
518 assert(socket_fd >= 0);
519
7c248223 520 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 521
52c239d7 522 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
523 assert(named_iofds[STDIN_FILENO] >= 0);
524
52c239d7 525 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 526 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 527
08f3be7a
LP
528 case EXEC_INPUT_DATA: {
529 int fd;
530
531 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
532 if (fd < 0)
533 return fd;
534
535 return move_fd(fd, STDIN_FILENO, false);
536 }
537
2038c3f5
LP
538 case EXEC_INPUT_FILE: {
539 bool rw;
540 int fd;
541
542 assert(context->stdio_file[STDIN_FILENO]);
543
544 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
545 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
546
547 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
548 if (fd < 0)
549 return fd;
550
551 return move_fd(fd, STDIN_FILENO, false);
552 }
553
80876c20 554 default:
04499a70 555 assert_not_reached();
80876c20
LP
556 }
557}
558
41fc585a
LP
559static bool can_inherit_stderr_from_stdout(
560 const ExecContext *context,
561 ExecOutput o,
562 ExecOutput e) {
563
564 assert(context);
565
566 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
567 * stderr fd */
568
569 if (e == EXEC_OUTPUT_INHERIT)
570 return true;
571 if (e != o)
572 return false;
573
574 if (e == EXEC_OUTPUT_NAMED_FD)
575 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
576
8d7dab1f 577 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
578 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
579
580 return true;
581}
582
a34ceba6 583static int setup_output(
34cf6c43 584 const Unit *unit,
a34ceba6
LP
585 const ExecContext *context,
586 const ExecParameters *params,
587 int fileno,
588 int socket_fd,
2caa38e9 589 const int named_iofds[static 3],
a34ceba6 590 const char *ident,
7bce046b
LP
591 uid_t uid,
592 gid_t gid,
593 dev_t *journal_stream_dev,
594 ino_t *journal_stream_ino) {
a34ceba6 595
4f2d528d
LP
596 ExecOutput o;
597 ExecInput i;
47c1d80d 598 int r;
4f2d528d 599
f2341e0a 600 assert(unit);
80876c20 601 assert(context);
a34ceba6 602 assert(params);
80876c20 603 assert(ident);
7bce046b
LP
604 assert(journal_stream_dev);
605 assert(journal_stream_ino);
80876c20 606
a34ceba6
LP
607 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
608
609 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
610 return -errno;
611
612 return STDOUT_FILENO;
613 }
614
615 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
616 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
617 return -errno;
618
619 return STDERR_FILENO;
620 }
621
08f3be7a 622 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 623 o = fixup_output(context->std_output, socket_fd);
4f2d528d 624
eb17e935
MS
625 if (fileno == STDERR_FILENO) {
626 ExecOutput e;
627 e = fixup_output(context->std_error, socket_fd);
80876c20 628
eb17e935
MS
629 /* This expects the input and output are already set up */
630
631 /* Don't change the stderr file descriptor if we inherit all
632 * the way and are not on a tty */
633 if (e == EXEC_OUTPUT_INHERIT &&
634 o == EXEC_OUTPUT_INHERIT &&
635 i == EXEC_INPUT_NULL &&
636 !is_terminal_input(context->std_input) &&
7966a916 637 getppid() != 1)
eb17e935
MS
638 return fileno;
639
640 /* Duplicate from stdout if possible */
41fc585a 641 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 642 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 643
eb17e935 644 o = e;
80876c20 645
eb17e935 646 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
647 /* If input got downgraded, inherit the original value */
648 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 649 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 650
08f3be7a
LP
651 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
652 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 653 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 654
acb591e4
LP
655 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
656 if (getppid() != 1)
eb17e935 657 return fileno;
94f04347 658
eb17e935
MS
659 /* We need to open /dev/null here anew, to get the right access mode. */
660 return open_null_as(O_WRONLY, fileno);
071830ff 661 }
94f04347 662
eb17e935 663 switch (o) {
80876c20
LP
664
665 case EXEC_OUTPUT_NULL:
eb17e935 666 return open_null_as(O_WRONLY, fileno);
80876c20
LP
667
668 case EXEC_OUTPUT_TTY:
4f2d528d 669 if (is_terminal_input(i))
7c248223 670 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
671
672 /* We don't reset the terminal if this is just about output */
1e22b5cd 673 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 674
9a6bca7a 675 case EXEC_OUTPUT_KMSG:
28dbc1e8 676 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
677 case EXEC_OUTPUT_JOURNAL:
678 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 679 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 680 if (r < 0) {
7966a916
ZJS
681 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
682 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 683 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
684 } else {
685 struct stat st;
686
687 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
688 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
689 * services to detect whether they are connected to the journal or not.
690 *
691 * If both stdout and stderr are connected to a stream then let's make sure to store the data
692 * about STDERR as that's usually the best way to do logging. */
7bce046b 693
ab2116b1
LP
694 if (fstat(fileno, &st) >= 0 &&
695 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
696 *journal_stream_dev = st.st_dev;
697 *journal_stream_ino = st.st_ino;
698 }
47c1d80d
MS
699 }
700 return r;
4f2d528d
LP
701
702 case EXEC_OUTPUT_SOCKET:
703 assert(socket_fd >= 0);
e75a9ed1 704
7c248223 705 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 706
52c239d7 707 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
708 assert(named_iofds[fileno] >= 0);
709
52c239d7 710 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 711 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 712
566b7d23 713 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
714 case EXEC_OUTPUT_FILE_APPEND:
715 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 716 bool rw;
566b7d23 717 int fd, flags;
2038c3f5
LP
718
719 assert(context->stdio_file[fileno]);
720
721 rw = context->std_input == EXEC_INPUT_FILE &&
722 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
723
724 if (rw)
7c248223 725 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 726
566b7d23
ZD
727 flags = O_WRONLY;
728 if (o == EXEC_OUTPUT_FILE_APPEND)
729 flags |= O_APPEND;
8d7dab1f
LW
730 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
731 flags |= O_TRUNC;
566b7d23
ZD
732
733 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
734 if (fd < 0)
735 return fd;
736
566b7d23 737 return move_fd(fd, fileno, 0);
2038c3f5
LP
738 }
739
94f04347 740 default:
04499a70 741 assert_not_reached();
94f04347 742 }
071830ff
LP
743}
744
02a51aba 745static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 746 int r;
02a51aba
LP
747
748 assert(fd >= 0);
02a51aba 749
1ff74fb6 750 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
751 if (isatty(fd) < 1) {
752 if (IN_SET(errno, EINVAL, ENOTTY))
753 return 0; /* not a tty */
1ff74fb6 754
02a51aba 755 return -errno;
4b3b5bc7 756 }
02a51aba 757
4b3b5bc7 758 /* This might fail. What matters are the results. */
f2df231f 759 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
760 if (r < 0)
761 return r;
02a51aba 762
4b3b5bc7 763 return 1;
02a51aba
LP
764}
765
aedec452 766static int setup_confirm_stdio(
51462135 767 const ExecContext *context,
aedec452
LP
768 const char *vc,
769 int *ret_saved_stdin,
770 int *ret_saved_stdout) {
771
254d1313 772 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
3d18b167 773 int r;
80876c20 774
aedec452
LP
775 assert(ret_saved_stdin);
776 assert(ret_saved_stdout);
80876c20 777
af6da548
LP
778 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
779 if (saved_stdin < 0)
780 return -errno;
80876c20 781
af6da548 782 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
783 if (saved_stdout < 0)
784 return -errno;
80876c20 785
8854d795 786 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
787 if (fd < 0)
788 return fd;
80876c20 789
af6da548
LP
790 r = chown_terminal(fd, getuid());
791 if (r < 0)
3d18b167 792 return r;
02a51aba 793
3d18b167
LP
794 r = reset_terminal_fd(fd, true);
795 if (r < 0)
796 return r;
80876c20 797
51462135
DDM
798 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
799 if (r < 0)
800 return r;
801
aedec452
LP
802 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
803 TAKE_FD(fd);
2b33ab09
LP
804 if (r < 0)
805 return r;
80876c20 806
aedec452
LP
807 *ret_saved_stdin = TAKE_FD(saved_stdin);
808 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 809 return 0;
80876c20
LP
810}
811
63d77c92 812static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
813 assert(err < 0);
814
815 if (err == -ETIMEDOUT)
63d77c92 816 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
817 else {
818 errno = -err;
63d77c92 819 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
820 }
821}
822
63d77c92 823static void write_confirm_error(int err, const char *vc, const Unit *u) {
254d1313 824 _cleanup_close_ int fd = -EBADF;
80876c20 825
3b20f877 826 assert(vc);
80876c20 827
7d5ceb64 828 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 829 if (fd < 0)
3b20f877 830 return;
80876c20 831
63d77c92 832 write_confirm_error_fd(err, fd, u);
af6da548 833}
80876c20 834
/* Releases the terminal and puts the saved stdin/stdout copies back in place, then closes the copies
 * and invalidates the caller's variables. Returns 0 on success or the negative errno of the last
 * dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
856
3b20f877
FB
/* Possible outcomes of asking the user for confirmation before spawning a command. */
enum {
        CONFIRM_PRETEND_FAILURE = -1,  /* do not execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0,  /* do not execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1,  /* go ahead and execute the command */
};
862
51462135 863static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 864 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 865 _cleanup_free_ char *e = NULL;
3b20f877 866 char c;
af6da548 867
3b20f877 868 /* For any internal errors, assume a positive response. */
51462135 869 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 870 if (r < 0) {
63d77c92 871 write_confirm_error(r, vc, u);
3b20f877
FB
872 return CONFIRM_EXECUTE;
873 }
af6da548 874
b0eb2944
FB
875 /* confirm_spawn might have been disabled while we were sleeping. */
876 if (manager_is_confirm_spawn_disabled(u->manager)) {
877 r = 1;
878 goto restore_stdio;
879 }
af6da548 880
2bcd3c26
FB
881 e = ellipsize(cmdline, 60, 100);
882 if (!e) {
883 log_oom();
884 r = CONFIRM_EXECUTE;
885 goto restore_stdio;
886 }
af6da548 887
d172b175 888 for (;;) {
539622bd 889 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 890 if (r < 0) {
63d77c92 891 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
892 r = CONFIRM_EXECUTE;
893 goto restore_stdio;
894 }
af6da548 895
d172b175 896 switch (c) {
b0eb2944
FB
897 case 'c':
898 printf("Resuming normal execution.\n");
899 manager_disable_confirm_spawn();
900 r = 1;
901 break;
dd6f9ac0
FB
902 case 'D':
903 unit_dump(u, stdout, " ");
904 continue; /* ask again */
d172b175
FB
905 case 'f':
906 printf("Failing execution.\n");
907 r = CONFIRM_PRETEND_FAILURE;
908 break;
909 case 'h':
b0eb2944
FB
910 printf(" c - continue, proceed without asking anymore\n"
911 " D - dump, show the state of the unit\n"
dd6f9ac0 912 " f - fail, don't execute the command and pretend it failed\n"
d172b175 913 " h - help\n"
eedf223a 914 " i - info, show a short summary of the unit\n"
56fde33a 915 " j - jobs, show jobs that are in progress\n"
d172b175
FB
916 " s - skip, don't execute the command and pretend it succeeded\n"
917 " y - yes, execute the command\n");
dd6f9ac0 918 continue; /* ask again */
eedf223a
FB
919 case 'i':
920 printf(" Description: %s\n"
921 " Unit: %s\n"
922 " Command: %s\n",
923 u->id, u->description, cmdline);
924 continue; /* ask again */
56fde33a 925 case 'j':
d1d8786c 926 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
56fde33a 927 continue; /* ask again */
539622bd
FB
928 case 'n':
929 /* 'n' was removed in favor of 'f'. */
930 printf("Didn't understand 'n', did you mean 'f'?\n");
931 continue; /* ask again */
d172b175
FB
932 case 's':
933 printf("Skipping execution.\n");
934 r = CONFIRM_PRETEND_SUCCESS;
935 break;
936 case 'y':
937 r = CONFIRM_EXECUTE;
938 break;
939 default:
04499a70 940 assert_not_reached();
d172b175 941 }
3b20f877 942 break;
3b20f877 943 }
af6da548 944
3b20f877 945restore_stdio:
af6da548 946 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 947 return r;
80876c20
LP
948}
949
4d885bd3
DH
950static int get_fixed_user(const ExecContext *c, const char **user,
951 uid_t *uid, gid_t *gid,
952 const char **home, const char **shell) {
81a2b7ce 953 int r;
4d885bd3 954 const char *name;
81a2b7ce 955
4d885bd3 956 assert(c);
81a2b7ce 957
23deef88
LP
958 if (!c->user)
959 return 0;
960
4d885bd3
DH
961 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
962 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 963
23deef88 964 name = c->user;
fafff8f1 965 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
966 if (r < 0)
967 return r;
81a2b7ce 968
4d885bd3
DH
969 *user = name;
970 return 0;
971}
972
973static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
974 int r;
975 const char *name;
976
977 assert(c);
978
979 if (!c->group)
980 return 0;
981
982 name = c->group;
fafff8f1 983 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
984 if (r < 0)
985 return r;
986
987 *group = name;
988 return 0;
989}
990
cdc5d5c5
DH
991static int get_supplementary_groups(const ExecContext *c, const char *user,
992 const char *group, gid_t gid,
993 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
994 int r, k = 0;
995 int ngroups_max;
996 bool keep_groups = false;
997 gid_t *groups = NULL;
998 _cleanup_free_ gid_t *l_gids = NULL;
999
1000 assert(c);
1001
bbeea271
DH
1002 /*
1003 * If user is given, then lookup GID and supplementary groups list.
1004 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1005 * here and as early as possible so we keep the list of supplementary
1006 * groups of the caller.
bbeea271
DH
1007 */
1008 if (user && gid_is_valid(gid) && gid != 0) {
1009 /* First step, initialize groups from /etc/groups */
1010 if (initgroups(user, gid) < 0)
1011 return -errno;
1012
1013 keep_groups = true;
1014 }
1015
ac6e8be6 1016 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1017 return 0;
1018
366ddd25
DH
1019 /*
1020 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1021 * be positive, otherwise fail.
1022 */
1023 errno = 0;
1024 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1025 if (ngroups_max <= 0)
1026 return errno_or_else(EOPNOTSUPP);
366ddd25 1027
4d885bd3
DH
1028 l_gids = new(gid_t, ngroups_max);
1029 if (!l_gids)
1030 return -ENOMEM;
81a2b7ce 1031
4d885bd3
DH
1032 if (keep_groups) {
1033 /*
1034 * Lookup the list of groups that the user belongs to, we
1035 * avoid NSS lookups here too for gid=0.
1036 */
1037 k = ngroups_max;
1038 if (getgrouplist(user, gid, l_gids, &k) < 0)
1039 return -EINVAL;
1040 } else
1041 k = 0;
81a2b7ce 1042
4d885bd3
DH
1043 STRV_FOREACH(i, c->supplementary_groups) {
1044 const char *g;
81a2b7ce 1045
4d885bd3
DH
1046 if (k >= ngroups_max)
1047 return -E2BIG;
81a2b7ce 1048
4d885bd3 1049 g = *i;
fafff8f1 1050 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1051 if (r < 0)
1052 return r;
81a2b7ce 1053
4d885bd3
DH
1054 k++;
1055 }
81a2b7ce 1056
4d885bd3
DH
1057 /*
1058 * Sets ngids to zero to drop all supplementary groups, happens
1059 * when we are under root and SupplementaryGroups= is empty.
1060 */
1061 if (k == 0) {
1062 *ngids = 0;
1063 return 0;
1064 }
81a2b7ce 1065
4d885bd3
DH
1066 /* Otherwise get the final list of supplementary groups */
1067 groups = memdup(l_gids, sizeof(gid_t) * k);
1068 if (!groups)
1069 return -ENOMEM;
1070
1071 *supplementary_gids = groups;
1072 *ngids = k;
1073
1074 groups = NULL;
1075
1076 return 0;
1077}
1078
/* Applies the supplementary group list (only if ngids > 0) via maybe_setgroups() and afterwards, if 'gid'
 * is valid, sets the real, effective and saved GID to it. Returns 0 on success, negative errno-style value
 * on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= handling, skipped when the list is empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Nothing more to do if no GID was requested */
        if (!gid_is_valid(gid))
                return 0;

        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1097
a954b249
LP
/* Updates the securebits of the calling process: the bits in 'mask' are cleared, the bits in 'bits' are
 * set. Returns 0 if the resulting value equals the current one (nothing is written then), 1 if new
 * securebits were installed, and a negative errno if either prctl() call fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        int cur = prctl(PR_GET_SECUREBITS);
        if (cur < 0)
                return -errno;

        unsigned wanted = (((unsigned) cur) & ~mask) | bits;

        /* Avoid the write if it would be a no-op */
        if (wanted == (unsigned) cur)
                return 0;

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1116
81a2b7ce 1117static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1118 assert(context);
dbdc4098 1119 int r;
81a2b7ce 1120
4d885bd3
DH
1121 if (!uid_is_valid(uid))
1122 return 0;
1123
a954b249
LP
1124 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1125 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1126 * case. */
81a2b7ce 1127
a954b249 1128 if ((context->capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
81a2b7ce 1129
a954b249
LP
1130 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1131 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1132 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1133 if (r < 0)
1134 return r;
81a2b7ce
LP
1135 }
1136
479050b3 1137 /* Second step: actually set the uids */
81a2b7ce
LP
1138 if (setresuid(uid, uid, uid) < 0)
1139 return -errno;
1140
a954b249
LP
1141 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1142 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1143 * outside of this call. */
81a2b7ce
LP
1144 return 0;
1145}
1146
349cc4a5 1147#if HAVE_PAM
5b6319dc
LP
1148
1149static int null_conv(
1150 int num_msg,
1151 const struct pam_message **msg,
1152 struct pam_response **resp,
1153 void *appdata_ptr) {
1154
1155 /* We don't support conversations */
1156
1157 return PAM_CONV_ERR;
1158}
1159
cefc33ae
LP
1160#endif
1161
5b6319dc
LP
/* Opens a PAM session for service 'name' and user 'user' and forks off a "(sd-pam)" helper process that
 * closes the session again later on.
 *
 * 'tty' may be NULL, in which case the TTY name is derived from stdin if possible. The variables in *env
 * are passed to PAM; on success *env is replaced by the environment list PAM returns. 'fds'/'n_fds' are
 * file descriptors that get closed in the helper process. 'uid'/'gid' are the credentials the helper
 * process drops to.
 *
 * Returns a negative errno-style value on failure (-EPERM for PAM level errors). If built without PAM
 * support this does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **pam_env = NULL;
        pam_handle_t *pamh = NULL;
        sigset_t saved_mask;
        bool session_opened = false;
        int pam_code = PAM_SUCCESS, pam_flags = 0, r;
        pid_t pam_pid = 0, parent_pid;

        assert(name);
        assert(user);
        assert(env);

        /* PAM is set up in this process, then a child is forked off. The child sticks around until it is
         * sent SIGTERM, either through PR_SET_PDEATHSIG when this process goes away or by systemd's
         * cgroup kill logic, and tears the PAM session down again. This process goes on to exec() the
         * actual daemon, so that the daemon's main PID is the one that was originally fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is on */
        if (log_get_max_level() < LOG_DEBUG)
                pam_flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &pamh);
        if (pam_code != PAM_SUCCESS) {
                pamh = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *tty_name = NULL;

                /* No TTY was specified explicitly, but stdin might still be one. If so, use its name. */
                if (getttyname_malloc(STDIN_FILENO, &tty_name) >= 0)
                        tty = strjoina("/dev/", tty_name);
        }

        if (tty) {
                pam_code = pam_set_item(pamh, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Make our environment visible to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(pamh, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal */
        pam_code = pam_setcred(pamh, PAM_ESTABLISH_CRED | pam_flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(pamh, pam_code));

        pam_code = pam_open_session(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_opened = true;

        pam_env = pam_getenvlist(pamh);
        if (!pam_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM before forking, so that it cannot get lost in the child */
        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, exit_code = EXIT_PAM;

                /* Child: our only job is to end the PAM session once we are told to terminate */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Don't keep the passed fds open in here. The assumption is that apart from those only
                 * fds opened by PAM are open at this point. */
                (void) close_many(fds, n_fds);

                /* Drop privileges: none are needed for pam_close_session(), and doing so makes
                 * PR_SET_PDEATHSIG work in most cases. Failures are only logged, but then sd-pam is not
                 * expected to exit normally. */
                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Ask for SIGTERM when the parent dies. This only helps if the setresuid() above worked,
                 * as otherwise the kernel won't let an unprivileged parent signal its privileged child
                 * like this. The control group kill logic has to cover the remaining cases. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Let the parent know we are set up, in particular that privileges are dropped, so that
                 * unit setup does not race against the setresuid() above.
                 *
                 * A parent that has aborted is noticed below, hence the result is ignored here. */
                (void) barrier_place(&barrier);

                /* Only wait if the parent is still the process that forked us */
                if (getppid() == parent_pid) {
                        sigset_t wait_set;
                        int k;

                        assert_se(sigemptyset(&wait_set) >= 0);
                        assert_se(sigaddset(&wait_set, SIGTERM) >= 0);

                        /* Retry when interrupted, give up on any other failure */
                        do
                                k = sigwait(&wait_set, &sig);
                        while (k < 0 && errno == EINTR);

                        if (k < 0)
                                goto child_finish;

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(pamh, PAM_DELETE_CRED | pam_flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* Close the session only if the parent is gone */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(pamh, pam_flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                exit_code = 0;

        child_finish:
                /* NB: pam_end() in a child process is supposed to be called with PAM_DATA_SILENT so that
                 * modules know about it, see pam_end(3) */
                (void) pam_end(pamh, pam_code | pam_flags | PAM_DATA_SILENT);
                _exit(exit_code);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* The child was forked off successfully and owns all cleanup now, hence drop our reference to
         * the handle. */
        pamh = NULL;

        /* Restore the signal mask, which unblocks SIGTERM in the parent again */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* PAM modules might have opened the log, and we don't want that fd to stay around, hence close
         * it explicitly. */
        closelog();

        /* Wait synchronously for the child to finish its initialization. There is nothing we could do
         * about a failure here, but let's at least complain loudly. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_code == PAM_SUCCESS)
                log_error_errno(r, "PAM failed: %m");
        else {
                log_error("PAM failed: %s", pam_strerror(pamh, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        }

        if (pamh) {
                if (session_opened)
                        pam_code = pam_close_session(pamh, pam_flags);

                (void) pam_end(pamh, pam_code | pam_flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1378
5d6b1584 1379static void rename_process_from_path(const char *path) {
a99626c1 1380 _cleanup_free_ char *buf = NULL;
5d6b1584 1381 const char *p;
5d6b1584 1382
a99626c1
LP
1383 assert(path);
1384
1385 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1386 * /bin/ps */
5d6b1584 1387
a99626c1 1388 if (path_extract_filename(path, &buf) < 0) {
5d6b1584
LP
1389 rename_process("(...)");
1390 return;
1391 }
1392
a99626c1 1393 size_t l = strlen(buf);
5d6b1584 1394 if (l > 8) {
a99626c1 1395 /* The end of the process name is usually more interesting, since the first bit might just be
5d6b1584 1396 * "systemd-" */
a99626c1 1397 p = buf + l - 8;
5d6b1584 1398 l = 8;
a99626c1
LP
1399 } else
1400 p = buf;
5d6b1584 1401
a99626c1 1402 char process_name[11];
5d6b1584
LP
1403 process_name[0] = '(';
1404 memcpy(process_name+1, p, l);
1405 process_name[1+l] = ')';
1406 process_name[1+l+1] = 0;
1407
1408 rename_process(process_name);
1409}
1410
469830d1
LP
1411static bool context_has_address_families(const ExecContext *c) {
1412 assert(c);
1413
6b000af4 1414 return c->address_families_allow_list ||
469830d1
LP
1415 !set_isempty(c->address_families);
1416}
1417
1418static bool context_has_syscall_filters(const ExecContext *c) {
1419 assert(c);
1420
6b000af4 1421 return c->syscall_allow_list ||
8cfa775f 1422 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1423}
1424
9df2cdd8
TM
1425static bool context_has_syscall_logs(const ExecContext *c) {
1426 assert(c);
1427
1428 return c->syscall_log_allow_list ||
1429 !hashmap_isempty(c->syscall_log);
1430}
1431
469830d1
LP
1432static bool context_has_no_new_privileges(const ExecContext *c) {
1433 assert(c);
1434
1435 if (c->no_new_privileges)
1436 return true;
1437
26c45a6c 1438 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
469830d1
LP
1439 return false;
1440
1441 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1442 return c->lock_personality ||
469830d1 1443 c->memory_deny_write_execute ||
0538d2a8 1444 c->private_devices ||
fc64760d 1445 c->protect_clock ||
0538d2a8 1446 c->protect_hostname ||
469830d1
LP
1447 c->protect_kernel_tunables ||
1448 c->protect_kernel_modules ||
84703040 1449 c->protect_kernel_logs ||
0538d2a8
YW
1450 context_has_address_families(c) ||
1451 exec_context_restrict_namespaces_set(c) ||
1452 c->restrict_realtime ||
1453 c->restrict_suid_sgid ||
78e864e5 1454 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1455 context_has_syscall_filters(c) ||
1456 context_has_syscall_logs(c);
469830d1
LP
1457}
1458
bb0c0d6f
LP
1459static bool exec_context_has_credentials(const ExecContext *context) {
1460
1461 assert(context);
1462
1463 return !hashmap_isempty(context->set_credentials) ||
43144be4 1464 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1465}
1466
349cc4a5 1467#if HAVE_SECCOMP
17df7223 1468
83f12b27 1469static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1470
1471 if (is_seccomp_available())
1472 return false;
1473
f673b62d 1474 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1475 return true;
83f12b27
FS
1476}
1477
165a31c0 1478static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1479 uint32_t negative_action, default_action, action;
165a31c0 1480 int r;
8351ceae 1481
469830d1 1482 assert(u);
c0467cf3 1483 assert(c);
8351ceae 1484
469830d1 1485 if (!context_has_syscall_filters(c))
83f12b27
FS
1486 return 0;
1487
469830d1
LP
1488 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1489 return 0;
e9642be2 1490
005bfaf1 1491 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1492
6b000af4 1493 if (c->syscall_allow_list) {
469830d1
LP
1494 default_action = negative_action;
1495 action = SCMP_ACT_ALLOW;
7c66bae2 1496 } else {
469830d1
LP
1497 default_action = SCMP_ACT_ALLOW;
1498 action = negative_action;
57183d11 1499 }
8351ceae 1500
165a31c0 1501 if (needs_ambient_hack) {
6b000af4 1502 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1503 if (r < 0)
1504 return r;
1505 }
1506
b54f36c6 1507 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1508}
1509
9df2cdd8
TM
1510static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1511#ifdef SCMP_ACT_LOG
1512 uint32_t default_action, action;
1513#endif
1514
1515 assert(u);
1516 assert(c);
1517
1518 if (!context_has_syscall_logs(c))
1519 return 0;
1520
1521#ifdef SCMP_ACT_LOG
1522 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1523 return 0;
1524
1525 if (c->syscall_log_allow_list) {
1526 /* Log nothing but the ones listed */
1527 default_action = SCMP_ACT_ALLOW;
1528 action = SCMP_ACT_LOG;
1529 } else {
1530 /* Log everything but the ones listed */
1531 default_action = SCMP_ACT_LOG;
1532 action = SCMP_ACT_ALLOW;
1533 }
1534
1535 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1536#else
1537 /* old libseccomp */
1538 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1539 return 0;
1540#endif
1541}
1542
469830d1
LP
1543static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1544 assert(u);
4298d0b5
LP
1545 assert(c);
1546
469830d1 1547 if (set_isempty(c->syscall_archs))
83f12b27
FS
1548 return 0;
1549
469830d1
LP
1550 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1551 return 0;
4298d0b5 1552
469830d1
LP
1553 return seccomp_restrict_archs(c->syscall_archs);
1554}
4298d0b5 1555
469830d1
LP
1556static int apply_address_families(const Unit* u, const ExecContext *c) {
1557 assert(u);
1558 assert(c);
4298d0b5 1559
469830d1
LP
1560 if (!context_has_address_families(c))
1561 return 0;
4298d0b5 1562
469830d1
LP
1563 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1564 return 0;
4298d0b5 1565
6b000af4 1566 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1567}
4298d0b5 1568
83f12b27 1569static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1570 assert(u);
f3e43635
TM
1571 assert(c);
1572
469830d1 1573 if (!c->memory_deny_write_execute)
83f12b27
FS
1574 return 0;
1575
469830d1
LP
1576 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1577 return 0;
f3e43635 1578
469830d1 1579 return seccomp_memory_deny_write_execute();
f3e43635
TM
1580}
1581
83f12b27 1582static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1583 assert(u);
f4170c67
LP
1584 assert(c);
1585
469830d1 1586 if (!c->restrict_realtime)
83f12b27
FS
1587 return 0;
1588
469830d1
LP
1589 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1590 return 0;
f4170c67 1591
469830d1 1592 return seccomp_restrict_realtime();
f4170c67
LP
1593}
1594
f69567cb
LP
1595static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1596 assert(u);
1597 assert(c);
1598
1599 if (!c->restrict_suid_sgid)
1600 return 0;
1601
1602 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1603 return 0;
1604
1605 return seccomp_restrict_suid_sgid();
1606}
1607
59e856c7 1608static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1609 assert(u);
59eeb84b
LP
1610 assert(c);
1611
1612 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1613 * let's protect even those systems where this is left on in the kernel. */
1614
469830d1 1615 if (!c->protect_kernel_tunables)
59eeb84b
LP
1616 return 0;
1617
469830d1
LP
1618 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1619 return 0;
59eeb84b 1620
469830d1 1621 return seccomp_protect_sysctl();
59eeb84b
LP
1622}
1623
59e856c7 1624static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1625 assert(u);
502d704e
DH
1626 assert(c);
1627
25a8d8a0 1628 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1629
469830d1
LP
1630 if (!c->protect_kernel_modules)
1631 return 0;
1632
502d704e
DH
1633 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1634 return 0;
1635
b54f36c6 1636 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1637}
1638
84703040
KK
1639static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1640 assert(u);
1641 assert(c);
1642
1643 if (!c->protect_kernel_logs)
1644 return 0;
1645
1646 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1647 return 0;
1648
1649 return seccomp_protect_syslog();
1650}
1651
daf8f72b 1652static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1653 assert(u);
1654 assert(c);
1655
1656 if (!c->protect_clock)
1657 return 0;
1658
1659 if (skip_seccomp_unavailable(u, "ProtectClock="))
1660 return 0;
1661
1662 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1663}
1664
59e856c7 1665static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1666 assert(u);
ba128bb8
LP
1667 assert(c);
1668
8f81a5f6 1669 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1670
469830d1
LP
1671 if (!c->private_devices)
1672 return 0;
1673
ba128bb8
LP
1674 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1675 return 0;
1676
b54f36c6 1677 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1678}
1679
34cf6c43 1680static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1681 assert(u);
add00535
LP
1682 assert(c);
1683
1684 if (!exec_context_restrict_namespaces_set(c))
1685 return 0;
1686
1687 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1688 return 0;
1689
1690 return seccomp_restrict_namespaces(c->restrict_namespaces);
1691}
1692
78e864e5 1693static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1694 unsigned long personality;
1695 int r;
78e864e5
TM
1696
1697 assert(u);
1698 assert(c);
1699
1700 if (!c->lock_personality)
1701 return 0;
1702
1703 if (skip_seccomp_unavailable(u, "LockPersonality="))
1704 return 0;
1705
e8132d63
LP
1706 personality = c->personality;
1707
1708 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1709 if (personality == PERSONALITY_INVALID) {
1710
1711 r = opinionated_personality(&personality);
1712 if (r < 0)
1713 return r;
1714 }
78e864e5
TM
1715
1716 return seccomp_lock_personality(personality);
1717}
1718
c0467cf3 1719#endif
8351ceae 1720
7a8288f6 1721#if HAVE_LIBBPF
7a8288f6
DM
1722static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1723 assert(u);
1724 assert(c);
1725
1726 if (!exec_context_restrict_filesystems_set(c))
1727 return 0;
1728
46004616
ZJS
1729 if (!u->manager->restrict_fs) {
1730 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1731 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1732 return 0;
46004616 1733 }
7a8288f6
DM
1734
1735 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1736}
1737#endif
1738
daf8f72b 1739static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1740 assert(u);
1741 assert(c);
1742
1743 if (!c->protect_hostname)
1744 return 0;
1745
1746 if (ns_type_supported(NAMESPACE_UTS)) {
1747 if (unshare(CLONE_NEWUTS) < 0) {
1748 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1749 *ret_exit_status = EXIT_NAMESPACE;
1750 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1751 }
1752
1753 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1754 }
1755 } else
1756 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1757
1758#if HAVE_SECCOMP
8f3e342f
ZJS
1759 int r;
1760
daf8f72b
LP
1761 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1762 return 0;
1763
1764 r = seccomp_protect_hostname();
1765 if (r < 0) {
1766 *ret_exit_status = EXIT_SECCOMP;
1767 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1768 }
1769#endif
1770
1771 return 0;
1772}
1773
3042bbeb 1774static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1775 assert(idle_pipe);
1776
54eb2300
LP
1777 idle_pipe[1] = safe_close(idle_pipe[1]);
1778 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1779
1780 if (idle_pipe[0] >= 0) {
1781 int r;
1782
1783 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1784
1785 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1786 ssize_t n;
1787
31a7eb86 1788 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1789 n = write(idle_pipe[3], "x", 1);
1790 if (n > 0)
cd972d69 1791 /* Wait for systemd to react to the signal above. */
54756dce 1792 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1793 }
1794
54eb2300 1795 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1796
1797 }
1798
54eb2300 1799 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1800}
1801
fb2042dd
YW
1802static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1803
7cae38c4 1804static int build_environment(
34cf6c43 1805 const Unit *u,
9fa95f85 1806 const ExecContext *c,
1e22b5cd 1807 const ExecParameters *p,
da6053d0 1808 size_t n_fds,
cd48e23f 1809 char **fdnames,
7cae38c4
LP
1810 const char *home,
1811 const char *username,
1812 const char *shell,
7bce046b
LP
1813 dev_t journal_stream_dev,
1814 ino_t journal_stream_ino,
7cae38c4
LP
1815 char ***ret) {
1816
1817 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1818 size_t n_env = 0;
7cae38c4
LP
1819 char *x;
1820
4b58153d 1821 assert(u);
7cae38c4 1822 assert(c);
7c1cb6f1 1823 assert(p);
7cae38c4
LP
1824 assert(ret);
1825
dc4e2940 1826#define N_ENV_VARS 17
8d5bb13d 1827 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1828 if (!our_env)
1829 return -ENOMEM;
1830
1831 if (n_fds > 0) {
8dd4c05b
LP
1832 _cleanup_free_ char *joined = NULL;
1833
df0ff127 1834 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1835 return -ENOMEM;
1836 our_env[n_env++] = x;
1837
da6053d0 1838 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1839 return -ENOMEM;
1840 our_env[n_env++] = x;
8dd4c05b 1841
cd48e23f 1842 joined = strv_join(fdnames, ":");
8dd4c05b
LP
1843 if (!joined)
1844 return -ENOMEM;
1845
605405c6 1846 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1847 if (!x)
1848 return -ENOMEM;
1849 our_env[n_env++] = x;
7cae38c4
LP
1850 }
1851
b08af3b1 1852 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1853 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1854 return -ENOMEM;
1855 our_env[n_env++] = x;
1856
1e22b5cd 1857 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1858 return -ENOMEM;
1859 our_env[n_env++] = x;
1860 }
1861
de90700f
LP
1862 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1863 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1864 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1865 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1866 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1867 if (!x)
1868 return -ENOMEM;
1869 our_env[n_env++] = x;
1870 }
1871
7cae38c4 1872 if (home) {
b910cc72 1873 x = strjoin("HOME=", home);
7cae38c4
LP
1874 if (!x)
1875 return -ENOMEM;
7bbead1d 1876
4ff361cc 1877 path_simplify(x + 5);
7cae38c4
LP
1878 our_env[n_env++] = x;
1879 }
1880
1881 if (username) {
b910cc72 1882 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886
b910cc72 1887 x = strjoin("USER=", username);
7cae38c4
LP
1888 if (!x)
1889 return -ENOMEM;
1890 our_env[n_env++] = x;
1891 }
1892
1893 if (shell) {
b910cc72 1894 x = strjoin("SHELL=", shell);
7cae38c4
LP
1895 if (!x)
1896 return -ENOMEM;
7bbead1d 1897
4ff361cc 1898 path_simplify(x + 6);
7cae38c4
LP
1899 our_env[n_env++] = x;
1900 }
1901
4b58153d
LP
1902 if (!sd_id128_is_null(u->invocation_id)) {
1903 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1904 return -ENOMEM;
1905
1906 our_env[n_env++] = x;
1907 }
1908
6af760f3
LP
1909 if (exec_context_needs_term(c)) {
1910 const char *tty_path, *term = NULL;
1911
1912 tty_path = exec_context_tty_path(c);
1913
e8cf09b2
LP
1914 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1915 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1916 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1917
e8cf09b2 1918 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1919 term = getenv("TERM");
e8cf09b2 1920
6af760f3
LP
1921 if (!term)
1922 term = default_term_for_tty(tty_path);
7cae38c4 1923
b910cc72 1924 x = strjoin("TERM=", term);
7cae38c4
LP
1925 if (!x)
1926 return -ENOMEM;
1927 our_env[n_env++] = x;
1928 }
1929
7bce046b
LP
1930 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1931 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1932 return -ENOMEM;
1933
1934 our_env[n_env++] = x;
1935 }
1936
91dd5f7c
LP
1937 if (c->log_namespace) {
1938 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1939 if (!x)
1940 return -ENOMEM;
1941
1942 our_env[n_env++] = x;
1943 }
1944
5b10116e 1945 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1946 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1947 const char *n;
1948
1949 if (!p->prefix[t])
1950 continue;
1951
211a3d87 1952 if (c->directories[t].n_items == 0)
fb2042dd
YW
1953 continue;
1954
1955 n = exec_directory_env_name_to_string(t);
1956 if (!n)
1957 continue;
1958
211a3d87
LB
1959 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1960 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1961
211a3d87
LB
1962 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1963 if (!prefixed)
1964 return -ENOMEM;
1965
1966 if (!strextend_with_separator(&joined, ":", prefixed))
1967 return -ENOMEM;
1968 }
fb2042dd
YW
1969
1970 x = strjoin(n, "=", joined);
1971 if (!x)
1972 return -ENOMEM;
1973
1974 our_env[n_env++] = x;
1975 }
1976
bb0c0d6f
LP
1977 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1978 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1979 if (!x)
1980 return -ENOMEM;
1981
1982 our_env[n_env++] = x;
1983 }
1984
dc4e2940
YW
1985 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1986 return -ENOMEM;
1987
1988 our_env[n_env++] = x;
1989
7cae38c4 1990 our_env[n_env++] = NULL;
8d5bb13d
LP
1991 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1992#undef N_ENV_VARS
7cae38c4 1993
ae2a15bc 1994 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1995
1996 return 0;
1997}
1998
b4c14404
FB
1999static int build_pass_environment(const ExecContext *c, char ***ret) {
2000 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2001 size_t n_env = 0;
b4c14404
FB
2002
2003 STRV_FOREACH(i, c->pass_environment) {
2004 _cleanup_free_ char *x = NULL;
2005 char *v;
2006
2007 v = getenv(*i);
2008 if (!v)
2009 continue;
605405c6 2010 x = strjoin(*i, "=", v);
b4c14404
FB
2011 if (!x)
2012 return -ENOMEM;
00819cc1 2013
319a4f4b 2014 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2015 return -ENOMEM;
00819cc1 2016
1cc6c93a 2017 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2018 pass_env[n_env] = NULL;
b4c14404
FB
2019 }
2020
ae2a15bc 2021 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2022
2023 return 0;
2024}
2025
fbbb9697
YW
2026bool exec_needs_network_namespace(const ExecContext *context) {
2027 assert(context);
2028
2029 return context->private_network || context->network_namespace_path;
2030}
2031
fde36d25
YW
2032static bool exec_needs_ipc_namespace(const ExecContext *context) {
2033 assert(context);
2034
2035 return context->private_ipc || context->ipc_namespace_path;
2036}
2037
5e8deb94 2038bool exec_needs_mount_namespace(
8b44a3d2
LP
2039 const ExecContext *context,
2040 const ExecParameters *params,
4657abb5 2041 const ExecRuntime *runtime) {
8b44a3d2
LP
2042
2043 assert(context);
8b44a3d2 2044
915e6d16
LP
2045 if (context->root_image)
2046 return true;
2047
2a624c36
AP
2048 if (!strv_isempty(context->read_write_paths) ||
2049 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2050 !strv_isempty(context->inaccessible_paths) ||
2051 !strv_isempty(context->exec_paths) ||
2052 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2053 return true;
2054
42b1d8e0 2055 if (context->n_bind_mounts > 0)
d2d6c096
LP
2056 return true;
2057
2abd4e38
YW
2058 if (context->n_temporary_filesystems > 0)
2059 return true;
2060
b3d13314
LB
2061 if (context->n_mount_images > 0)
2062 return true;
2063
93f59701
LB
2064 if (context->n_extension_images > 0)
2065 return true;
2066
a07b9926
LB
2067 if (!strv_isempty(context->extension_directories))
2068 return true;
2069
37ed15d7 2070 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2071 return true;
2072
2073 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2074 return true;
2075
8b44a3d2 2076 if (context->private_devices ||
24002121 2077 context->private_mounts > 0 ||
c2da3bf2 2078 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
8b44a3d2 2079 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2080 context->protect_home != PROTECT_HOME_NO ||
2081 context->protect_kernel_tunables ||
c575770b 2082 context->protect_kernel_modules ||
94a7b275 2083 context->protect_kernel_logs ||
4e399953
LP
2084 context->protect_control_groups ||
2085 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44 2086 context->proc_subset != PROC_SUBSET_ALL ||
fde36d25 2087 exec_needs_ipc_namespace(context))
8b44a3d2
LP
2088 return true;
2089
37c56f89 2090 if (context->root_directory) {
5e98086d 2091 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2092 return true;
2093
5b10116e 2094 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2095 if (params && !params->prefix[t])
37c56f89
YW
2096 continue;
2097
211a3d87 2098 if (context->directories[t].n_items > 0)
37c56f89
YW
2099 return true;
2100 }
2101 }
5d997827 2102
42b1d8e0 2103 if (context->dynamic_user &&
211a3d87
LB
2104 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2105 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2106 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2107 return true;
2108
91dd5f7c
LP
2109 if (context->log_namespace)
2110 return true;
2111
8b44a3d2
LP
2112 return false;
2113}
2114
5749f855 2115static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d 2116 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
19ee48a6 2117 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
254d1313 2118 _cleanup_close_ int unshare_ready_fd = -EBADF;
d251207d
LP
2119 _cleanup_(sigkill_waitp) pid_t pid = 0;
2120 uint64_t c = 1;
d251207d
LP
2121 ssize_t n;
2122 int r;
2123
5749f855
AZ
2124 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2125 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2126 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2127 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2128 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2129 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2130 * continues execution normally.
2131 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2132 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2133
5749f855 2134 /* Can only set up multiple mappings with CAP_SETUID. */
26c45a6c 2135 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
587ab01b 2136 r = asprintf(&uid_map,
5749f855 2137 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2138 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2139 ouid, ouid, uid, uid);
2140 else
2141 r = asprintf(&uid_map,
2142 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2143 ouid, ouid);
d251207d 2144
5749f855
AZ
2145 if (r < 0)
2146 return -ENOMEM;
2147
2148 /* Can only set up multiple mappings with CAP_SETGID. */
26c45a6c 2149 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
587ab01b 2150 r = asprintf(&gid_map,
5749f855 2151 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2152 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2153 ogid, ogid, gid, gid);
2154 else
2155 r = asprintf(&gid_map,
2156 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2157 ogid, ogid);
2158
2159 if (r < 0)
2160 return -ENOMEM;
d251207d
LP
2161
2162 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2163 * namespace. */
2164 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2165 if (unshare_ready_fd < 0)
2166 return -errno;
2167
2168 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2169 * failed. */
2170 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2171 return -errno;
2172
4c253ed1
LP
2173 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2174 if (r < 0)
2175 return r;
2176 if (r == 0) {
254d1313 2177 _cleanup_close_ int fd = -EBADF;
d251207d
LP
2178 const char *a;
2179 pid_t ppid;
2180
2181 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2182 * here, after the parent opened its own user namespace. */
2183
2184 ppid = getppid();
2185 errno_pipe[0] = safe_close(errno_pipe[0]);
2186
2187 /* Wait until the parent unshared the user namespace */
2188 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2189 r = -errno;
2190 goto child_fail;
2191 }
2192
2193 /* Disable the setgroups() system call in the child user namespace, for good. */
2194 a = procfs_file_alloca(ppid, "setgroups");
2195 fd = open(a, O_WRONLY|O_CLOEXEC);
2196 if (fd < 0) {
2197 if (errno != ENOENT) {
2198 r = -errno;
2199 goto child_fail;
2200 }
2201
2202 /* If the file is missing the kernel is too old, let's continue anyway. */
2203 } else {
2204 if (write(fd, "deny\n", 5) < 0) {
2205 r = -errno;
2206 goto child_fail;
2207 }
2208
2209 fd = safe_close(fd);
2210 }
2211
2212 /* First write the GID map */
2213 a = procfs_file_alloca(ppid, "gid_map");
2214 fd = open(a, O_WRONLY|O_CLOEXEC);
2215 if (fd < 0) {
2216 r = -errno;
2217 goto child_fail;
2218 }
2219 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2220 r = -errno;
2221 goto child_fail;
2222 }
2223 fd = safe_close(fd);
2224
2225 /* The write the UID map */
2226 a = procfs_file_alloca(ppid, "uid_map");
2227 fd = open(a, O_WRONLY|O_CLOEXEC);
2228 if (fd < 0) {
2229 r = -errno;
2230 goto child_fail;
2231 }
2232 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2233 r = -errno;
2234 goto child_fail;
2235 }
2236
2237 _exit(EXIT_SUCCESS);
2238
2239 child_fail:
2240 (void) write(errno_pipe[1], &r, sizeof(r));
2241 _exit(EXIT_FAILURE);
2242 }
2243
2244 errno_pipe[1] = safe_close(errno_pipe[1]);
2245
2246 if (unshare(CLONE_NEWUSER) < 0)
2247 return -errno;
2248
2249 /* Let the child know that the namespace is ready now */
2250 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2251 return -errno;
2252
2253 /* Try to read an error code from the child */
2254 n = read(errno_pipe[0], &r, sizeof(r));
2255 if (n < 0)
2256 return -errno;
2257 if (n == sizeof(r)) { /* an error code was sent to us */
2258 if (r < 0)
2259 return r;
2260 return -EIO;
2261 }
2262 if (n != 0) /* on success we should have read 0 bytes */
2263 return -EIO;
2264
8f03de53 2265 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2266 if (r < 0)
2267 return r;
2e87a1fd 2268 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2269 return -EIO;
2270
2271 return 0;
2272}
2273
494d0247
YW
2274static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2275 if (!context->dynamic_user)
2276 return false;
2277
2278 if (type == EXEC_DIRECTORY_CONFIGURATION)
2279 return false;
2280
2281 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2282 return false;
2283
2284 return true;
2285}
2286
211a3d87
LB
2287static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2288 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2289 int r;
2290
2291 assert(source);
2292
2293 src_abs = path_join(root, source);
2294 if (!src_abs)
2295 return -ENOMEM;
2296
2297 STRV_FOREACH(dst, symlinks) {
2298 _cleanup_free_ char *dst_abs = NULL;
2299
2300 dst_abs = path_join(root, *dst);
2301 if (!dst_abs)
2302 return -ENOMEM;
2303
2304 r = mkdir_parents_label(dst_abs, 0755);
2305 if (r < 0)
2306 return r;
2307
2308 r = symlink_idempotent(src_abs, dst_abs, true);
2309 if (r < 0)
2310 return r;
2311 }
2312
2313 return 0;
2314}
2315
3536f49e 2316static int setup_exec_directory(
07689d5d
LP
2317 const ExecContext *context,
2318 const ExecParameters *params,
2319 uid_t uid,
3536f49e 2320 gid_t gid,
3536f49e 2321 ExecDirectoryType type,
211a3d87 2322 bool needs_mount_namespace,
3536f49e 2323 int *exit_status) {
07689d5d 2324
72fd1768 2325 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2326 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2327 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2328 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2329 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2330 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2331 };
07689d5d
LP
2332 int r;
2333
2334 assert(context);
2335 assert(params);
72fd1768 2336 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2337 assert(exit_status);
07689d5d 2338
3536f49e
YW
2339 if (!params->prefix[type])
2340 return 0;
2341
8679efde 2342 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2343 if (!uid_is_valid(uid))
2344 uid = 0;
2345 if (!gid_is_valid(gid))
2346 gid = 0;
2347 }
2348
211a3d87 2349 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2350 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2351
211a3d87 2352 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2353 if (!p) {
2354 r = -ENOMEM;
2355 goto fail;
2356 }
07689d5d 2357
23a7448e
YW
2358 r = mkdir_parents_label(p, 0755);
2359 if (r < 0)
3536f49e 2360 goto fail;
23a7448e 2361
494d0247 2362 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2363 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2364 * case we want to avoid leaving a directory around fully accessible that is owned by
2365 * a dynamic user whose UID is later on reused. To lock this down we use the same
2366 * trick used by container managers to prohibit host users to get access to files of
2367 * the same UID in containers: we place everything inside a directory that has an
2368 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2369 * for unprivileged host code. We then use fs namespacing to make this directory
2370 * permeable for the service itself.
6c47cd7d 2371 *
3f5b1508
LP
2372 * Specifically: for a service which wants a special directory "foo/" we first create
2373 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2374 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2375 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2376 * unprivileged host users can't look into it. Inside of the namespace of the unit
2377 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2378 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2379 * for the service and making sure it only gets access to the dirs it needs but no
2380 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2381 *
3f5b1508
LP
2382 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2383 * to be owned by the service itself.
2384 *
2385 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2386 * for sharing files or sockets with other services. */
6c47cd7d 2387
4ede9802
LP
2388 pp = path_join(params->prefix[type], "private");
2389 if (!pp) {
6c47cd7d
LP
2390 r = -ENOMEM;
2391 goto fail;
2392 }
2393
2394 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2395 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2396 if (r < 0)
2397 goto fail;
2398
211a3d87 2399 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2400 r = -ENOMEM;
2401 goto fail;
2402 }
2403
2404 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2405 r = mkdir_parents_label(pp, 0755);
2406 if (r < 0)
2407 goto fail;
2408
949befd3
LP
2409 if (is_dir(p, false) > 0 &&
2410 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2411
2412 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2413 * it over. Most likely the service has been upgraded from one that didn't use
2414 * DynamicUser=1, to one that does. */
2415
cf52c45d
LP
2416 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2417 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2418 exec_directory_type_to_string(type), p, pp);
2419
949befd3
LP
2420 if (rename(p, pp) < 0) {
2421 r = -errno;
2422 goto fail;
2423 }
2424 } else {
2425 /* Otherwise, create the actual directory for the service */
2426
2427 r = mkdir_label(pp, context->directories[type].mode);
2428 if (r < 0 && r != -EEXIST)
2429 goto fail;
2430 }
6c47cd7d 2431
a2ab603c
YW
2432 if (!context->directories[type].items[i].only_create) {
2433 /* And link it up from the original place.
2434 * Notes
2435 * 1) If a mount namespace is going to be used, then this symlink remains on
2436 * the host, and a new one for the child namespace will be created later.
2437 * 2) It is not necessary to create this symlink when one of its parent
2438 * directories is specified and already created. E.g.
2439 * StateDirectory=foo foo/bar
2440 * In that case, the inode points to pp and p for "foo/bar" are the same:
2441 * pp = "/var/lib/private/foo/bar"
2442 * p = "/var/lib/foo/bar"
2443 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2444 * we do not need to create the symlink, but we cannot create the symlink.
2445 * See issue #24783. */
2446 r = symlink_idempotent(pp, p, true);
2447 if (r < 0)
2448 goto fail;
2449 }
6c47cd7d 2450
6c47cd7d 2451 } else {
5c6d40d1
LP
2452 _cleanup_free_ char *target = NULL;
2453
2454 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2455 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2456 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2457
2458 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2459 * by DynamicUser=1 (see above)?
2460 *
2461 * We do this for all directory types except for ConfigurationDirectory=,
2462 * since they all support the private/ symlink logic at least in some
2463 * configurations, see above. */
5c6d40d1 2464
578dc69f
YW
2465 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2466 if (r < 0)
2467 goto fail;
2468
211a3d87 2469 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2470 if (!q) {
2471 r = -ENOMEM;
2472 goto fail;
2473 }
2474
578dc69f
YW
2475 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2476 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2477 if (r < 0)
2478 goto fail;
2479
2480 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2481
2482 /* Hmm, apparently DynamicUser= was once turned on for this service,
2483 * but is no longer. Let's move the directory back up. */
2484
cf52c45d
LP
2485 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2486 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2487 exec_directory_type_to_string(type), q, p);
2488
5c6d40d1
LP
2489 if (unlink(p) < 0) {
2490 r = -errno;
2491 goto fail;
2492 }
2493
2494 if (rename(q, p) < 0) {
2495 r = -errno;
2496 goto fail;
2497 }
2498 }
2499 }
2500
6c47cd7d 2501 r = mkdir_label(p, context->directories[type].mode);
d484580c 2502 if (r < 0) {
d484580c
LP
2503 if (r != -EEXIST)
2504 goto fail;
2505
206e9864
LP
2506 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2507 struct stat st;
2508
2509 /* Don't change the owner/access mode of the configuration directory,
2510 * as in the common case it is not written to by a service, and shall
2511 * not be writable. */
2512
2513 if (stat(p, &st) < 0) {
2514 r = -errno;
2515 goto fail;
2516 }
2517
2518 /* Still complain if the access mode doesn't match */
2519 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2520 log_warning("%s \'%s\' already exists but the mode is different. "
2521 "(File system: %o %sMode: %o)",
211a3d87 2522 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2523 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2524
6cff72eb 2525 continue;
206e9864 2526 }
6cff72eb 2527 }
a1164ae3 2528 }
07689d5d 2529
206e9864 2530 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2531 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2532 * current UID/GID ownership.) */
2533 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2534 if (r < 0)
2535 goto fail;
c71b2eb7 2536
607b358e
LP
2537 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2538 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2539 * assignments to exist. */
607b358e 2540 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2541 if (r < 0)
3536f49e 2542 goto fail;
07689d5d
LP
2543 }
2544
211a3d87
LB
2545 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2546 * they are set up later, to allow configuring empty var/run/etc. */
2547 if (!needs_mount_namespace)
2548 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2549 r = create_many_symlinks(params->prefix[type],
2550 context->directories[type].items[i].path,
2551 context->directories[type].items[i].symlinks);
2552 if (r < 0)
2553 goto fail;
2554 }
2555
07689d5d 2556 return 0;
3536f49e
YW
2557
2558fail:
2559 *exit_status = exit_status_table[type];
3536f49e 2560 return r;
07689d5d
LP
2561}
2562
bb0c0d6f
LP
2563static int write_credential(
2564 int dfd,
2565 const char *id,
2566 const void *data,
2567 size_t size,
2568 uid_t uid,
2569 bool ownership_ok) {
2570
2571 _cleanup_(unlink_and_freep) char *tmp = NULL;
254d1313 2572 _cleanup_close_ int fd = -EBADF;
bb0c0d6f
LP
2573 int r;
2574
2575 r = tempfn_random_child("", "cred", &tmp);
2576 if (r < 0)
2577 return r;
2578
2579 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2580 if (fd < 0) {
2581 tmp = mfree(tmp);
2582 return -errno;
2583 }
2584
43144be4 2585 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2586 if (r < 0)
2587 return r;
2588
2589 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2590 return -errno;
2591
2592 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2593 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2594 if (r < 0) {
2595 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2596 return r;
2597
2598 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2599 * to express: that the user gets read access and nothing
2600 * else. But if the backing fs can't support that (e.g. ramfs)
2601 * then we can use file ownership instead. But that's only safe if
2602 * we can then re-mount the whole thing read-only, so that the
2603 * user can no longer chmod() the file to gain write access. */
2604 return r;
2605
f5fbe71d 2606 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2607 return -errno;
2608 }
2609 }
2610
2611 if (renameat(dfd, tmp, dfd, id) < 0)
2612 return -errno;
2613
2614 tmp = mfree(tmp);
2615 return 0;
2616}
2617
2ad591a3
LP
2618static char **credential_search_path(
2619 const ExecParameters *params,
2620 bool encrypted) {
2621
2622 _cleanup_strv_free_ char **l = NULL;
2623
2624 assert(params);
2625
2626 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2627 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2628 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2629
2630 if (encrypted) {
2631 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2632 return NULL;
2633
2634 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2635 return NULL;
2636 }
2637
2638 if (params->received_credentials_directory)
2639 if (strv_extend(&l, params->received_credentials_directory) < 0)
2640 return NULL;
2641
2642 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2643 return NULL;
2644
2645 if (DEBUG_LOGGING) {
2646 _cleanup_free_ char *t = strv_join(l, ":");
2647
2648 log_debug("Credential search path is: %s", t);
2649 }
2650
2651 return TAKE_PTR(l);
2652}
2653
3989bdc1
AB
2654static int load_credential(
2655 const ExecContext *context,
2656 const ExecParameters *params,
10b44e1d
LP
2657 const char *id,
2658 const char *path,
2659 bool encrypted,
3989bdc1
AB
2660 const char *unit,
2661 int read_dfd,
2662 int write_dfd,
2663 uid_t uid,
2664 bool ownership_ok,
2665 uint64_t *left) {
2666
3989bdc1 2667 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2ad591a3 2668 _cleanup_strv_free_ char **search_path = NULL;
3989bdc1 2669 _cleanup_(erase_and_freep) char *data = NULL;
2ad591a3
LP
2670 _cleanup_free_ char *bindname = NULL;
2671 const char *source = NULL;
3989bdc1 2672 bool missing_ok = true;
2ad591a3 2673 size_t size, add, maxsz;
3989bdc1
AB
2674 int r;
2675
10b44e1d
LP
2676 assert(context);
2677 assert(params);
2678 assert(id);
2679 assert(path);
2680 assert(unit);
661e4251 2681 assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
10b44e1d
LP
2682 assert(write_dfd >= 0);
2683 assert(left);
2684
2ad591a3
LP
2685 if (read_dfd >= 0) {
2686 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2687 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2688 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2689 * open it. */
2690
2691 if (!filename_is_valid(path)) /* safety check */
2692 return -EINVAL;
2693
2694 missing_ok = true;
10b44e1d 2695 source = path;
2ad591a3
LP
2696
2697 } else if (path_is_absolute(path)) {
2698 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2699 * sockets */
2700
2701 if (!path_is_valid(path)) /* safety check */
2702 return -EINVAL;
2703
3989bdc1
AB
2704 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2705
2706 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2707 * via the source socket address in case we read off an AF_UNIX socket. */
10b44e1d 2708 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3989bdc1
AB
2709 return -ENOMEM;
2710
2711 missing_ok = false;
2ad591a3 2712 source = path;
3989bdc1 2713
2ad591a3
LP
2714 } else if (credential_name_valid(path)) {
2715 /* If this is a relative path, take it as credential name relative to the credentials
2716 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2717 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2718
2719 search_path = credential_search_path(params, encrypted);
2720 if (!search_path)
3989bdc1
AB
2721 return -ENOMEM;
2722
2ad591a3 2723 missing_ok = true;
3989bdc1
AB
2724 } else
2725 source = NULL;
2726
2ad591a3
LP
2727 if (encrypted)
2728 flags |= READ_FULL_FILE_UNBASE64;
2729
2730 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2731
2732 if (search_path) {
2733 STRV_FOREACH(d, search_path) {
2734 _cleanup_free_ char *j = NULL;
2735
2736 j = path_join(*d, path);
2737 if (!j)
2738 return -ENOMEM;
2739
2740 r = read_full_file_full(
2741 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2742 UINT64_MAX,
2743 maxsz,
2744 flags,
2745 NULL,
2746 &data, &size);
2747 if (r != -ENOENT)
2748 break;
2749 }
2750 } else if (source)
3989bdc1
AB
2751 r = read_full_file_full(
2752 read_dfd, source,
2753 UINT64_MAX,
2ad591a3
LP
2754 maxsz,
2755 flags,
3989bdc1
AB
2756 bindname,
2757 &data, &size);
2758 else
2759 r = -ENOENT;
2760
10b44e1d 2761 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3989bdc1
AB
2762 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2763 * will get clear errors if we don't pass such a missing credential on as they
2764 * themselves will get ENOENT when trying to read them, which should not be much
2765 * worse than when we handle the error here and make it fatal.
2766 *
2767 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2768 * we are fine, too. */
10b44e1d 2769 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3989bdc1
AB
2770 return 0;
2771 }
2772 if (r < 0)
10b44e1d 2773 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3989bdc1 2774
10b44e1d 2775 if (encrypted) {
3989bdc1
AB
2776 _cleanup_free_ void *plaintext = NULL;
2777 size_t plaintext_size = 0;
2778
6a0779cb 2779 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
3989bdc1
AB
2780 if (r < 0)
2781 return r;
2782
2783 free_and_replace(data, plaintext);
2784 size = plaintext_size;
2785 }
2786
10b44e1d 2787 add = strlen(id) + size;
3989bdc1
AB
2788 if (add > *left)
2789 return -E2BIG;
2790
10b44e1d 2791 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
3989bdc1 2792 if (r < 0)
94602bff 2793 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
3989bdc1
AB
2794
2795 *left -= add;
2796 return 0;
2797}
2798
2799struct load_cred_args {
3989bdc1
AB
2800 const ExecContext *context;
2801 const ExecParameters *params;
461345a1 2802 bool encrypted;
3989bdc1
AB
2803 const char *unit;
2804 int dfd;
2805 uid_t uid;
2806 bool ownership_ok;
2807 uint64_t *left;
2808};
2809
2810static int load_cred_recurse_dir_cb(
2811 RecurseDirEvent event,
2812 const char *path,
2813 int dir_fd,
2814 int inode_fd,
2815 const struct dirent *de,
2816 const struct statx *sx,
2817 void *userdata) {
2818
6394e5cd 2819 struct load_cred_args *args = ASSERT_PTR(userdata);
11348386 2820 _cleanup_free_ char *sub_id = NULL;
3989bdc1
AB
2821 int r;
2822
2823 if (event != RECURSE_DIR_ENTRY)
2824 return RECURSE_DIR_CONTINUE;
2825
2826 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2827 return RECURSE_DIR_CONTINUE;
2828
11348386 2829 sub_id = strreplace(path, "/", "_");
3989bdc1
AB
2830 if (!sub_id)
2831 return -ENOMEM;
2832
2833 if (!credential_name_valid(sub_id))
1451435c 2834 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3989bdc1 2835
5bec447a 2836 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3989bdc1
AB
2837 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2838 return RECURSE_DIR_CONTINUE;
2839 }
5bec447a
LP
2840 if (errno != ENOENT)
2841 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3989bdc1 2842
10b44e1d
LP
2843 r = load_credential(
2844 args->context,
2845 args->params,
2846 sub_id,
2847 de->d_name,
461345a1 2848 args->encrypted,
10b44e1d
LP
2849 args->unit,
2850 dir_fd,
2851 args->dfd,
2852 args->uid,
2853 args->ownership_ok,
2854 args->left);
3989bdc1
AB
2855 if (r < 0)
2856 return r;
2857
2858 return RECURSE_DIR_CONTINUE;
2859}
2860
bb0c0d6f
LP
2861static int acquire_credentials(
2862 const ExecContext *context,
2863 const ExecParameters *params,
d3dcf4e3 2864 const char *unit,
bb0c0d6f
LP
2865 const char *p,
2866 uid_t uid,
2867 bool ownership_ok) {
2868
43144be4 2869 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
254d1313 2870 _cleanup_close_ int dfd = -EBADF;
43144be4 2871 ExecLoadCredential *lc;
bb0c0d6f 2872 ExecSetCredential *sc;
bb0c0d6f
LP
2873 int r;
2874
2875 assert(context);
2876 assert(p);
2877
2878 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2879 if (dfd < 0)
2880 return -errno;
2881
43144be4
LP
2882 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2883 HASHMAP_FOREACH(lc, context->load_credentials) {
254d1313 2884 _cleanup_close_ int sub_fd = -EBADF;
d3dcf4e3 2885
f344f7fd
LP
2886 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2887 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2888 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2889 * propagate a credential passed to us from further up. */
43144be4 2890
f344f7fd
LP
2891 if (path_is_absolute(lc->path)) {
2892 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
1d68a2e1
LP
2893 if (sub_fd < 0 && !IN_SET(errno,
2894 ENOTDIR, /* Not a directory */
2895 ENOENT)) /* Doesn't exist? */
2896 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
f344f7fd 2897 }
43144be4 2898
61c5a49e 2899 if (sub_fd < 0)
f344f7fd 2900 /* Regular file (incl. a credential passed in from higher up) */
10b44e1d
LP
2901 r = load_credential(
2902 context,
2903 params,
2904 lc->id,
2905 lc->path,
2906 lc->encrypted,
2907 unit,
661e4251 2908 AT_FDCWD,
10b44e1d
LP
2909 dfd,
2910 uid,
2911 ownership_ok,
2912 &left);
61c5a49e 2913 else
10b44e1d 2914 /* Directory */
3989bdc1
AB
2915 r = recurse_dir(
2916 sub_fd,
11348386 2917 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3989bdc1
AB
2918 /* statx_mask= */ 0,
2919 /* n_depth_max= */ UINT_MAX,
9883cbb2 2920 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3989bdc1
AB
2921 load_cred_recurse_dir_cb,
2922 &(struct load_cred_args) {
3989bdc1
AB
2923 .context = context,
2924 .params = params,
461345a1 2925 .encrypted = lc->encrypted,
3989bdc1
AB
2926 .unit = unit,
2927 .dfd = dfd,
2928 .uid = uid,
2929 .ownership_ok = ownership_ok,
2930 .left = &left,
2931 });
61c5a49e
LP
2932 if (r < 0)
2933 return r;
bb0c0d6f
LP
2934 }
2935
9e6e9d61
LP
2936 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2937 * them, so that they can act as a "default" if the same credential is specified multiple times. */
43144be4
LP
2938 HASHMAP_FOREACH(sc, context->set_credentials) {
2939 _cleanup_(erase_and_freep) void *plaintext = NULL;
2940 const char *data;
2941 size_t size, add;
2942
9e6e9d61
LP
2943 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2944 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2945 * slow and involved, hence it's nice to be able to skip that if the credential already
2946 * exists anyway. */
43144be4
LP
2947 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2948 continue;
2949 if (errno != ENOENT)
2950 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2951
2952 if (sc->encrypted) {
6a0779cb 2953 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
43144be4
LP
2954 if (r < 0)
2955 return r;
2956
2957 data = plaintext;
2958 } else {
2959 data = sc->data;
2960 size = sc->size;
2961 }
2962
2963 add = strlen(sc->id) + size;
2964 if (add > left)
2965 return -E2BIG;
2966
2967 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2968 if (r < 0)
2969 return r;
2970
43144be4
LP
2971 left -= add;
2972 }
2973
bb0c0d6f
LP
2974 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2975 return -errno;
2976
2977 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2978 * accessible */
2979
2980 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2981 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2982 if (r < 0) {
2983 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2984 return r;
2985
2986 if (!ownership_ok)
2987 return r;
2988
f5fbe71d 2989 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2990 return -errno;
2991 }
2992 }
2993
2994 return 0;
2995}
2996
2997static int setup_credentials_internal(
2998 const ExecContext *context,
2999 const ExecParameters *params,
d3dcf4e3 3000 const char *unit,
bb0c0d6f
LP
3001 const char *final, /* This is where the credential store shall eventually end up at */
3002 const char *workspace, /* This is where we can prepare it before moving it to the final place */
3003 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
3004 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
3005 uid_t uid) {
3006
3007 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
3008 * if we mounted something; false if we definitely can't mount anything */
3009 bool final_mounted;
3010 const char *where;
3011
3012 assert(context);
3013 assert(final);
3014 assert(workspace);
3015
3016 if (reuse_workspace) {
3017 r = path_is_mount_point(workspace, NULL, 0);
3018 if (r < 0)
3019 return r;
3020 if (r > 0)
3021 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
3022 else
3023 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
3024 } else
3025 workspace_mounted = -1; /* ditto */
3026
3027 r = path_is_mount_point(final, NULL, 0);
3028 if (r < 0)
3029 return r;
3030 if (r > 0) {
3031 /* If the final place already has something mounted, we use that. If the workspace also has
3032 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3033 * different). */
3034 final_mounted = true;
3035
3036 if (workspace_mounted < 0) {
f0353cf2 3037 /* If the final place is mounted, but the workspace isn't, then let's bind mount
bb0c0d6f
LP
3038 * the final version to the workspace, and make it writable, so that we can make
3039 * changes */
3040
21935150
LP
3041 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3042 if (r < 0)
3043 return r;
bb0c0d6f 3044
21935150
LP
3045 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3046 if (r < 0)
3047 return r;
bb0c0d6f
LP
3048
3049 workspace_mounted = true;
3050 }
3051 } else
3052 final_mounted = false;
3053
3054 if (workspace_mounted < 0) {
3055 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3056 for (int try = 0;; try++) {
3057
3058 if (try == 0) {
3059 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
3060 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3061 if (r >= 0) {
bb0c0d6f
LP
3062 workspace_mounted = true;
3063 break;
3064 }
3065
3066 } else if (try == 1) {
3067 _cleanup_free_ char *opts = NULL;
3068
43144be4 3069 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
3070 return -ENOMEM;
3071
3072 /* Fall back to "tmpfs" otherwise */
21935150
LP
3073 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3074 if (r >= 0) {
bb0c0d6f
LP
3075 workspace_mounted = true;
3076 break;
3077 }
3078
3079 } else {
3080 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
3081 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3082 if (r < 0) {
3083 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3084 return r;
bb0c0d6f
LP
3085
3086 if (must_mount) /* If we it's not OK to use the plain directory
3087 * fallback, propagate all errors too */
21935150 3088 return r;
bb0c0d6f
LP
3089
3090 /* If we lack privileges to bind mount stuff, then let's gracefully
3091 * proceed for compat with container envs, and just use the final dir
3092 * as is. */
3093
3094 workspace_mounted = false;
3095 break;
3096 }
3097
3098 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
3099 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3100 if (r < 0)
3101 return r;
bb0c0d6f
LP
3102
3103 workspace_mounted = true;
3104 break;
3105 }
3106 }
3107 }
3108
3109 assert(!must_mount || workspace_mounted > 0);
3110 where = workspace_mounted ? workspace : final;
3111
03bc11d1 3112 (void) label_fix_full(AT_FDCWD, where, final, 0);
e3a0a862 3113
d3dcf4e3 3114 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
3115 if (r < 0)
3116 return r;
3117
3118 if (workspace_mounted) {
3119 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
3120 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3121 if (r < 0)
3122 return r;
bb0c0d6f
LP
3123
3124 /* And mount it to the final place, read-only */
21935150
LP
3125 if (final_mounted)
3126 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3127 else
3128 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3129 if (r < 0)
3130 return r;
bb0c0d6f
LP
3131 } else {
3132 _cleanup_free_ char *parent = NULL;
3133
3134 /* If we do not have our own mount put used the plain directory fallback, then we need to
3135 * open access to the top-level credential directory and the per-service directory now */
3136
45519d13
LP
3137 r = path_extract_directory(final, &parent);
3138 if (r < 0)
3139 return r;
bb0c0d6f
LP
3140 if (chmod(parent, 0755) < 0)
3141 return -errno;
3142 }
3143
3144 return 0;
3145}
3146
3147static int setup_credentials(
3148 const ExecContext *context,
3149 const ExecParameters *params,
3150 const char *unit,
3151 uid_t uid) {
3152
3153 _cleanup_free_ char *p = NULL, *q = NULL;
bb0c0d6f
LP
3154 int r;
3155
3156 assert(context);
3157 assert(params);
3158
3159 if (!exec_context_has_credentials(context))
3160 return 0;
3161
3162 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3163 return -EINVAL;
3164
3165 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3166 * and the subdir we mount over with a read-only file system readable by the service's user */
3167 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3168 if (!q)
3169 return -ENOMEM;
3170
3171 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3172 if (r < 0 && r != -EEXIST)
3173 return r;
3174
3175 p = path_join(q, unit);
3176 if (!p)
3177 return -ENOMEM;
3178
3179 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3180 if (r < 0 && r != -EEXIST)
3181 return r;
3182
3183 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3184 if (r < 0) {
3185 _cleanup_free_ char *t = NULL, *u = NULL;
3186
3187 /* If this is not a privilege or support issue then propagate the error */
3188 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3189 return r;
3190
3191 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3192 * it into place, so that users can't access half-initialized credential stores. */
3193 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3194 if (!t)
3195 return -ENOMEM;
3196
3197 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3198 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3199 * after it is fully set up */
3200 u = path_join(t, unit);
3201 if (!u)
3202 return -ENOMEM;
3203
3204 FOREACH_STRING(i, t, u) {
3205 r = mkdir_label(i, 0700);
3206 if (r < 0 && r != -EEXIST)
3207 return r;
3208 }
3209
3210 r = setup_credentials_internal(
3211 context,
3212 params,
d3dcf4e3 3213 unit,
bb0c0d6f
LP
3214 p, /* final mount point */
3215 u, /* temporary workspace to overmount */
3216 true, /* reuse the workspace if it is already a mount */
3217 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3218 uid);
3219
3220 (void) rmdir(u); /* remove the workspace again if we can. */
3221
3222 if (r < 0)
3223 return r;
3224
3225 } else if (r == 0) {
3226
3227 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3228 * we can use the same directory for all cases, after turning off propagation. Question
3229 * though is: where do we turn off propagation exactly, and where do we place the workspace
3230 * directory? We need some place that is guaranteed to be a mount point in the host, and
3231 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3232 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3233 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3234 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3235 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3236 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3237 * propagation on the former, and then overmount the latter.
3238 *
3239 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3240 * for this purpose, but there are few other candidates that work equally well for us, and
3241 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3242 * that no one else sees this should be OK to do. */
bb0c0d6f 3243
21935150
LP
3244 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3245 if (r < 0)
bb0c0d6f
LP
3246 goto child_fail;
3247
3248 r = setup_credentials_internal(
3249 context,
3250 params,
d3dcf4e3 3251 unit,
bb0c0d6f
LP
3252 p, /* final mount point */
3253 "/dev/shm", /* temporary workspace to overmount */
3254 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3255 true, /* insist that something is mounted, do not allow fallback to plain directory */
3256 uid);
3257 if (r < 0)
3258 goto child_fail;
3259
3260 _exit(EXIT_SUCCESS);
3261
3262 child_fail:
3263 _exit(EXIT_FAILURE);
3264 }
3265
3266 return 0;
3267}
3268
#if ENABLE_SMACK
/* Applies the SMACK process label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context configures an explicit process label, that one is applied. Otherwise, if the manager has a
 * default process label, the exec label read from 'executable_fd' is applied if there is one, and the
 * manager's default label if not. If neither label source is configured nothing is done.
 *
 * Returns 0 on success, a negative errno-style value on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(manager); /* dereferenced below, hence guard it like the other parameters */
        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;
        } else if (manager->default_smack_process_label) {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing xattr is not an error, the default label is used in that case */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
                if (r < 0)
                        return r;
        }

        return 0;
}
#endif
cefc33ae 3298
6c47cd7d
LP
3299static int compile_bind_mounts(
3300 const ExecContext *context,
3301 const ExecParameters *params,
3302 BindMount **ret_bind_mounts,
da6053d0 3303 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3304 char ***ret_empty_directories) {
3305
3306 _cleanup_strv_free_ char **empty_directories = NULL;
3307 BindMount *bind_mounts;
5b10116e 3308 size_t n, h = 0;
6c47cd7d
LP
3309 int r;
3310
3311 assert(context);
3312 assert(params);
3313 assert(ret_bind_mounts);
3314 assert(ret_n_bind_mounts);
3315 assert(ret_empty_directories);
3316
3317 n = context->n_bind_mounts;
5b10116e 3318 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3319 if (!params->prefix[t])
3320 continue;
3321
a2ab603c
YW
3322 for (size_t i = 0; i < context->directories[t].n_items; i++)
3323 n += !context->directories[t].items[i].only_create;
6c47cd7d
LP
3324 }
3325
3326 if (n <= 0) {
3327 *ret_bind_mounts = NULL;
3328 *ret_n_bind_mounts = 0;
3329 *ret_empty_directories = NULL;
3330 return 0;
3331 }
3332
3333 bind_mounts = new(BindMount, n);
3334 if (!bind_mounts)
3335 return -ENOMEM;
3336
5b10116e 3337 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3338 BindMount *item = context->bind_mounts + i;
3339 char *s, *d;
3340
3341 s = strdup(item->source);
3342 if (!s) {
3343 r = -ENOMEM;
3344 goto finish;
3345 }
3346
3347 d = strdup(item->destination);
3348 if (!d) {
3349 free(s);
3350 r = -ENOMEM;
3351 goto finish;
3352 }
3353
3354 bind_mounts[h++] = (BindMount) {
3355 .source = s,
3356 .destination = d,
3357 .read_only = item->read_only,
3358 .recursive = item->recursive,
3359 .ignore_enoent = item->ignore_enoent,
3360 };
3361 }
3362
5b10116e 3363 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3364 if (!params->prefix[t])
3365 continue;
3366
211a3d87 3367 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3368 continue;
3369
494d0247 3370 if (exec_directory_is_private(context, t) &&
74e12520 3371 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3372 char *private_root;
3373
3374 /* So this is for a dynamic user, and we need to make sure the process can access its own
3375 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3376 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3377
657ee2d8 3378 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3379 if (!private_root) {
3380 r = -ENOMEM;
3381 goto finish;
3382 }
3383
3384 r = strv_consume(&empty_directories, private_root);
a635a7ae 3385 if (r < 0)
6c47cd7d 3386 goto finish;
6c47cd7d
LP
3387 }
3388
211a3d87 3389 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3390 char *s, *d;
3391
a2ab603c
YW
3392 /* When one of the parent directories is in the list, we cannot create the symlink
3393 * for the child directory. See also the comments in setup_exec_directory(). */
3394 if (context->directories[t].items[i].only_create)
3395 continue;
3396
494d0247 3397 if (exec_directory_is_private(context, t))
211a3d87 3398 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3399 else
211a3d87 3400 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3401 if (!s) {
3402 r = -ENOMEM;
3403 goto finish;
3404 }
3405
494d0247 3406 if (exec_directory_is_private(context, t) &&
74e12520 3407 exec_context_with_rootfs(context))
5609f688
YW
3408 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3409 * directory is not created on the root directory. So, let's bind-mount the directory
3410 * on the 'non-private' place. */
211a3d87 3411 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3412 else
3413 d = strdup(s);
6c47cd7d
LP
3414 if (!d) {
3415 free(s);
3416 r = -ENOMEM;
3417 goto finish;
3418 }
3419
3420 bind_mounts[h++] = (BindMount) {
3421 .source = s,
3422 .destination = d,
3423 .read_only = false,
9ce4e4b0 3424 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3425 .recursive = true,
3426 .ignore_enoent = false,
3427 };
3428 }
3429 }
3430
3431 assert(h == n);
3432
3433 *ret_bind_mounts = bind_mounts;
3434 *ret_n_bind_mounts = n;
ae2a15bc 3435 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3436
3437 return (int) n;
3438
3439finish:
3440 bind_mount_free_many(bind_mounts, h);
3441 return r;
3442}
3443
df61e79a
LB
3444/* ret_symlinks will contain a list of pairs src:dest that describes
3445 * the symlinks to create later on. For example, the symlinks needed
3446 * to safely give private directories to DynamicUser=1 users. */
3447static int compile_symlinks(
3448 const ExecContext *context,
3449 const ExecParameters *params,
3450 char ***ret_symlinks) {
3451
3452 _cleanup_strv_free_ char **symlinks = NULL;
3453 int r;
3454
3455 assert(context);
3456 assert(params);
3457 assert(ret_symlinks);
3458
3459 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3460 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3461 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 3462
211a3d87
LB
3463 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3464 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3465
211a3d87
LB
3466 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3467 dst_abs = path_join(params->prefix[dt], *symlink);
3468 if (!src_abs || !dst_abs)
3469 return -ENOMEM;
df61e79a 3470
211a3d87
LB
3471 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3472 if (r < 0)
3473 return r;
3474 }
3475
a2ab603c
YW
3476 if (!exec_directory_is_private(context, dt) ||
3477 exec_context_with_rootfs(context) ||
3478 context->directories[dt].items[i].only_create)
211a3d87
LB
3479 continue;
3480
3481 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3482 if (!private_path)
3483 return -ENOMEM;
3484
211a3d87 3485 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3486 if (!path)
3487 return -ENOMEM;
3488
3489 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3490 if (r < 0)
3491 return r;
3492 }
3493 }
3494
3495 *ret_symlinks = TAKE_PTR(symlinks);
3496
3497 return 0;
3498}
3499
4e677599
LP
3500static bool insist_on_sandboxing(
3501 const ExecContext *context,
3502 const char *root_dir,
3503 const char *root_image,
3504 const BindMount *bind_mounts,
3505 size_t n_bind_mounts) {
3506
4e677599
LP
3507 assert(context);
3508 assert(n_bind_mounts == 0 || bind_mounts);
3509
3510 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3511 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3512 * rearrange stuff in a way we cannot ignore gracefully. */
3513
3514 if (context->n_temporary_filesystems > 0)
3515 return true;
3516
3517 if (root_dir || root_image)
3518 return true;
3519
b3d13314
LB
3520 if (context->n_mount_images > 0)
3521 return true;
3522
4e677599
LP
3523 if (context->dynamic_user)
3524 return true;
3525
4355c04f
LB
3526 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3527 return true;
3528
4e677599
LP
3529 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3530 * essential. */
5b10116e 3531 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3532 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3533 return true;
3534
91dd5f7c
LP
3535 if (context->log_namespace)
3536 return true;
3537
4e677599
LP
3538 return false;
3539}
3540
6818c54c 3541static int apply_mount_namespace(
34cf6c43 3542 const Unit *u,
9f71ba8d 3543 ExecCommandFlags command_flags,
6818c54c
LP
3544 const ExecContext *context,
3545 const ExecParameters *params,
7cc5ef5f
ZJS
3546 const ExecRuntime *runtime,
3547 char **error_path) {
6818c54c 3548
df61e79a 3549 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3550 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3551 const char *root_dir = NULL, *root_image = NULL;
24759d8f
LB
3552 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3553 *extension_dir = NULL;
228af36f 3554 NamespaceInfo ns_info;
165a31c0 3555 bool needs_sandboxing;
6c47cd7d 3556 BindMount *bind_mounts = NULL;
da6053d0 3557 size_t n_bind_mounts = 0;
6818c54c 3558 int r;
93c6bb51 3559
2b3c1b9e
DH
3560 assert(context);
3561
915e6d16
LP
3562 if (params->flags & EXEC_APPLY_CHROOT) {
3563 root_image = context->root_image;
3564
3565 if (!root_image)
3566 root_dir = context->root_directory;
3567 }
93c6bb51 3568
6c47cd7d
LP
3569 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3570 if (r < 0)
3571 return r;
3572
211a3d87 3573 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3574 r = compile_symlinks(context, params, &symlinks);
3575 if (r < 0)
41abd7f6 3576 goto finalize;
df61e79a 3577
9f71ba8d 3578 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3579 if (needs_sandboxing) {
3580 /* The runtime struct only contains the parent of the private /tmp,
3581 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3582 * that is sticky, and that's the one we want to use here.
3583 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3584
3585 if (context->private_tmp && runtime) {
56a13a49
ZJS
3586 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3587 tmp_dir = runtime->tmp_dir;
3588 else if (runtime->tmp_dir)
3589 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3590
3591 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3592 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3593 else if (runtime->var_tmp_dir)
56a13a49 3594 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3595 }
3596
b5a33299
YW
3597 ns_info = (NamespaceInfo) {
3598 .ignore_protect_paths = false,
3599 .private_dev = context->private_devices,
3600 .protect_control_groups = context->protect_control_groups,
3601 .protect_kernel_tunables = context->protect_kernel_tunables,
3602 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3603 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3604 .protect_hostname = context->protect_hostname,
5e98086d 3605 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
52b3d652
LP
3606 .protect_home = context->protect_home,
3607 .protect_system = context->protect_system,
4e399953
LP
3608 .protect_proc = context->protect_proc,
3609 .proc_subset = context->proc_subset,
c2da3bf2 3610 .private_network = exec_needs_network_namespace(context),
fde36d25 3611 .private_ipc = exec_needs_ipc_namespace(context),
6720e356 3612 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3613 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3614 };
ecf63c91 3615 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3616 /*
3617 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3618 * sandbox info, otherwise enforce it, don't ignore protected paths and
3619 * fail if we are enable to apply the sandbox inside the mount namespace.
3620 */
3621 ns_info = (NamespaceInfo) {
3622 .ignore_protect_paths = true,
3623 };
3624 else
3625 ns_info = (NamespaceInfo) {};
b5a33299 3626
37ed15d7
FB
3627 if (context->mount_flags == MS_SHARED)
3628 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3629
a631cbfa
LP
3630 if (exec_context_has_credentials(context) &&
3631 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3632 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3633 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3634 if (!creds_path) {
3635 r = -ENOMEM;
3636 goto finalize;
3637 }
bbb4e7f3
LP
3638 }
3639
5e8deb94
LB
3640 if (MANAGER_IS_SYSTEM(u->manager)) {
3641 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3642 if (!propagate_dir) {
3643 r = -ENOMEM;
3644 goto finalize;
3645 }
3646
5e8deb94 3647 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3648 if (!incoming_dir) {
3649 r = -ENOMEM;
3650 goto finalize;
3651 }
24759d8f
LB
3652
3653 extension_dir = strdup("/run/systemd/unit-extensions");
3654 if (!extension_dir) {
3655 r = -ENOMEM;
3656 goto finalize;
3657 }
3658 } else
3659 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3660 r = -ENOMEM;
3661 goto finalize;
3662 }
5e8deb94 3663
18d73705 3664 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3665 &ns_info, context->read_write_paths,
165a31c0
LP
3666 needs_sandboxing ? context->read_only_paths : NULL,
3667 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3668 needs_sandboxing ? context->exec_paths : NULL,
3669 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3670 empty_directories,
df61e79a 3671 symlinks,
6c47cd7d
LP
3672 bind_mounts,
3673 n_bind_mounts,
2abd4e38
YW
3674 context->temporary_filesystems,
3675 context->n_temporary_filesystems,
b3d13314
LB
3676 context->mount_images,
3677 context->n_mount_images,
56a13a49
ZJS
3678 tmp_dir,
3679 var_tmp_dir,
bbb4e7f3 3680 creds_path,
91dd5f7c 3681 context->log_namespace,
915e6d16 3682 context->mount_flags,
d4d55b0d
LB
3683 context->root_hash, context->root_hash_size, context->root_hash_path,
3684 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3685 context->root_verity,
93f59701
LB
3686 context->extension_images,
3687 context->n_extension_images,
a07b9926 3688 context->extension_directories,
5e8deb94
LB
3689 propagate_dir,
3690 incoming_dir,
24759d8f 3691 extension_dir,
3bdc25a4 3692 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3693 error_path);
93c6bb51 3694
1beab8b0 3695 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3696 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3697 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3698 * completely different execution environment. */
aca835ed 3699 if (r == -ENOANO) {
4e677599
LP
3700 if (insist_on_sandboxing(
3701 context,
3702 root_dir, root_image,
3703 bind_mounts,
3704 n_bind_mounts)) {
3705 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3706 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3707 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3708
3709 r = -EOPNOTSUPP;
3710 } else {
aca835ed 3711 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3712 r = 0;
aca835ed 3713 }
93c6bb51
DH
3714 }
3715
8062e643 3716finalize:
4e677599 3717 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3718 return r;
3719}
3720
915e6d16
LP
3721static int apply_working_directory(
3722 const ExecContext *context,
3723 const ExecParameters *params,
3724 const char *home,
376fecf6 3725 int *exit_status) {
915e6d16 3726
6732edab 3727 const char *d, *wd;
2b3c1b9e
DH
3728
3729 assert(context);
376fecf6 3730 assert(exit_status);
2b3c1b9e 3731
6732edab
LP
3732 if (context->working_directory_home) {
3733
376fecf6
LP
3734 if (!home) {
3735 *exit_status = EXIT_CHDIR;
6732edab 3736 return -ENXIO;
376fecf6 3737 }
6732edab 3738
2b3c1b9e 3739 wd = home;
6732edab 3740
14eb3285
LP
3741 } else
3742 wd = empty_to_root(context->working_directory);
e7f1e7c6 3743
fa97f630 3744 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3745 d = wd;
fa97f630 3746 else
3b0e5bb5 3747 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3748
376fecf6
LP
3749 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3750 *exit_status = EXIT_CHDIR;
2b3c1b9e 3751 return -errno;
376fecf6 3752 }
e7f1e7c6
DH
3753
3754 return 0;
3755}
3756
fa97f630
JB
3757static int apply_root_directory(
3758 const ExecContext *context,
3759 const ExecParameters *params,
3760 const bool needs_mount_ns,
3761 int *exit_status) {
3762
3763 assert(context);
3764 assert(exit_status);
3765
5b10116e 3766 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3767 if (!needs_mount_ns && context->root_directory)
3768 if (chroot(context->root_directory) < 0) {
3769 *exit_status = EXIT_CHROOT;
3770 return -errno;
3771 }
fa97f630
JB
3772
3773 return 0;
3774}
3775
b1edf445 3776static int setup_keyring(
34cf6c43 3777 const Unit *u,
b1edf445
LP
3778 const ExecContext *context,
3779 const ExecParameters *p,
3780 uid_t uid, gid_t gid) {
3781
74dd6b51 3782 key_serial_t keyring;
e64c2d0b
DJL
3783 int r = 0;
3784 uid_t saved_uid;
3785 gid_t saved_gid;
74dd6b51
LP
3786
3787 assert(u);
b1edf445 3788 assert(context);
74dd6b51
LP
3789 assert(p);
3790
3791 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3792 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3793 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3794 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3795 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3796 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3797
b1edf445
LP
3798 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3799 return 0;
3800
e64c2d0b
DJL
3801 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3802 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3803 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3804 * & group is just as nasty as acquiring a reference to the user keyring. */
3805
3806 saved_uid = getuid();
3807 saved_gid = getgid();
3808
3809 if (gid_is_valid(gid) && gid != saved_gid) {
3810 if (setregid(gid, -1) < 0)
3811 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3812 }
3813
3814 if (uid_is_valid(uid) && uid != saved_uid) {
3815 if (setreuid(uid, -1) < 0) {
3816 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3817 goto out;
3818 }
3819 }
3820
74dd6b51
LP
3821 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3822 if (keyring == -1) {
3823 if (errno == ENOSYS)
8002fb97 3824 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3825 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3826 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3827 else if (errno == EDQUOT)
8002fb97 3828 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3829 else
e64c2d0b 3830 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3831
e64c2d0b 3832 goto out;
74dd6b51
LP
3833 }
3834
e64c2d0b
DJL
3835 /* When requested link the user keyring into the session keyring. */
3836 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3837
3838 if (keyctl(KEYCTL_LINK,
3839 KEY_SPEC_USER_KEYRING,
3840 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3841 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3842 goto out;
3843 }
3844 }
3845
3846 /* Restore uid/gid back */
3847 if (uid_is_valid(uid) && uid != saved_uid) {
3848 if (setreuid(saved_uid, -1) < 0) {
3849 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3850 goto out;
3851 }
3852 }
3853
3854 if (gid_is_valid(gid) && gid != saved_gid) {
3855 if (setregid(saved_gid, -1) < 0)
3856 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3857 }
3858
3859 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3860 if (!sd_id128_is_null(u->invocation_id)) {
3861 key_serial_t key;
3862
3863 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3864 if (key == -1)
8002fb97 3865 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3866 else {
3867 if (keyctl(KEYCTL_SETPERM, key,
3868 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3869 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3870 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3871 }
3872 }
3873
e64c2d0b 3874out:
37b22b3b 3875 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3876 /* no extra logging, as only the first already reported error matters */
3877 if (getuid() != saved_uid)
3878 (void) setreuid(saved_uid, -1);
b1edf445 3879
e64c2d0b
DJL
3880 if (getgid() != saved_gid)
3881 (void) setregid(saved_gid, -1);
b1edf445 3882
e64c2d0b 3883 return r;
74dd6b51
LP
3884}
3885
/* Appends the valid (i.e. non-negative) fds of a two-element socket pair to array[], advancing *n by the
 * number of entries written. The caller must guarantee array[] has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                (*n)++;
        }
}
3896
a34ceba6
LP
3897static int close_remaining_fds(
3898 const ExecParameters *params,
34cf6c43
YW
3899 const ExecRuntime *runtime,
3900 const DynamicCreds *dcreds,
00d9ef85 3901 int user_lookup_fd,
a34ceba6 3902 int socket_fd,
5b8d1f6b 3903 const int *fds, size_t n_fds) {
a34ceba6 3904
da6053d0 3905 size_t n_dont_close = 0;
00d9ef85 3906 int dont_close[n_fds + 12];
a34ceba6
LP
3907
3908 assert(params);
3909
3910 if (params->stdin_fd >= 0)
3911 dont_close[n_dont_close++] = params->stdin_fd;
3912 if (params->stdout_fd >= 0)
3913 dont_close[n_dont_close++] = params->stdout_fd;
3914 if (params->stderr_fd >= 0)
3915 dont_close[n_dont_close++] = params->stderr_fd;
3916
3917 if (socket_fd >= 0)
3918 dont_close[n_dont_close++] = socket_fd;
3919 if (n_fds > 0) {
3920 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3921 n_dont_close += n_fds;
3922 }
3923
a70581ff 3924 if (runtime) {
29206d46 3925 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3926 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3927 }
29206d46
LP
3928
3929 if (dcreds) {
3930 if (dcreds->user)
3931 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3932 if (dcreds->group)
3933 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3934 }
3935
00d9ef85
LP
3936 if (user_lookup_fd >= 0)
3937 dont_close[n_dont_close++] = user_lookup_fd;
3938
a34ceba6
LP
3939 return close_all_fds(dont_close, n_dont_close);
3940}
3941
00d9ef85
LP
3942static int send_user_lookup(
3943 Unit *unit,
3944 int user_lookup_fd,
3945 uid_t uid,
3946 gid_t gid) {
3947
3948 assert(unit);
3949
3950 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3951 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3952 * specified. */
3953
3954 if (user_lookup_fd < 0)
3955 return 0;
3956
3957 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3958 return 0;
3959
3960 if (writev(user_lookup_fd,
3961 (struct iovec[]) {
e6a7ec4b
LP
3962 IOVEC_INIT(&uid, sizeof(uid)),
3963 IOVEC_INIT(&gid, sizeof(gid)),
3964 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3965 return -errno;
3966
3967 return 0;
3968}
3969
6732edab
LP
3970static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3971 int r;
3972
3973 assert(c);
3974 assert(home);
3975 assert(buf);
3976
3977 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3978
3979 if (*home)
3980 return 0;
3981
3982 if (!c->working_directory_home)
3983 return 0;
3984
6732edab
LP
3985 r = get_home_dir(buf);
3986 if (r < 0)
3987 return r;
3988
3989 *home = *buf;
3990 return 1;
3991}
3992
da50b85a
LP
3993static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3994 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3995 int r;
3996
3997 assert(c);
3998 assert(p);
3999 assert(ret);
4000
4001 assert(c->dynamic_user);
4002
4003 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
4004 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
4005 * directories. */
4006
5b10116e 4007 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
4008 if (t == EXEC_DIRECTORY_CONFIGURATION)
4009 continue;
4010
4011 if (!p->prefix[t])
4012 continue;
4013
211a3d87 4014 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
4015 char *e;
4016
494d0247 4017 if (exec_directory_is_private(c, t))
211a3d87 4018 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 4019 else
211a3d87 4020 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
4021 if (!e)
4022 return -ENOMEM;
4023
4024 r = strv_consume(&list, e);
4025 if (r < 0)
4026 return r;
4027 }
4028 }
4029
ae2a15bc 4030 *ret = TAKE_PTR(list);
da50b85a
LP
4031
4032 return 0;
4033}
4034
78f93209
LP
4035static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4036 bool using_subcgroup;
4037 char *p;
4038
4039 assert(params);
4040 assert(ret);
4041
4042 if (!params->cgroup_path)
4043 return -EINVAL;
4044
4045 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4046 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4047 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4048 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4049 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4050 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4051 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4052 * flag, which is only passed for the former statements, not for the latter. */
4053
4054 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4055 if (using_subcgroup)
657ee2d8 4056 p = path_join(params->cgroup_path, ".control");
78f93209
LP
4057 else
4058 p = strdup(params->cgroup_path);
4059 if (!p)
4060 return -ENOMEM;
4061
4062 *ret = p;
4063 return using_subcgroup;
4064}
4065
e2b2fb7f
MS
4066static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4067 _cleanup_(cpu_set_reset) CPUSet s = {};
4068 int r;
4069
4070 assert(c);
4071 assert(ret);
4072
4073 if (!c->numa_policy.nodes.set) {
4074 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4075 return 0;
4076 }
4077
4078 r = numa_to_cpu_set(&c->numa_policy, &s);
4079 if (r < 0)
4080 return r;
4081
4082 cpu_set_reset(ret);
4083
4084 return cpu_set_add_all(ret, &s);
4085}
4086
4087bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4088 assert(c);
4089
4090 return c->cpu_affinity_from_numa;
4091}
4092
1da37e58
ZJS
4093static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4094 int r;
4095
4096 assert(fds);
4097 assert(n_fds);
4098 assert(*n_fds < fds_size);
4099 assert(ret_fd);
4100
4101 if (fd < 0) {
254d1313 4102 *ret_fd = -EBADF;
1da37e58
ZJS
4103 return 0;
4104 }
4105
4106 if (fd < 3 + (int) *n_fds) {
4107 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4108 * the fds we pass to the process (or which are closed only during execve). */
4109
4110 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4111 if (r < 0)
4112 return -errno;
4113
ee3455cf 4114 close_and_replace(fd, r);
1da37e58
ZJS
4115 }
4116
4117 *ret_fd = fds[*n_fds] = fd;
4118 (*n_fds) ++;
4119 return 1;
4120}
4121
cd48e23f
RP
4122static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
4123 union sockaddr_union addr = {
4124 .un.sun_family = AF_UNIX,
4125 };
4126 socklen_t sa_len;
4127 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
4128 int r;
4129
4130 assert(u);
4131 assert(of);
4132 assert(ofd >= 0);
4133
4134 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
4135 if (r < 0)
4136 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
4137
4138 sa_len = r;
4139
4140 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
4141 _cleanup_close_ int fd = -EBADF;
4142
4143 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
4144 if (fd < 0)
4145 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
4146
4147 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
4148 if (r == -EPROTOTYPE)
4149 continue;
4150 if (r < 0)
4151 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
4152
4153 return TAKE_FD(fd);
4154 }
4155
4156 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
4157}
4158
4159static int get_open_file_fd(Unit *u, const OpenFile *of) {
4160 struct stat st;
4161 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
4162
4163 assert(u);
4164 assert(of);
4165
4166 ofd = open(of->path, O_PATH | O_CLOEXEC);
4167 if (ofd < 0)
4168 return log_error_errno(errno, "Could not open \"%s\": %m", of->path);
4169 if (fstat(ofd, &st) < 0)
4170 return log_error_errno(errno, "Failed to stat %s: %m", of->path);
4171
4172 if (S_ISSOCK(st.st_mode)) {
4173 fd = connect_unix_harder(u, of, ofd);
4174 if (fd < 0)
4175 return fd;
4176
4177 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
4178 return log_error_errno(errno, "Failed to shutdown send for socket %s: %m", of->path);
4179
4180 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
4181 } else {
4182 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
4183 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
4184 flags |= O_APPEND;
4185 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
4186 flags |= O_TRUNC;
4187
4188 fd = fd_reopen(ofd, flags | O_CLOEXEC);
4189 if (fd < 0)
4190 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
4191
4192 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
4193 }
4194
4195 return TAKE_FD(fd);
4196}
4197
4198static int collect_open_file_fds(
4199 Unit *u,
4200 OpenFile* open_files,
4201 int **fds,
4202 char ***fdnames,
4203 size_t *n_fds) {
4204 int r;
4205
4206 assert(u);
4207 assert(fds);
4208 assert(fdnames);
4209 assert(n_fds);
4210
4211 LIST_FOREACH(open_files, of, open_files) {
4212 _cleanup_close_ int fd = -EBADF;
4213
4214 fd = get_open_file_fd(u, of);
4215 if (fd < 0) {
4216 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
4217 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
4218 continue;
4219 }
4220
4221 return fd;
4222 }
4223
4224 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
4225 return -ENOMEM;
4226
4227 r = strv_extend(fdnames, of->fdname);
4228 if (r < 0)
4229 return r;
4230
4231 (*fds)[*n_fds] = TAKE_FD(fd);
4232
4233 (*n_fds)++;
4234 }
4235
4236 return 0;
4237}
4238
ff0af2a1 4239static int exec_child(
f2341e0a 4240 Unit *unit,
34cf6c43 4241 const ExecCommand *command,
ff0af2a1
LP
4242 const ExecContext *context,
4243 const ExecParameters *params,
4244 ExecRuntime *runtime,
29206d46 4245 DynamicCreds *dcreds,
ff0af2a1 4246 int socket_fd,
2caa38e9 4247 const int named_iofds[static 3],
cd48e23f 4248 int *params_fds,
da6053d0 4249 size_t n_socket_fds,
25b583d7 4250 size_t n_storage_fds,
ff0af2a1 4251 char **files_env,
00d9ef85 4252 int user_lookup_fd,
12145637 4253 int *exit_status) {
d35fbf6b 4254
8c35c10d 4255 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 4256 int r, ngids = 0, exec_fd;
4d885bd3
DH
4257 _cleanup_free_ gid_t *supplementary_gids = NULL;
4258 const char *username = NULL, *groupname = NULL;
5686391b 4259 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 4260 const char *home = NULL, *shell = NULL;
7ca69792 4261 char **final_argv = NULL;
7bce046b
LP
4262 dev_t journal_stream_dev = 0;
4263 ino_t journal_stream_ino = 0;
5749f855 4264 bool userns_set_up = false;
165a31c0
LP
4265 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4266 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4267 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4268 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 4269#if HAVE_SELINUX
7f59dd35 4270 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 4271 bool use_selinux = false;
ecfbc84f 4272#endif
f9fa32f0 4273#if ENABLE_SMACK
43b1f709 4274 bool use_smack = false;
ecfbc84f 4275#endif
349cc4a5 4276#if HAVE_APPARMOR
43b1f709 4277 bool use_apparmor = false;
ecfbc84f 4278#endif
5749f855
AZ
4279 uid_t saved_uid = getuid();
4280 gid_t saved_gid = getgid();
fed1e721
LP
4281 uid_t uid = UID_INVALID;
4282 gid_t gid = GID_INVALID;
1da37e58
ZJS
4283 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4284 n_keep_fds; /* total number of fds not to close */
165a31c0 4285 int secure_bits;
afb11bf1
DG
4286 _cleanup_free_ gid_t *gids_after_pam = NULL;
4287 int ngids_after_pam = 0;
cd48e23f
RP
4288 _cleanup_free_ int *fds = NULL;
4289 _cleanup_strv_free_ char **fdnames = NULL;
034c6ed7 4290
f2341e0a 4291 assert(unit);
5cb5a6ff
LP
4292 assert(command);
4293 assert(context);
d35fbf6b 4294 assert(params);
ff0af2a1 4295 assert(exit_status);
d35fbf6b 4296
69339ae9
LP
4297 /* Explicitly test for CVE-2021-4034 inspired invocations */
4298 assert(command->path);
4299 assert(!strv_isempty(command->argv));
4300
d35fbf6b
DM
4301 rename_process_from_path(command->path);
4302
9c274488
LP
4303 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4304 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4305 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4306 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4307 SIGNALS_IGNORE);
d35fbf6b
DM
4308
4309 if (context->ignore_sigpipe)
9c274488 4310 (void) ignore_signals(SIGPIPE);
d35fbf6b 4311
ff0af2a1
LP
4312 r = reset_signal_mask();
4313 if (r < 0) {
4314 *exit_status = EXIT_SIGNAL_MASK;
12145637 4315 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4316 }
034c6ed7 4317
d35fbf6b
DM
4318 if (params->idle_pipe)
4319 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4320
2c027c62
LP
4321 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4322 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4323 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4324 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4325
d35fbf6b 4326 log_forget_fds();
2c027c62 4327 log_set_open_when_needed(true);
4f2d528d 4328
40a80078
LP
4329 /* In case anything used libc syslog(), close this here, too */
4330 closelog();
4331
cd48e23f
RP
4332 fds = newdup(int, params_fds, n_fds);
4333 if (!fds) {
4334 *exit_status = EXIT_MEMORY;
4335 return log_oom();
4336 }
4337
4338 fdnames = strv_copy((char**) params->fd_names);
4339 if (!fdnames) {
4340 *exit_status = EXIT_MEMORY;
4341 return log_oom();
4342 }
4343
4344 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4345 if (r < 0) {
4346 *exit_status = EXIT_FDS;
4347 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4348 }
4349
b1994387 4350 int keep_fds[n_fds + 3];
1da37e58
ZJS
4351 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4352 n_keep_fds = n_fds;
4353
4354 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4355 if (r < 0) {
4356 *exit_status = EXIT_FDS;
4357 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4358 }
4359
b1994387 4360#if HAVE_LIBBPF
46004616
ZJS
4361 if (unit->manager->restrict_fs) {
4362 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4363 if (bpf_map_fd < 0) {
4364 *exit_status = EXIT_FDS;
46004616 4365 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4366 }
4367
4368 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4369 if (r < 0) {
4370 *exit_status = EXIT_FDS;
4371 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4372 }
4373 }
4374#endif
4375
1da37e58 4376 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4377 if (r < 0) {
4378 *exit_status = EXIT_FDS;
12145637 4379 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4380 }
4381
0af07108
ZJS
4382 if (!context->same_pgrp &&
4383 setsid() < 0) {
4384 *exit_status = EXIT_SETSID;
4385 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4386 }
9e2f7c11 4387
1e22b5cd 4388 exec_context_tty_reset(context, params);
d35fbf6b 4389
c891efaf 4390 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4391 _cleanup_free_ char *cmdline = NULL;
4392
4ef15008 4393 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4394 if (!cmdline) {
0460aa5c 4395 *exit_status = EXIT_MEMORY;
12145637 4396 return log_oom();
3b20f877 4397 }
d35fbf6b 4398
4ef15008 4399 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4400 if (r != CONFIRM_EXECUTE) {
4401 if (r == CONFIRM_PRETEND_SUCCESS) {
4402 *exit_status = EXIT_SUCCESS;
4403 return 0;
4404 }
ff0af2a1 4405 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4406 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4407 "Execution cancelled by the user");
d35fbf6b
DM
4408 }
4409 }
1a63a750 4410
d521916d
LP
4411 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4412 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4413 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4414 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4415 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4416 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4417 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4418 *exit_status = EXIT_MEMORY;
4419 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4420 }
4421
29206d46 4422 if (context->dynamic_user && dcreds) {
da50b85a 4423 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4424
d521916d 4425 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4426 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4427 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4428 *exit_status = EXIT_USER;
12145637 4429 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4430 }
4431
da50b85a
LP
4432 r = compile_suggested_paths(context, params, &suggested_paths);
4433 if (r < 0) {
4434 *exit_status = EXIT_MEMORY;
4435 return log_oom();
4436 }
4437
4438 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4439 if (r < 0) {
4440 *exit_status = EXIT_USER;
d85ff944
YW
4441 if (r == -EILSEQ)
4442 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4443 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4444 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4445 }
524daa8c 4446
70dd455c 4447 if (!uid_is_valid(uid)) {
29206d46 4448 *exit_status = EXIT_USER;
d85ff944 4449 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4450 }
4451
4452 if (!gid_is_valid(gid)) {
4453 *exit_status = EXIT_USER;
d85ff944 4454 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4455 }
5bc7452b 4456
29206d46
LP
4457 if (dcreds->user)
4458 username = dcreds->user->name;
4459
4460 } else {
4d885bd3
DH
4461 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4462 if (r < 0) {
4463 *exit_status = EXIT_USER;
12145637 4464 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4465 }
5bc7452b 4466
4d885bd3
DH
4467 r = get_fixed_group(context, &groupname, &gid);
4468 if (r < 0) {
4469 *exit_status = EXIT_GROUP;
12145637 4470 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4471 }
cdc5d5c5 4472 }
29206d46 4473
cdc5d5c5
DH
4474 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4475 r = get_supplementary_groups(context, username, groupname, gid,
4476 &supplementary_gids, &ngids);
4477 if (r < 0) {
4478 *exit_status = EXIT_GROUP;
12145637 4479 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4480 }
5bc7452b 4481
00d9ef85
LP
4482 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4483 if (r < 0) {
4484 *exit_status = EXIT_USER;
12145637 4485 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4486 }
4487
4488 user_lookup_fd = safe_close(user_lookup_fd);
4489
6732edab
LP
4490 r = acquire_home(context, uid, &home, &home_buffer);
4491 if (r < 0) {
4492 *exit_status = EXIT_CHDIR;
12145637 4493 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4494 }
4495
d35fbf6b
DM
4496 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4497 * must sure to drop O_NONBLOCK */
4498 if (socket_fd >= 0)
a34ceba6 4499 (void) fd_nonblock(socket_fd, false);
acbb0225 4500
4c70a4a7
MS
4501 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4502 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4503 if (params->cgroup_path) {
4504 _cleanup_free_ char *p = NULL;
4505
4506 r = exec_parameters_get_cgroup_path(params, &p);
4507 if (r < 0) {
4508 *exit_status = EXIT_CGROUP;
4509 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4510 }
4511
4512 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4513 if (r == -EUCLEAN) {
4514 *exit_status = EXIT_CGROUP;
4515 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4516 "because the cgroup or one of its parents or "
4517 "siblings is in the threaded mode: %m", p);
4518 }
4c70a4a7
MS
4519 if (r < 0) {
4520 *exit_status = EXIT_CGROUP;
4521 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4522 }
4523 }
4524
a8d08f39 4525 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4526 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4527 if (r < 0) {
4528 *exit_status = EXIT_NETWORK;
4529 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4530 }
4531 }
4532
a70581ff
XR
4533 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4534 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4535 if (r < 0) {
4536 *exit_status = EXIT_NAMESPACE;
4537 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4538 }
4539 }
4540
52c239d7 4541 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4542 if (r < 0) {
4543 *exit_status = EXIT_STDIN;
12145637 4544 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4545 }
034c6ed7 4546
52c239d7 4547 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4548 if (r < 0) {
4549 *exit_status = EXIT_STDOUT;
12145637 4550 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4551 }
4552
52c239d7 4553 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4554 if (r < 0) {
4555 *exit_status = EXIT_STDERR;
12145637 4556 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4557 }
4558
d35fbf6b 4559 if (context->oom_score_adjust_set) {
9f8168eb
LP
4560 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4561 * prohibit write access to this file, and we shouldn't trip up over that. */
4562 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4563 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4564 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4565 else if (r < 0) {
ff0af2a1 4566 *exit_status = EXIT_OOM_ADJUST;
12145637 4567 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4568 }
d35fbf6b
DM
4569 }
4570
ad21e542
ZJS
4571 if (context->coredump_filter_set) {
4572 r = set_coredump_filter(context->coredump_filter);
4573 if (ERRNO_IS_PRIVILEGE(r))
4574 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4575 else if (r < 0)
4576 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4577 }
4578
39090201
DJL
4579 if (context->nice_set) {
4580 r = setpriority_closest(context->nice);
4581 if (r < 0)
4582 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4583 }
613b411c 4584
d35fbf6b
DM
4585 if (context->cpu_sched_set) {
4586 struct sched_param param = {
4587 .sched_priority = context->cpu_sched_priority,
4588 };
4589
ff0af2a1
LP
4590 r = sched_setscheduler(0,
4591 context->cpu_sched_policy |
4592 (context->cpu_sched_reset_on_fork ?
4593 SCHED_RESET_ON_FORK : 0),
4594 &param);
4595 if (r < 0) {
4596 *exit_status = EXIT_SETSCHEDULER;
12145637 4597 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4598 }
d35fbf6b 4599 }
fc9b2a84 4600
e2b2fb7f
MS
4601 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4602 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4603 const CPUSet *cpu_set;
4604
4605 if (context->cpu_affinity_from_numa) {
4606 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4607 if (r < 0) {
4608 *exit_status = EXIT_CPUAFFINITY;
4609 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4610 }
4611
4612 cpu_set = &converted_cpu_set;
4613 } else
4614 cpu_set = &context->cpu_set;
4615
4616 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4617 *exit_status = EXIT_CPUAFFINITY;
12145637 4618 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4619 }
e2b2fb7f 4620 }
034c6ed7 4621
b070c7c0
MS
4622 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4623 r = apply_numa_policy(&context->numa_policy);
4624 if (r == -EOPNOTSUPP)
33fe9e3f 4625 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4626 else if (r < 0) {
4627 *exit_status = EXIT_NUMA_POLICY;
4628 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4629 }
4630 }
4631
d35fbf6b
DM
4632 if (context->ioprio_set)
4633 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4634 *exit_status = EXIT_IOPRIO;
12145637 4635 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4636 }
da726a4d 4637
d35fbf6b
DM
4638 if (context->timer_slack_nsec != NSEC_INFINITY)
4639 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4640 *exit_status = EXIT_TIMERSLACK;
12145637 4641 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4642 }
9eba9da4 4643
21022b9d
LP
4644 if (context->personality != PERSONALITY_INVALID) {
4645 r = safe_personality(context->personality);
4646 if (r < 0) {
ff0af2a1 4647 *exit_status = EXIT_PERSONALITY;
12145637 4648 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4649 }
21022b9d 4650 }
94f04347 4651
33331d11
VB
4652 if (context->utmp_id) {
4653 const char *line = context->tty_path ?
4654 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4655 NULL;
df0ff127 4656 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4657 line,
023a4f67
LP
4658 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4659 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4660 USER_PROCESS,
6a93917d 4661 username);
33331d11 4662 }
d35fbf6b 4663
08f67696 4664 if (uid_is_valid(uid)) {
ff0af2a1
LP
4665 r = chown_terminal(STDIN_FILENO, uid);
4666 if (r < 0) {
4667 *exit_status = EXIT_STDIN;
12145637 4668 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4669 }
d35fbf6b 4670 }
8e274523 4671
4e1dfa45 4672 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4673 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4674 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4675 * touch a single hierarchy too. */
584b8688 4676 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4677 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4678 if (r < 0) {
4679 *exit_status = EXIT_CGROUP;
12145637 4680 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4681 }
d35fbf6b 4682 }
034c6ed7 4683
211a3d87
LB
4684 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4685
5b10116e 4686 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4687 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4688 if (r < 0)
4689 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4690 }
94f04347 4691
bb0c0d6f
LP
4692 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4693 r = setup_credentials(context, params, unit->id, uid);
4694 if (r < 0) {
4695 *exit_status = EXIT_CREDENTIALS;
4696 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4697 }
4698 }
4699
7bce046b 4700 r = build_environment(
fd63e712 4701 unit,
7bce046b
LP
4702 context,
4703 params,
4704 n_fds,
cd48e23f 4705 fdnames,
7bce046b
LP
4706 home,
4707 username,
4708 shell,
4709 journal_stream_dev,
4710 journal_stream_ino,
4711 &our_env);
2065ca69
JW
4712 if (r < 0) {
4713 *exit_status = EXIT_MEMORY;
12145637 4714 return log_oom();
2065ca69
JW
4715 }
4716
4717 r = build_pass_environment(context, &pass_env);
4718 if (r < 0) {
4719 *exit_status = EXIT_MEMORY;
12145637 4720 return log_oom();
2065ca69
JW
4721 }
4722
adf769b0
ZJS
4723 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4724 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4725 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4726 if (!strv_isempty(context->exec_search_path)) {
4727 _cleanup_free_ char *joined = NULL;
4728
4729 joined = strv_join(context->exec_search_path, ":");
4730 if (!joined) {
4731 *exit_status = EXIT_MEMORY;
4732 return log_oom();
4733 }
4734
4735 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4736 if (r < 0) {
4737 *exit_status = EXIT_MEMORY;
4738 return log_oom();
4739 }
4740 }
4741
4ab3d29f 4742 accum_env = strv_env_merge(params->environment,
2065ca69 4743 our_env,
8c35c10d 4744 joined_exec_search_path,
2065ca69
JW
4745 pass_env,
4746 context->environment,
44e5d006 4747 files_env);
2065ca69
JW
4748 if (!accum_env) {
4749 *exit_status = EXIT_MEMORY;
12145637 4750 return log_oom();
2065ca69 4751 }
1280503b 4752 accum_env = strv_env_clean(accum_env);
2065ca69 4753
096424d1 4754 (void) umask(context->umask);
b213e1c1 4755
b1edf445 4756 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4757 if (r < 0) {
4758 *exit_status = EXIT_KEYRING;
12145637 4759 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4760 }
4761
adf769b0
ZJS
4762 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4763 * from it. */
1703fa41 4764 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4765
adf769b0
ZJS
4766 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4767 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4768 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4769
adf769b0
ZJS
4770 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4771 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4772 * desired. */
165a31c0
LP
4773 if (needs_ambient_hack)
4774 needs_setuid = false;
4775 else
4776 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4777
4778 if (needs_sandboxing) {
adf769b0
ZJS
4779 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4780 * /sys being present. The actual MAC context application will happen later, as late as
4781 * possible, to avoid impacting our own code paths. */
7f18ef0a 4782
349cc4a5 4783#if HAVE_SELINUX
43b1f709 4784 use_selinux = mac_selinux_use();
7f18ef0a 4785#endif
f9fa32f0 4786#if ENABLE_SMACK
43b1f709 4787 use_smack = mac_smack_use();
7f18ef0a 4788#endif
349cc4a5 4789#if HAVE_APPARMOR
43b1f709 4790 use_apparmor = mac_apparmor_use();
7f18ef0a 4791#endif
165a31c0 4792 }
7f18ef0a 4793
ce932d2d
LP
4794 if (needs_sandboxing) {
4795 int which_failed;
4796
4797 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4798 * is set here. (See below.) */
4799
4800 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4801 if (r < 0) {
4802 *exit_status = EXIT_LIMITS;
4803 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4804 }
4805 }
4806
0af07108 4807 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4808 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4809 * wins here. (See above.) */
4810
1da37e58 4811 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4812 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4813 if (r < 0) {
4814 *exit_status = EXIT_PAM;
4815 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4816 }
ac45f971 4817
0af07108
ZJS
4818 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4819 if (ngids_after_pam < 0) {
4820 *exit_status = EXIT_MEMORY;
4821 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4822 }
b213e1c1 4823 }
5749f855 4824
26c45a6c 4825 if (needs_sandboxing && context->private_users && have_effective_cap(CAP_SYS_ADMIN) <= 0) {
5749f855
AZ
4826 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4827 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4828 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4829
4830 userns_set_up = true;
4831 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4832 if (r < 0) {
4833 *exit_status = EXIT_USER;
4834 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4835 }
4836 }
4837
fbbb9697 4838 if (exec_needs_network_namespace(context) && runtime && runtime->netns_storage_socket[0] >= 0) {
a8d08f39 4839
6e2d7c4f 4840 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4841 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4842 if (r == -EPERM)
4843 log_unit_warning_errno(unit, r,
4844 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4845 else if (r < 0) {
6e2d7c4f
MS
4846 *exit_status = EXIT_NETWORK;
4847 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4848 }
a8d08f39
LP
4849 } else if (context->network_namespace_path) {
4850 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4851 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4852 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4853 } else
4854 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4855 }
169c1bda 4856
fde36d25 4857 if (exec_needs_ipc_namespace(context) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
a70581ff
XR
4858
4859 if (ns_type_supported(NAMESPACE_IPC)) {
4860 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4861 if (r == -EPERM)
4862 log_unit_warning_errno(unit, r,
4863 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4864 else if (r < 0) {
4865 *exit_status = EXIT_NAMESPACE;
4866 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4867 }
4868 } else if (context->ipc_namespace_path) {
4869 *exit_status = EXIT_NAMESPACE;
4870 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4871 "IPCNamespacePath= is not supported, refusing.");
4872 } else
4873 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4874 }
4875
ee818b89 4876 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4877 _cleanup_free_ char *error_path = NULL;
4878
9f71ba8d 4879 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4880 if (r < 0) {
4881 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4882 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4883 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4884 }
d35fbf6b 4885 }
81a2b7ce 4886
daf8f72b
LP
4887 if (needs_sandboxing) {
4888 r = apply_protect_hostname(unit, context, exit_status);
4889 if (r < 0)
4890 return r;
aecd5ac6
TM
4891 }
4892
5749f855
AZ
4893 /* Drop groups as early as possible.
4894 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4895 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4896 if (needs_setuid) {
afb11bf1
DG
4897 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4898 int ngids_to_enforce = 0;
4899
4900 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4901 ngids,
4902 gids_after_pam,
4903 ngids_after_pam,
4904 &gids_to_enforce);
4905 if (ngids_to_enforce < 0) {
4906 *exit_status = EXIT_MEMORY;
4907 return log_unit_error_errno(unit,
4908 ngids_to_enforce,
4909 "Failed to merge group lists. Group membership might be incorrect: %m");
4910 }
4911
4912 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4913 if (r < 0) {
4914 *exit_status = EXIT_GROUP;
12145637 4915 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4916 }
165a31c0 4917 }
096424d1 4918
5749f855
AZ
4919 /* If the user namespace was not set up above, try to do it now.
4920 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
d09df6b9 4921 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5749f855
AZ
4922 * case of mount namespaces being less privileged when the mount point list is copied from a
4923 * different user namespace). */
9008e1ac 4924
5749f855
AZ
4925 if (needs_sandboxing && context->private_users && !userns_set_up) {
4926 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4927 if (r < 0) {
4928 *exit_status = EXIT_USER;
4929 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4930 }
4931 }
4932
9f71ba8d
ZJS
4933 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4934 * shall execute. */
4935
4936 _cleanup_free_ char *executable = NULL;
254d1313 4937 _cleanup_close_ int executable_fd = -EBADF;
8c35c10d 4938 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4939 if (r < 0) {
4940 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4941 log_unit_struct_errno(unit, LOG_INFO, r,
4942 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4943 LOG_UNIT_INVOCATION_ID(unit),
4944 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4945 command->path),
4946 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4947 return 0;
4948 }
4949
4950 *exit_status = EXIT_EXEC;
c2503e35
RH
4951
4952 return log_unit_struct_errno(unit, LOG_INFO, r,
4953 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4954 LOG_UNIT_INVOCATION_ID(unit),
4955 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4956 command->path),
4957 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4958 }
4959
b83d5050
ZJS
4960 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4961 if (r < 0) {
4962 *exit_status = EXIT_FDS;
4963 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4964 }
4965
9f71ba8d 4966#if HAVE_SELINUX
49590d67 4967 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
254d1313 4968 int fd = -EBADF;
49590d67
MS
4969
4970 if (socket_fd >= 0)
4971 fd = socket_fd;
4972 else if (params->n_socket_fds == 1)
4973 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4974 * use context from that fd to compute the label. */
4975 fd = params->fds[0];
4976
4977 if (fd >= 0) {
4978 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4979 if (r < 0) {
4980 if (!context->selinux_context_ignore) {
4981 *exit_status = EXIT_SELINUX_CONTEXT;
4982 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4983 }
4984 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4985 }
9f71ba8d
ZJS
4986 }
4987 }
4988#endif
4989
165a31c0 4990 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4991 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4992 * however if we have it as we want to keep it open until the final execve(). */
4993
1da37e58 4994 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4995 if (r >= 0)
4996 r = shift_fds(fds, n_fds);
4997 if (r >= 0)
cd48e23f 4998 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
ff0af2a1
LP
4999 if (r < 0) {
5000 *exit_status = EXIT_FDS;
12145637 5001 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 5002 }
e66cf1a3 5003
5686391b
LP
5004 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
5005 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
5006 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
5007 * came this far. */
5008
165a31c0 5009 secure_bits = context->secure_bits;
e66cf1a3 5010
165a31c0
LP
5011 if (needs_sandboxing) {
5012 uint64_t bset;
e66cf1a3 5013
ce932d2d
LP
5014 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
5015 * requested. (Note this is placed after the general resource limit initialization, see
5016 * above, in order to take precedence.) */
f4170c67
LP
5017 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
5018 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
5019 *exit_status = EXIT_LIMITS;
12145637 5020 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
5021 }
5022 }
5023
37ac2744
JB
5024#if ENABLE_SMACK
5025 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
5026 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
5027 if (use_smack) {
aa5ae971 5028 r = setup_smack(unit->manager, context, executable_fd);
29ff6247 5029 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
5030 *exit_status = EXIT_SMACK_PROCESS_LABEL;
5031 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
5032 }
5033 }
5034#endif
5035
165a31c0
LP
5036 bset = context->capability_bounding_set;
5037 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
5038 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
5039 * instead of us doing that */
5040 if (needs_ambient_hack)
5041 bset |= (UINT64_C(1) << CAP_SETPCAP) |
5042 (UINT64_C(1) << CAP_SETUID) |
5043 (UINT64_C(1) << CAP_SETGID);
5044
5045 if (!cap_test_all(bset)) {
5046 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
5047 if (r < 0) {
5048 *exit_status = EXIT_CAPABILITIES;
12145637 5049 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 5050 }
4c2630eb 5051 }
3b8bddde 5052
16fcb191
TK
5053 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
5054 * keep-caps set.
a954b249
LP
5055 *
5056 * To be able to raise the ambient capabilities after setresuid() they have to be added to
5057 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
5058 * the ambient capabilities can be raised as they are present in the permitted and
5059 * inheritable set. However it is possible that someone wants to set ambient capabilities
5060 * without changing the user, so we also set the ambient capabilities here.
5061 *
5062 * The requested ambient capabilities are raised in the inheritable set if the second
5063 * argument is true. */
943800f4 5064 if (!needs_ambient_hack) {
755d4b67
IP
5065 r = capability_ambient_set_apply(context->capability_ambient_set, true);
5066 if (r < 0) {
5067 *exit_status = EXIT_CAPABILITIES;
12145637 5068 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 5069 }
755d4b67 5070 }
165a31c0 5071 }
755d4b67 5072
fa97f630
JB
5073 /* chroot to root directory first, before we lose the ability to chroot */
5074 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
5075 if (r < 0)
5076 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
5077
165a31c0 5078 if (needs_setuid) {
08f67696 5079 if (uid_is_valid(uid)) {
ff0af2a1
LP
5080 r = enforce_user(context, uid);
5081 if (r < 0) {
5082 *exit_status = EXIT_USER;
12145637 5083 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 5084 }
165a31c0
LP
5085
5086 if (!needs_ambient_hack &&
5087 context->capability_ambient_set != 0) {
755d4b67 5088
16fcb191 5089 /* Raise the ambient capabilities after user change. */
755d4b67
IP
5090 r = capability_ambient_set_apply(context->capability_ambient_set, false);
5091 if (r < 0) {
5092 *exit_status = EXIT_CAPABILITIES;
12145637 5093 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 5094 }
755d4b67 5095 }
5b6319dc 5096 }
165a31c0 5097 }
d35fbf6b 5098
56ef8db9
JB
5099 /* Apply working directory here, because the working directory might be on NFS and only the user running
5100 * this service might have the correct privilege to change to the working directory */
fa97f630 5101 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
5102 if (r < 0)
5103 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
5104
165a31c0 5105 if (needs_sandboxing) {
37ac2744 5106 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
5107 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
5108 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
5109 * are restricted. */
5110
349cc4a5 5111#if HAVE_SELINUX
43b1f709 5112 if (use_selinux) {
5cd9cd35
LP
5113 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
5114
5115 if (exec_context) {
5116 r = setexeccon(exec_context);
006d1864
TM
5117 if (r < 0) {
5118 if (!context->selinux_context_ignore) {
5119 *exit_status = EXIT_SELINUX_CONTEXT;
5120 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5121 }
5122 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
5123 }
5124 }
5125 }
5126#endif
5127
349cc4a5 5128#if HAVE_APPARMOR
43b1f709 5129 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
5130 r = aa_change_onexec(context->apparmor_profile);
5131 if (r < 0 && !context->apparmor_profile_ignore) {
5132 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 5133 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
5134 }
5135 }
5136#endif
5137
a954b249
LP
5138 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
5139 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
5140 * requires CAP_SETPCAP. */
dbdc4098 5141 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 5142 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098 5143 * effective set here.
a954b249
LP
5144 *
5145 * The effective set is overwritten during execve() with the following values:
5146 *
dbdc4098 5147 * - ambient set (for non-root processes)
a954b249 5148 *
dbdc4098
TK
5149 * - (inheritable | bounding) set (for root processes)
5150 *
5151 * Hence there is no security impact to raise it in the effective set before execve
5152 */
a954b249 5153 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
dbdc4098
TK
5154 if (r < 0) {
5155 *exit_status = EXIT_CAPABILITIES;
5156 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
5157 }
755d4b67 5158 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 5159 *exit_status = EXIT_SECUREBITS;
12145637 5160 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 5161 }
dbdc4098 5162 }
5b6319dc 5163
59eeb84b 5164 if (context_has_no_new_privileges(context))
d35fbf6b 5165 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 5166 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 5167 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
5168 }
5169
349cc4a5 5170#if HAVE_SECCOMP
469830d1
LP
5171 r = apply_address_families(unit, context);
5172 if (r < 0) {
5173 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 5174 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 5175 }
04aa0cb9 5176
469830d1
LP
5177 r = apply_memory_deny_write_execute(unit, context);
5178 if (r < 0) {
5179 *exit_status = EXIT_SECCOMP;
12145637 5180 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5181 }
f4170c67 5182
469830d1
LP
5183 r = apply_restrict_realtime(unit, context);
5184 if (r < 0) {
5185 *exit_status = EXIT_SECCOMP;
12145637 5186 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5187 }
5188
f69567cb
LP
5189 r = apply_restrict_suid_sgid(unit, context);
5190 if (r < 0) {
5191 *exit_status = EXIT_SECCOMP;
5192 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5193 }
5194
add00535
LP
5195 r = apply_restrict_namespaces(unit, context);
5196 if (r < 0) {
5197 *exit_status = EXIT_SECCOMP;
12145637 5198 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5199 }
5200
469830d1
LP
5201 r = apply_protect_sysctl(unit, context);
5202 if (r < 0) {
5203 *exit_status = EXIT_SECCOMP;
12145637 5204 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5205 }
5206
469830d1
LP
5207 r = apply_protect_kernel_modules(unit, context);
5208 if (r < 0) {
5209 *exit_status = EXIT_SECCOMP;
12145637 5210 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5211 }
5212
84703040
KK
5213 r = apply_protect_kernel_logs(unit, context);
5214 if (r < 0) {
5215 *exit_status = EXIT_SECCOMP;
5216 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5217 }
5218
fc64760d
KK
5219 r = apply_protect_clock(unit, context);
5220 if (r < 0) {
5221 *exit_status = EXIT_SECCOMP;
5222 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5223 }
5224
469830d1
LP
5225 r = apply_private_devices(unit, context);
5226 if (r < 0) {
5227 *exit_status = EXIT_SECCOMP;
12145637 5228 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5229 }
5230
5231 r = apply_syscall_archs(unit, context);
5232 if (r < 0) {
5233 *exit_status = EXIT_SECCOMP;
12145637 5234 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5235 }
5236
78e864e5
TM
5237 r = apply_lock_personality(unit, context);
5238 if (r < 0) {
5239 *exit_status = EXIT_SECCOMP;
12145637 5240 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5241 }
5242
9df2cdd8
TM
5243 r = apply_syscall_log(unit, context);
5244 if (r < 0) {
5245 *exit_status = EXIT_SECCOMP;
5246 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5247 }
5248
5cd9cd35
LP
5249 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5250 * by the filter as little as possible. */
165a31c0 5251 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5252 if (r < 0) {
5253 *exit_status = EXIT_SECCOMP;
12145637 5254 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5255 }
5256#endif
b1994387
ILG
5257
5258#if HAVE_LIBBPF
5259 r = apply_restrict_filesystems(unit, context);
5260 if (r < 0) {
5261 *exit_status = EXIT_BPF;
5262 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5263 }
5264#endif
5265
d35fbf6b 5266 }
034c6ed7 5267
00819cc1
LP
5268 if (!strv_isempty(context->unset_environment)) {
5269 char **ee = NULL;
5270
5271 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5272 if (!ee) {
5273 *exit_status = EXIT_MEMORY;
12145637 5274 return log_oom();
00819cc1
LP
5275 }
5276
130d3d22 5277 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5278 }
5279
7ca69792
AZ
5280 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5281 replaced_argv = replace_env_argv(command->argv, accum_env);
5282 if (!replaced_argv) {
5283 *exit_status = EXIT_MEMORY;
5284 return log_oom();
5285 }
5286 final_argv = replaced_argv;
5287 } else
5288 final_argv = command->argv;
034c6ed7 5289
f1d34068 5290 if (DEBUG_LOGGING) {
c2b2df60 5291 _cleanup_free_ char *line = NULL;
81a2b7ce 5292
4ef15008 5293 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
5294 if (!line) {
5295 *exit_status = EXIT_MEMORY;
5296 return log_oom();
5297 }
5298
5299 log_unit_struct(unit, LOG_DEBUG,
5300 "EXECUTABLE=%s", executable,
5301 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 5302 }
dd305ec9 5303
5686391b
LP
5304 if (exec_fd >= 0) {
5305 uint8_t hot = 1;
5306
5307 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5308 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5309
5310 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5311 *exit_status = EXIT_EXEC;
5312 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5313 }
5314 }
5315
a6d9111c 5316 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5317
5318 if (exec_fd >= 0) {
5319 uint8_t hot = 0;
5320
5321 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5322 * that POLLHUP on it no longer means execve() succeeded. */
5323
5324 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5325 *exit_status = EXIT_EXEC;
5326 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5327 }
5328 }
12145637 5329
ff0af2a1 5330 *exit_status = EXIT_EXEC;
9f71ba8d 5331 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5332}
81a2b7ce 5333
/* Forward declarations of two file-local (static) helpers that exec_spawn() calls before forking.
 * Judging by the error messages at the call sites, the first loads the unit's environment files into a
 * string list and the second resolves the three named stdio file descriptors — confirm against their
 * definitions. */
34cf6c43 5334static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5335static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5336
f2341e0a
LP
5337int exec_spawn(Unit *unit,
5338 ExecCommand *command,
d35fbf6b
DM
5339 const ExecContext *context,
5340 const ExecParameters *params,
5341 ExecRuntime *runtime,
29206d46 5342 DynamicCreds *dcreds,
d35fbf6b 5343 pid_t *ret) {
8351ceae 5344
ee39ca20 5345 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5346 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5347 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5348 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 5349 _cleanup_free_ char *line = NULL;
d35fbf6b 5350 pid_t pid;
8351ceae 5351
f2341e0a 5352 assert(unit);
d35fbf6b
DM
5353 assert(command);
5354 assert(context);
5355 assert(ret);
5356 assert(params);
25b583d7 5357 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5358
d35fbf6b
DM
5359 if (context->std_input == EXEC_INPUT_SOCKET ||
5360 context->std_output == EXEC_OUTPUT_SOCKET ||
5361 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5362
d85ff944
YW
5363 if (params->n_socket_fds > 1)
5364 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5365
d85ff944
YW
5366 if (params->n_socket_fds == 0)
5367 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5368
d35fbf6b
DM
5369 socket_fd = params->fds[0];
5370 } else {
254d1313 5371 socket_fd = -EBADF;
d35fbf6b 5372 fds = params->fds;
9b141911 5373 n_socket_fds = params->n_socket_fds;
25b583d7 5374 n_storage_fds = params->n_storage_fds;
d35fbf6b 5375 }
94f04347 5376
34cf6c43 5377 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5378 if (r < 0)
5379 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5380
f2341e0a 5381 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5382 if (r < 0)
f2341e0a 5383 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5384
4ef15008 5385 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
5386 if (!line)
5387 return log_oom();
fab56fc5 5388
9f71ba8d
ZJS
5389 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5390 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5391 mac_selinux_maybe_reload();
5392
c2503e35
RH
5393 log_unit_struct(unit, LOG_DEBUG,
5394 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5395 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5396 the mount namespace in the child, but we want to log
5397 from the parent, so we need to use the (possibly
5398 inaccurate) path here. */
5399 LOG_UNIT_INVOCATION_ID(unit));
12145637 5400
78f93209
LP
5401 if (params->cgroup_path) {
5402 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5403 if (r < 0)
5404 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5405 if (r > 0) { /* We are using a child cgroup */
5406 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5407 if (r < 0)
5408 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa 5409
523ea123 5410 /* Normally we would not propagate the xattrs to children but since we created this
4e806bfa
AZ
5411 * sub-cgroup internally we should do it. */
5412 cgroup_oomd_xattr_apply(unit, subcgroup_path);
523ea123 5413 cgroup_log_xattr_apply(unit, subcgroup_path);
78f93209
LP
5414 }
5415 }
5416
d35fbf6b
DM
5417 pid = fork();
5418 if (pid < 0)
74129a12 5419 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5420
5421 if (pid == 0) {
12145637 5422 int exit_status = EXIT_SUCCESS;
ff0af2a1 5423
f2341e0a
LP
5424 r = exec_child(unit,
5425 command,
ff0af2a1
LP
5426 context,
5427 params,
5428 runtime,
29206d46 5429 dcreds,
ff0af2a1 5430 socket_fd,
52c239d7 5431 named_iofds,
4c47affc 5432 fds,
9b141911 5433 n_socket_fds,
25b583d7 5434 n_storage_fds,
ff0af2a1 5435 files_env,
00d9ef85 5436 unit->manager->user_lookup_fds[1],
12145637
LP
5437 &exit_status);
5438
e1714f02
ZJS
5439 if (r < 0) {
5440 const char *status =
5441 exit_status_to_string(exit_status,
e04ed6db 5442 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5443
c2503e35
RH
5444 log_unit_struct_errno(unit, LOG_ERR, r,
5445 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5446 LOG_UNIT_INVOCATION_ID(unit),
5447 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5448 status, command->path),
5449 "EXECUTABLE=%s", command->path);
e1714f02 5450 }
4c2630eb 5451
ff0af2a1 5452 _exit(exit_status);
034c6ed7
LP
5453 }
5454
f2341e0a 5455 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5456
78f93209
LP
5457 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5458 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5459 * process will be killed too). */
5460 if (subcgroup_path)
5461 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5462
b58b4116 5463 exec_status_start(&command->exec_status, pid);
9fb86720 5464
034c6ed7 5465 *ret = pid;
5cb5a6ff
LP
5466 return 0;
5467}
5468
034c6ed7
LP
5469void exec_context_init(ExecContext *c) {
5470 assert(c);
5471
4c12626c 5472 c->umask = 0022;
0692548c 5473 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5474 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5475 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5476 c->syslog_level_prefix = true;
353e12c2 5477 c->ignore_sigpipe = true;
3a43da28 5478 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5479 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5480 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5481 c->directories[t].mode = 0755;
12213aed 5482 c->timeout_clean_usec = USEC_INFINITY;
3fd5190b 5483 c->capability_bounding_set = CAP_MASK_UNSET;
aa9d574d
YW
5484 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5485 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5486 c->log_level_max = -1;
005bfaf1
TM
5487#if HAVE_SECCOMP
5488 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5489#endif
51462135
DDM
5490 c->tty_rows = UINT_MAX;
5491 c->tty_cols = UINT_MAX;
b070c7c0 5492 numa_policy_reset(&c->numa_policy);
24002121 5493 c->private_mounts = -1;
034c6ed7
LP
5494}
5495
613b411c 5496void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5497 assert(c);
5498
6796073e
LP
5499 c->environment = strv_free(c->environment);
5500 c->environment_files = strv_free(c->environment_files);
b4c14404 5501 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5502 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5503
31ce987c 5504 rlimit_free_all(c->rlimit);
034c6ed7 5505
5b10116e 5506 for (size_t l = 0; l < 3; l++) {
52c239d7 5507 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5508 c->stdio_file[l] = mfree(c->stdio_file[l]);
5509 }
52c239d7 5510
a1e58e8e
LP
5511 c->working_directory = mfree(c->working_directory);
5512 c->root_directory = mfree(c->root_directory);
915e6d16 5513 c->root_image = mfree(c->root_image);
18d73705 5514 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5515 c->root_hash = mfree(c->root_hash);
5516 c->root_hash_size = 0;
5517 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5518 c->root_hash_sig = mfree(c->root_hash_sig);
5519 c->root_hash_sig_size = 0;
5520 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5521 c->root_verity = mfree(c->root_verity);
93f59701 5522 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5523 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5524 c->tty_path = mfree(c->tty_path);
5525 c->syslog_identifier = mfree(c->syslog_identifier);
5526 c->user = mfree(c->user);
5527 c->group = mfree(c->group);
034c6ed7 5528
6796073e 5529 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5530
a1e58e8e 5531 c->pam_name = mfree(c->pam_name);
5b6319dc 5532
2a624c36
AP
5533 c->read_only_paths = strv_free(c->read_only_paths);
5534 c->read_write_paths = strv_free(c->read_write_paths);
5535 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5536 c->exec_paths = strv_free(c->exec_paths);
5537 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5538 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5539
d2d6c096 5540 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5541 c->bind_mounts = NULL;
5542 c->n_bind_mounts = 0;
2abd4e38
YW
5543 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5544 c->temporary_filesystems = NULL;
5545 c->n_temporary_filesystems = 0;
b3d13314 5546 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5547
0985c7c4 5548 cpu_set_reset(&c->cpu_set);
b070c7c0 5549 numa_policy_reset(&c->numa_policy);
86a3475b 5550
a1e58e8e
LP
5551 c->utmp_id = mfree(c->utmp_id);
5552 c->selinux_context = mfree(c->selinux_context);
5553 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5554 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5555
b1994387
ILG
5556 c->restrict_filesystems = set_free(c->restrict_filesystems);
5557
8cfa775f 5558 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5559 c->syscall_archs = set_free(c->syscall_archs);
5560 c->address_families = set_free(c->address_families);
e66cf1a3 5561
5b10116e 5562 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5563 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5564
5565 c->log_level_max = -1;
5566
5567 exec_context_free_log_extra_fields(c);
523ea123
QD
5568 c->log_filter_allowed_patterns = set_free(c->log_filter_allowed_patterns);
5569 c->log_filter_denied_patterns = set_free(c->log_filter_denied_patterns);
08f3be7a 5570
5ac1530e
ZJS
5571 c->log_ratelimit_interval_usec = 0;
5572 c->log_ratelimit_burst = 0;
90fc172e 5573
08f3be7a
LP
5574 c->stdin_data = mfree(c->stdin_data);
5575 c->stdin_data_size = 0;
a8d08f39
LP
5576
5577 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5578 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5579
5580 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5581
43144be4 5582 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5583 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5584}
5585
34cf6c43 5586int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5587 assert(c);
5588
5589 if (!runtime_prefix)
5590 return 0;
5591
211a3d87 5592 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5593 _cleanup_free_ char *p = NULL;
e66cf1a3 5594
494d0247 5595 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5596 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5597 else
211a3d87 5598 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5599 if (!p)
5600 return -ENOMEM;
5601
7bc4bf4a
LP
5602 /* We execute this synchronously, since we need to be sure this is gone when we start the
5603 * service next. */
c6878637 5604 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5605
211a3d87
LB
5606 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5607 _cleanup_free_ char *symlink_abs = NULL;
5608
5609 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5610 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5611 else
5612 symlink_abs = path_join(runtime_prefix, *symlink);
5613 if (!symlink_abs)
5614 return -ENOMEM;
5615
5616 (void) unlink(symlink_abs);
5617 }
e66cf1a3
LP
5618 }
5619
5620 return 0;
5cb5a6ff
LP
5621}
5622
bb0c0d6f
LP
5623int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5624 _cleanup_free_ char *p = NULL;
5625
5626 assert(c);
5627
5628 if (!runtime_prefix || !unit)
5629 return 0;
5630
5631 p = path_join(runtime_prefix, "credentials", unit);
5632 if (!p)
5633 return -ENOMEM;
5634
5635 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5636 * unmount it, and afterwards remove the mount point */
5637 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5638 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5639
5640 return 0;
5641}
5642
b9f976fb
MK
5643int exec_context_destroy_mount_ns_dir(Unit *u) {
5644 _cleanup_free_ char *p = NULL;
5645
5646 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5647 return 0;
5648
5649 p = path_join("/run/systemd/propagate/", u->id);
5650 if (!p)
5651 return -ENOMEM;
5652
5653 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5654 if (rmdir(p) < 0 && errno != ENOENT)
5655 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5656
5657 return 0;
5658}
5659
34cf6c43 5660static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5661 assert(c);
5662
a1e58e8e 5663 c->path = mfree(c->path);
6796073e 5664 c->argv = strv_free(c->argv);
43d0fcbd
LP
5665}
5666
da6053d0 5667void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5668 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5669 exec_command_done(c+i);
5670}
5671
f1acf85a 5672ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5673 ExecCommand *i;
5674
5675 while ((i = c)) {
71fda00f 5676 LIST_REMOVE(command, c, i);
43d0fcbd 5677 exec_command_done(i);
5cb5a6ff
LP
5678 free(i);
5679 }
f1acf85a
ZJS
5680
5681 return NULL;
5cb5a6ff
LP
5682}
5683
da6053d0 5684void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5685 for (size_t i = 0; i < n; i++)
f1acf85a 5686 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5687}
5688
6a1d4d9f 5689void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5690 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5691 exec_status_reset(&c[i].exec_status);
5692}
5693
5694void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5695 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5696 LIST_FOREACH(command, z, c[i])
5697 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5698}
5699
039f0e70 5700typedef struct InvalidEnvInfo {
34cf6c43 5701 const Unit *unit;
039f0e70
LP
5702 const char *path;
5703} InvalidEnvInfo;
5704
5705static void invalid_env(const char *p, void *userdata) {
5706 InvalidEnvInfo *info = userdata;
5707
f2341e0a 5708 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5709}
5710
52c239d7
LB
5711const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5712 assert(c);
5713
5714 switch (fd_index) {
5073ff6b 5715
52c239d7
LB
5716 case STDIN_FILENO:
5717 if (c->std_input != EXEC_INPUT_NAMED_FD)
5718 return NULL;
5073ff6b 5719
52c239d7 5720 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5721
52c239d7
LB
5722 case STDOUT_FILENO:
5723 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5724 return NULL;
5073ff6b 5725
52c239d7 5726 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5727
52c239d7
LB
5728 case STDERR_FILENO:
5729 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5730 return NULL;
5073ff6b 5731
52c239d7 5732 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5733
52c239d7
LB
5734 default:
5735 return NULL;
5736 }
5737}
5738
2caa38e9
LP
5739static int exec_context_named_iofds(
5740 const ExecContext *c,
5741 const ExecParameters *p,
5742 int named_iofds[static 3]) {
5743
5b10116e 5744 size_t targets;
56fbd561 5745 const char* stdio_fdname[3];
da6053d0 5746 size_t n_fds;
52c239d7
LB
5747
5748 assert(c);
5749 assert(p);
2caa38e9 5750 assert(named_iofds);
52c239d7
LB
5751
5752 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5753 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5754 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5755
5b10116e 5756 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5757 stdio_fdname[i] = exec_context_fdname(c, i);
5758
4c47affc
FB
5759 n_fds = p->n_storage_fds + p->n_socket_fds;
5760
5b10116e 5761 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5762 if (named_iofds[STDIN_FILENO] < 0 &&
5763 c->std_input == EXEC_INPUT_NAMED_FD &&
5764 stdio_fdname[STDIN_FILENO] &&
5765 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5766
52c239d7
LB
5767 named_iofds[STDIN_FILENO] = p->fds[i];
5768 targets--;
56fbd561
ZJS
5769
5770 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5771 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5772 stdio_fdname[STDOUT_FILENO] &&
5773 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5774
52c239d7
LB
5775 named_iofds[STDOUT_FILENO] = p->fds[i];
5776 targets--;
56fbd561
ZJS
5777
5778 } else if (named_iofds[STDERR_FILENO] < 0 &&
5779 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5780 stdio_fdname[STDERR_FILENO] &&
5781 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5782
52c239d7
LB
5783 named_iofds[STDERR_FILENO] = p->fds[i];
5784 targets--;
5785 }
5786
56fbd561 5787 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5788}
5789
398a5009
ZJS
5790static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5791 _cleanup_strv_free_ char **v = NULL;
398a5009 5792 int r;
8c7be95e
LP
5793
5794 assert(c);
398a5009 5795 assert(ret);
8c7be95e
LP
5796
5797 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5798 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5799 bool ignore = false;
5800 char *fn = *i;
8c7be95e
LP
5801
5802 if (fn[0] == '-') {
5803 ignore = true;
313cefa1 5804 fn++;
8c7be95e
LP
5805 }
5806
5807 if (!path_is_absolute(fn)) {
8c7be95e
LP
5808 if (ignore)
5809 continue;
8c7be95e
LP
5810 return -EINVAL;
5811 }
5812
2bef10ab 5813 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5814 r = safe_glob(fn, 0, &pglob);
5815 if (r < 0) {
2bef10ab
PL
5816 if (ignore)
5817 continue;
398a5009 5818 return r;
2bef10ab 5819 }
8c7be95e 5820
d8c92e8b
ZJS
5821 /* When we don't match anything, -ENOENT should be returned */
5822 assert(pglob.gl_pathc > 0);
5823
5b10116e 5824 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5825 _cleanup_strv_free_ char **p = NULL;
5826
5827 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5828 if (r < 0) {
2bef10ab
PL
5829 if (ignore)
5830 continue;
398a5009 5831 return r;
e9c1ea9d 5832 }
398a5009 5833
ebc05a09 5834 /* Log invalid environment variables with filename */
039f0e70
LP
5835 if (p) {
5836 InvalidEnvInfo info = {
f2341e0a 5837 .unit = unit,
039f0e70
LP
5838 .path = pglob.gl_pathv[n]
5839 };
5840
5841 p = strv_env_clean_with_callback(p, invalid_env, &info);
5842 }
8c7be95e 5843
398a5009
ZJS
5844 if (!v)
5845 v = TAKE_PTR(p);
2bef10ab 5846 else {
398a5009 5847 char **m = strv_env_merge(v, p);
c84a9488 5848 if (!m)
2bef10ab 5849 return -ENOMEM;
2bef10ab 5850
398a5009 5851 strv_free_and_replace(v, m);
2bef10ab 5852 }
8c7be95e
LP
5853 }
5854 }
5855
398a5009 5856 *ret = TAKE_PTR(v);
8c7be95e
LP
5857
5858 return 0;
5859}
5860
6ac8fdc9 5861static bool tty_may_match_dev_console(const char *tty) {
7b912648 5862 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5863
1e22b5cd
LP
5864 if (!tty)
5865 return true;
5866
a119ec7c 5867 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5868
5869 /* trivial identity? */
5870 if (streq(tty, "console"))
5871 return true;
5872
7b912648
LP
5873 if (resolve_dev_console(&resolved) < 0)
5874 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5875
5876 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5877 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5878}
5879
6c0ae739
LP
5880static bool exec_context_may_touch_tty(const ExecContext *ec) {
5881 assert(ec);
1e22b5cd 5882
6c0ae739 5883 return ec->tty_reset ||
1e22b5cd
LP
5884 ec->tty_vhangup ||
5885 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5886 is_terminal_input(ec->std_input) ||
5887 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5888 is_terminal_output(ec->std_error);
5889}
5890
5891bool exec_context_may_touch_console(const ExecContext *ec) {
5892
5893 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5894 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5895}
5896
15ae422b 5897static void strv_fprintf(FILE *f, char **l) {
15ae422b
LP
5898 assert(f);
5899
5900 STRV_FOREACH(g, l)
5901 fprintf(f, " %s", *g);
5902}
5903
ddc155b2
TM
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Writes nothing at all if
 * 'strv' is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        /* Nothing to show for an empty list */
        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5915
34cf6c43 5916void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5917 int r;
9eba9da4 5918
5cb5a6ff
LP
5919 assert(c);
5920 assert(f);
5921
4ad49000 5922 prefix = strempty(prefix);
5cb5a6ff
LP
5923
5924 fprintf(f,
94f04347
LP
5925 "%sUMask: %04o\n"
5926 "%sWorkingDirectory: %s\n"
451a074f 5927 "%sRootDirectory: %s\n"
15ae422b 5928 "%sNonBlocking: %s\n"
64747e2d 5929 "%sPrivateTmp: %s\n"
7f112f50 5930 "%sPrivateDevices: %s\n"
59eeb84b 5931 "%sProtectKernelTunables: %s\n"
e66a2f65 5932 "%sProtectKernelModules: %s\n"
84703040 5933 "%sProtectKernelLogs: %s\n"
fc64760d 5934 "%sProtectClock: %s\n"
59eeb84b 5935 "%sProtectControlGroups: %s\n"
d251207d
LP
5936 "%sPrivateNetwork: %s\n"
5937 "%sPrivateUsers: %s\n"
1b8689f9
LP
5938 "%sProtectHome: %s\n"
5939 "%sProtectSystem: %s\n"
5d997827 5940 "%sMountAPIVFS: %s\n"
f3e43635 5941 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5942 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5943 "%sRestrictRealtime: %s\n"
f69567cb 5944 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5945 "%sKeyringMode: %s\n"
4e399953
LP
5946 "%sProtectHostname: %s\n"
5947 "%sProtectProc: %s\n"
5948 "%sProcSubset: %s\n",
5cb5a6ff 5949 prefix, c->umask,
14eb3285
LP
5950 prefix, empty_to_root(c->working_directory),
5951 prefix, empty_to_root(c->root_directory),
15ae422b 5952 prefix, yes_no(c->non_blocking),
64747e2d 5953 prefix, yes_no(c->private_tmp),
7f112f50 5954 prefix, yes_no(c->private_devices),
59eeb84b 5955 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5956 prefix, yes_no(c->protect_kernel_modules),
84703040 5957 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5958 prefix, yes_no(c->protect_clock),
59eeb84b 5959 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5960 prefix, yes_no(c->private_network),
5961 prefix, yes_no(c->private_users),
1b8689f9
LP
5962 prefix, protect_home_to_string(c->protect_home),
5963 prefix, protect_system_to_string(c->protect_system),
5e98086d 5964 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5965 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5966 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5967 prefix, yes_no(c->restrict_realtime),
f69567cb 5968 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5969 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5970 prefix, yes_no(c->protect_hostname),
5971 prefix, protect_proc_to_string(c->protect_proc),
5972 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5973
915e6d16
LP
5974 if (c->root_image)
5975 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5976
18d73705 5977 if (c->root_image_options) {
18d73705
LB
5978 fprintf(f, "%sRootImageOptions:", prefix);
5979 LIST_FOREACH(mount_options, o, c->root_image_options)
5980 if (!isempty(o->options))
9ece6444
LB
5981 fprintf(f, " %s:%s",
5982 partition_designator_to_string(o->partition_designator),
5983 o->options);
18d73705
LB
5984 fprintf(f, "\n");
5985 }
5986
0389f4fa
LB
5987 if (c->root_hash) {
5988 _cleanup_free_ char *encoded = NULL;
5989 encoded = hexmem(c->root_hash, c->root_hash_size);
5990 if (encoded)
5991 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5992 }
5993
5994 if (c->root_hash_path)
5995 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5996
d4d55b0d
LB
5997 if (c->root_hash_sig) {
5998 _cleanup_free_ char *encoded = NULL;
5999 ssize_t len;
6000 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
6001 if (len)
6002 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
6003 }
6004
6005 if (c->root_hash_sig_path)
6006 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
6007
0389f4fa
LB
6008 if (c->root_verity)
6009 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
6010
8c7be95e
LP
6011 STRV_FOREACH(e, c->environment)
6012 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
6013
6014 STRV_FOREACH(e, c->environment_files)
6015 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 6016
b4c14404
FB
6017 STRV_FOREACH(e, c->pass_environment)
6018 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
6019
00819cc1
LP
6020 STRV_FOREACH(e, c->unset_environment)
6021 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
6022
53f47dfc
YW
6023 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
6024
5b10116e 6025 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
6026 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
6027
211a3d87
LB
6028 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
6029 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
6030
6031 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
6032 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
6033 }
3536f49e 6034 }
c2bbd90b 6035
5291f26d 6036 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 6037
fb33a393 6038 if (c->nice_set)
5291f26d 6039 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 6040
dd6c17b1 6041 if (c->oom_score_adjust_set)
5291f26d 6042 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 6043
ad21e542 6044 if (c->coredump_filter_set)
5291f26d 6045 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 6046
5b10116e 6047 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 6048 if (c->rlimit[i]) {
4c3a2b84 6049 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 6050 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 6051 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
6052 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
6053 }
94f04347 6054
f8b69d1d 6055 if (c->ioprio_set) {
1756a011 6056 _cleanup_free_ char *class_str = NULL;
f8b69d1d 6057
5bead76e 6058 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
6059 if (r >= 0)
6060 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
6061
5bead76e 6062 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 6063 }
94f04347 6064
f8b69d1d 6065 if (c->cpu_sched_set) {
1756a011 6066 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 6067
837df140
YW
6068 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
6069 if (r >= 0)
6070 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
6071
94f04347 6072 fprintf(f,
38b48754
LP
6073 "%sCPUSchedulingPriority: %i\n"
6074 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
6075 prefix, c->cpu_sched_priority,
6076 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 6077 }
94f04347 6078
0985c7c4 6079 if (c->cpu_set.set) {
e7fca352
MS
6080 _cleanup_free_ char *affinity = NULL;
6081
6082 affinity = cpu_set_to_range_string(&c->cpu_set);
6083 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
6084 }
6085
b070c7c0
MS
6086 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
6087 _cleanup_free_ char *nodes = NULL;
6088
6089 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
6090 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
6091 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
6092 }
6093
3a43da28 6094 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 6095 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
6096
6097 fprintf(f,
80876c20
LP
6098 "%sStandardInput: %s\n"
6099 "%sStandardOutput: %s\n"
6100 "%sStandardError: %s\n",
6101 prefix, exec_input_to_string(c->std_input),
6102 prefix, exec_output_to_string(c->std_output),
6103 prefix, exec_output_to_string(c->std_error));
6104
befc4a80
LP
6105 if (c->std_input == EXEC_INPUT_NAMED_FD)
6106 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
6107 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
6108 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
6109 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
6110 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
6111
6112 if (c->std_input == EXEC_INPUT_FILE)
6113 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
6114 if (c->std_output == EXEC_OUTPUT_FILE)
6115 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
6116 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
6117 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
6118 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
6119 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
6120 if (c->std_error == EXEC_OUTPUT_FILE)
6121 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
6122 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
6123 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
6124 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
6125 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 6126
80876c20
LP
6127 if (c->tty_path)
6128 fprintf(f,
6ea832a2
LP
6129 "%sTTYPath: %s\n"
6130 "%sTTYReset: %s\n"
6131 "%sTTYVHangup: %s\n"
51462135
DDM
6132 "%sTTYVTDisallocate: %s\n"
6133 "%sTTYRows: %u\n"
6134 "%sTTYColumns: %u\n",
6ea832a2
LP
6135 prefix, c->tty_path,
6136 prefix, yes_no(c->tty_reset),
6137 prefix, yes_no(c->tty_vhangup),
51462135
DDM
6138 prefix, yes_no(c->tty_vt_disallocate),
6139 prefix, c->tty_rows,
6140 prefix, c->tty_cols);
94f04347 6141
9f6444eb 6142 if (IN_SET(c->std_output,
9f6444eb
LP
6143 EXEC_OUTPUT_KMSG,
6144 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
6145 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6146 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
6147 IN_SET(c->std_error,
9f6444eb
LP
6148 EXEC_OUTPUT_KMSG,
6149 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
6150 EXEC_OUTPUT_KMSG_AND_CONSOLE,
6151 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 6152
5ce70e5b 6153 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 6154
837df140
YW
6155 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
6156 if (r >= 0)
6157 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 6158
837df140
YW
6159 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
6160 if (r >= 0)
6161 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 6162 }
94f04347 6163
d3070fbd
LP
6164 if (c->log_level_max >= 0) {
6165 _cleanup_free_ char *t = NULL;
6166
6167 (void) log_level_to_string_alloc(c->log_level_max, &t);
6168
6169 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
6170 }
6171
5291f26d 6172 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
6173 fprintf(f,
6174 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 6175 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 6176
5ac1530e
ZJS
6177 if (c->log_ratelimit_burst > 0)
6178 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 6179
523ea123
QD
6180 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
6181 fprintf(f, "%sLogFilterPatterns:", prefix);
6182
6183 char *pattern;
6184 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6185 fprintf(f, " %s", pattern);
6186 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6187 fprintf(f, " ~%s", pattern);
6188 fputc('\n', f);
6189 }
6190
5b10116e
ZJS
6191 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6192 fprintf(f, "%sLogExtraFields: ", prefix);
6193 fwrite(c->log_extra_fields[j].iov_base,
6194 1, c->log_extra_fields[j].iov_len,
6195 f);
6196 fputc('\n', f);
d3070fbd
LP
6197 }
6198
91dd5f7c
LP
6199 if (c->log_namespace)
6200 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6201
07d46372
YW
6202 if (c->secure_bits) {
6203 _cleanup_free_ char *str = NULL;
6204
6205 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6206 if (r >= 0)
6207 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6208 }
94f04347 6209
3fd5190b 6210 if (c->capability_bounding_set != CAP_MASK_UNSET) {
dd1f5bd0 6211 _cleanup_free_ char *str = NULL;
94f04347 6212
8142d735 6213 r = capability_set_to_string(c->capability_bounding_set, &str);
dd1f5bd0
YW
6214 if (r >= 0)
6215 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6216 }
6217
6218 if (c->capability_ambient_set != 0) {
dd1f5bd0 6219 _cleanup_free_ char *str = NULL;
755d4b67 6220
8142d735 6221 r = capability_set_to_string(c->capability_ambient_set, &str);
dd1f5bd0
YW
6222 if (r >= 0)
6223 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6224 }
6225
6226 if (c->user)
f2d3769a 6227 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6228 if (c->group)
f2d3769a 6229 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6230
29206d46
LP
6231 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6232
ddc155b2 6233 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6234
5b6319dc 6235 if (c->pam_name)
f2d3769a 6236 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6237
ddc155b2
TM
6238 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6239 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6240 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6241 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6242 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6243 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6244
5b10116e
ZJS
6245 for (size_t i = 0; i < c->n_bind_mounts; i++)
6246 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6247 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6248 c->bind_mounts[i].ignore_enoent ? "-": "",
6249 c->bind_mounts[i].source,
6250 c->bind_mounts[i].destination,
6251 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6252
5b10116e
ZJS
6253 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6254 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6255
5b10116e
ZJS
6256 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6257 t->path,
6258 isempty(t->options) ? "" : ":",
6259 strempty(t->options));
6260 }
2abd4e38 6261
169c1bda
LP
6262 if (c->utmp_id)
6263 fprintf(f,
6264 "%sUtmpIdentifier: %s\n",
6265 prefix, c->utmp_id);
7b52a628
MS
6266
6267 if (c->selinux_context)
6268 fprintf(f,
5f8640fb
LP
6269 "%sSELinuxContext: %s%s\n",
6270 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6271
80c21aea
WC
6272 if (c->apparmor_profile)
6273 fprintf(f,
6274 "%sAppArmorProfile: %s%s\n",
6275 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6276
6277 if (c->smack_process_label)
6278 fprintf(f,
6279 "%sSmackProcessLabel: %s%s\n",
6280 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6281
050f7277 6282 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6283 fprintf(f,
6284 "%sPersonality: %s\n",
6285 prefix, strna(personality_to_string(c->personality)));
6286
78e864e5
TM
6287 fprintf(f,
6288 "%sLockPersonality: %s\n",
6289 prefix, yes_no(c->lock_personality));
6290
17df7223 6291 if (c->syscall_filter) {
17df7223 6292 fprintf(f,
57183d11 6293 "%sSystemCallFilter: ",
17df7223
LP
6294 prefix);
6295
6b000af4 6296 if (!c->syscall_allow_list)
17df7223
LP
6297 fputc('~', f);
6298
349cc4a5 6299#if HAVE_SECCOMP
d5a99b7c
JJ
6300 void *id, *val;
6301 bool first = true;
90e74a66 6302 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6303 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6304 const char *errno_name = NULL;
6305 int num = PTR_TO_INT(val);
17df7223
LP
6306
6307 if (first)
6308 first = false;
6309 else
6310 fputc(' ', f);
6311
57183d11 6312 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6313 fputs(strna(name), f);
8cfa775f
YW
6314
6315 if (num >= 0) {
005bfaf1 6316 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6317 if (errno_name)
6318 fprintf(f, ":%s", errno_name);
6319 else
6320 fprintf(f, ":%d", num);
6321 }
17df7223 6322 }
351a19b1 6323#endif
17df7223
LP
6324
6325 fputc('\n', f);
6326 }
6327
57183d11 6328 if (c->syscall_archs) {
57183d11
LP
6329 fprintf(f,
6330 "%sSystemCallArchitectures:",
6331 prefix);
6332
349cc4a5 6333#if HAVE_SECCOMP
d5a99b7c 6334 void *id;
90e74a66 6335 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6336 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6337#endif
6338 fputc('\n', f);
6339 }
6340
add00535
LP
6341 if (exec_context_restrict_namespaces_set(c)) {
6342 _cleanup_free_ char *s = NULL;
6343
86c2a9f1 6344 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6345 if (r >= 0)
6346 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6347 prefix, strna(s));
add00535
LP
6348 }
6349
b1994387 6350#if HAVE_LIBBPF
8fe84dc8
YW
6351 if (exec_context_restrict_filesystems_set(c)) {
6352 char *fs;
6353 SET_FOREACH(fs, c->restrict_filesystems)
6354 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6355 }
b1994387
ILG
6356#endif
6357
a8d08f39
LP
6358 if (c->network_namespace_path)
6359 fprintf(f,
6360 "%sNetworkNamespacePath: %s\n",
6361 prefix, c->network_namespace_path);
6362
3df90f24 6363 if (c->syscall_errno > 0) {
3df90f24
YW
6364 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6365
005bfaf1 6366#if HAVE_SECCOMP
d5a99b7c 6367 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6368 if (errno_name)
005bfaf1 6369 fputs(errno_name, f);
3df90f24 6370 else
005bfaf1
TM
6371 fprintf(f, "%d", c->syscall_errno);
6372#endif
6373 fputc('\n', f);
3df90f24 6374 }
b3d13314 6375
5b10116e 6376 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6377 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6378 c->mount_images[i].ignore_enoent ? "-": "",
6379 c->mount_images[i].source,
79e20ceb 6380 c->mount_images[i].destination);
427353f6 6381 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6382 fprintf(f, ":%s:%s",
427353f6 6383 partition_designator_to_string(o->partition_designator),
79e20ceb 6384 strempty(o->options));
427353f6
LB
6385 fprintf(f, "\n");
6386 }
93f59701
LB
6387
6388 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6389 fprintf(f, "%sExtensionImages: %s%s", prefix,
6390 c->extension_images[i].ignore_enoent ? "-": "",
6391 c->extension_images[i].source);
6392 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6393 fprintf(f, ":%s:%s",
6394 partition_designator_to_string(o->partition_designator),
6395 strempty(o->options));
6396 fprintf(f, "\n");
6397 }
a07b9926
LB
6398
6399 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6400}
6401
34cf6c43 6402bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6403 assert(c);
6404
61233823 6405 /* Returns true if the process forked off would run under
a931ad47
LP
6406 * an unchanged UID or as root. */
6407
6408 if (!c->user)
6409 return true;
6410
6411 if (streq(c->user, "root") || streq(c->user, "0"))
6412 return true;
6413
6414 return false;
6415}
6416
34cf6c43 6417int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6418 int p;
6419
6420 assert(c);
6421
6422 if (c->ioprio_set)
6423 return c->ioprio;
6424
6425 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6426 if (p < 0)
0692548c 6427 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6428
8b330d7d 6429 return ioprio_normalize(p);
7f452159
LP
6430}
6431
5e98086d
ZJS
6432bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6433 assert(c);
6434
61198784 6435 /* Explicit setting wins */
5e98086d
ZJS
6436 if (c->mount_apivfs_set)
6437 return c->mount_apivfs;
6438
61198784 6439 /* Default to "yes" if root directory or image are specified */
74e12520 6440 if (exec_context_with_rootfs(c))
61198784
ZJS
6441 return true;
6442
5e98086d
ZJS
6443 return false;
6444}
6445
d3070fbd 6446void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6447 assert(c);
6448
5b10116e 6449 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6450 free(c->log_extra_fields[l].iov_base);
6451 c->log_extra_fields = mfree(c->log_extra_fields);
6452 c->n_log_extra_fields = 0;
6453}
6454
6f765baf 6455void exec_context_revert_tty(ExecContext *c) {
254d1313 6456 _cleanup_close_ int fd = -EBADF;
0ba976e8
LP
6457 const char *path;
6458 struct stat st;
6f765baf
LP
6459 int r;
6460
6461 assert(c);
6462
6463 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6464 exec_context_tty_reset(c, NULL);
6465
6466 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6467 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6468 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6469 if (!exec_context_may_touch_tty(c))
6470 return;
6f765baf 6471
0ba976e8
LP
6472 path = exec_context_tty_path(c);
6473 if (!path)
6474 return;
6f765baf 6475
0ba976e8
LP
6476 fd = open(path, O_PATH|O_CLOEXEC);
6477 if (fd < 0)
6478 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6479 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6480 path);
6481
6482 if (fstat(fd, &st) < 0)
6483 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6484
6485 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6486 * if things are a character device, since a proper check either means we'd have to open the TTY and
6487 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6488 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6489 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6490 if (!S_ISCHR(st.st_mode))
6491 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6492
6493 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6494 if (r < 0)
6495 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6496}
6497
4c2f5842
LP
6498int exec_context_get_clean_directories(
6499 ExecContext *c,
6500 char **prefix,
6501 ExecCleanMask mask,
6502 char ***ret) {
6503
6504 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6505 int r;
6506
6507 assert(c);
6508 assert(prefix);
6509 assert(ret);
6510
5b10116e 6511 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6512 if (!FLAGS_SET(mask, 1U << t))
6513 continue;
6514
6515 if (!prefix[t])
6516 continue;
6517
211a3d87 6518 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6519 char *j;
6520
211a3d87 6521 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6522 if (!j)
6523 return -ENOMEM;
6524
6525 r = strv_consume(&l, j);
6526 if (r < 0)
6527 return r;
7f622a19
YW
6528
6529 /* Also remove private directories unconditionally. */
6530 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6531 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6532 if (!j)
6533 return -ENOMEM;
6534
6535 r = strv_consume(&l, j);
6536 if (r < 0)
6537 return r;
6538 }
6539
211a3d87
LB
6540 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6541 j = path_join(prefix[t], *symlink);
7f622a19
YW
6542 if (!j)
6543 return -ENOMEM;
6544
6545 r = strv_consume(&l, j);
6546 if (r < 0)
6547 return r;
6548 }
4c2f5842
LP
6549 }
6550 }
6551
6552 *ret = TAKE_PTR(l);
6553 return 0;
6554}
6555
6556int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6557 ExecCleanMask mask = 0;
6558
6559 assert(c);
6560 assert(ret);
6561
6562 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6563 if (c->directories[t].n_items > 0)
4c2f5842
LP
6564 mask |= 1U << t;
6565
6566 *ret = mask;
6567 return 0;
6568}
6569
b58b4116 6570void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6571 assert(s);
5cb5a6ff 6572
2ed26ed0
LP
6573 *s = (ExecStatus) {
6574 .pid = pid,
6575 };
6576
b58b4116
LP
6577 dual_timestamp_get(&s->start_timestamp);
6578}
6579
34cf6c43 6580void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6581 assert(s);
6582
d46b79bb 6583 if (s->pid != pid)
2ed26ed0
LP
6584 *s = (ExecStatus) {
6585 .pid = pid,
6586 };
b58b4116 6587
63983207 6588 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6589
034c6ed7
LP
6590 s->code = code;
6591 s->status = status;
169c1bda 6592
6f765baf
LP
6593 if (context && context->utmp_id)
6594 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6595}
6596
6a1d4d9f
LP
6597void exec_status_reset(ExecStatus *s) {
6598 assert(s);
6599
6600 *s = (ExecStatus) {};
6601}
6602
34cf6c43 6603void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6604 assert(s);
6605 assert(f);
6606
9fb86720
LP
6607 if (s->pid <= 0)
6608 return;
6609
4c940960
LP
6610 prefix = strempty(prefix);
6611
9fb86720 6612 fprintf(f,
ccd06097
ZJS
6613 "%sPID: "PID_FMT"\n",
6614 prefix, s->pid);
9fb86720 6615
af9d16e1 6616 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6617 fprintf(f,
6618 "%sStart Timestamp: %s\n",
04f5c018 6619 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6620
af9d16e1 6621 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6622 fprintf(f,
6623 "%sExit Timestamp: %s\n"
6624 "%sExit Code: %s\n"
6625 "%sExit Status: %i\n",
04f5c018 6626 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6627 prefix, sigchld_code_to_string(s->code),
6628 prefix, s->status);
5cb5a6ff 6629}
44d8db9e 6630
34cf6c43 6631static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6632 _cleanup_free_ char *cmd = NULL;
4c940960 6633 const char *prefix2;
44d8db9e
LP
6634
6635 assert(c);
6636 assert(f);
6637
4c940960 6638 prefix = strempty(prefix);
63c372cb 6639 prefix2 = strjoina(prefix, "\t");
44d8db9e 6640
4ef15008 6641 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
38553034 6642
44d8db9e
LP
6643 fprintf(f,
6644 "%sCommand Line: %s\n",
38553034 6645 prefix, strnull(cmd));
44d8db9e 6646
9fb86720 6647 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6648}
6649
6650void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6651 assert(f);
6652
4c940960 6653 prefix = strempty(prefix);
44d8db9e 6654
03677889
YW
6655 LIST_FOREACH(command, i, c)
6656 exec_command_dump(i, f, prefix);
44d8db9e 6657}
94f04347 6658
a6a80b4f
LP
6659void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6660 ExecCommand *end;
6661
6662 assert(l);
6663 assert(e);
6664
6665 if (*l) {
35b8ca3a 6666 /* It's kind of important, that we keep the order here */
cc232fa0 6667 end = LIST_FIND_TAIL(command, *l);
71fda00f 6668 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6669 } else
6670 *l = e;
6671}
6672
26fd040d
LP
6673int exec_command_set(ExecCommand *c, const char *path, ...) {
6674 va_list ap;
6675 char **l, *p;
6676
6677 assert(c);
6678 assert(path);
6679
6680 va_start(ap, path);
6681 l = strv_new_ap(path, ap);
6682 va_end(ap);
6683
6684 if (!l)
6685 return -ENOMEM;
6686
250a918d
LP
6687 p = strdup(path);
6688 if (!p) {
26fd040d
LP
6689 strv_free(l);
6690 return -ENOMEM;
6691 }
6692
6897dfe8 6693 free_and_replace(c->path, p);
26fd040d 6694
130d3d22 6695 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6696}
6697
86b23b07 6698int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6699 _cleanup_strv_free_ char **l = NULL;
86b23b07 6700 va_list ap;
86b23b07
JS
6701 int r;
6702
6703 assert(c);
6704 assert(path);
6705
6706 va_start(ap, path);
6707 l = strv_new_ap(path, ap);
6708 va_end(ap);
6709
6710 if (!l)
6711 return -ENOMEM;
6712
e287086b 6713 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6714 if (r < 0)
86b23b07 6715 return r;
86b23b07
JS
6716
6717 return 0;
6718}
6719
e8a565cb
YW
6720static void *remove_tmpdir_thread(void *p) {
6721 _cleanup_free_ char *path = p;
86b23b07 6722
e8a565cb
YW
6723 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6724 return NULL;
6725}
6726
6727static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6728 int r;
6729
6730 if (!rt)
6731 return NULL;
6732
6733 if (rt->manager)
6734 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6735
6736 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6737
6738 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6739 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6740
6741 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6742 if (r < 0)
e8a565cb 6743 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6744 else
6745 rt->tmp_dir = NULL;
e8a565cb 6746 }
613b411c 6747
56a13a49 6748 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6749 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6750
6751 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6752 if (r < 0)
e8a565cb 6753 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6754 else
6755 rt->var_tmp_dir = NULL;
e8a565cb
YW
6756 }
6757
6758 rt->id = mfree(rt->id);
6759 rt->tmp_dir = mfree(rt->tmp_dir);
6760 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6761 safe_close_pair(rt->netns_storage_socket);
a70581ff 6762 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6763 return mfree(rt);
6764}
6765
6766static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6767 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6768}
6769
56a13a49
ZJS
6770static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6771 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6772 ExecRuntime *n;
613b411c 6773
8e8009dc 6774 assert(ret);
613b411c 6775
56a13a49
ZJS
6776 id_copy = strdup(id);
6777 if (!id_copy)
6778 return -ENOMEM;
6779
8e8009dc
LP
6780 n = new(ExecRuntime, 1);
6781 if (!n)
613b411c
LP
6782 return -ENOMEM;
6783
8e8009dc 6784 *n = (ExecRuntime) {
56a13a49 6785 .id = TAKE_PTR(id_copy),
19ee48a6
YW
6786 .netns_storage_socket = PIPE_EBADF,
6787 .ipcns_storage_socket = PIPE_EBADF,
8e8009dc
LP
6788 };
6789
6790 *ret = n;
613b411c
LP
6791 return 0;
6792}
6793
e8a565cb
YW
6794static int exec_runtime_add(
6795 Manager *m,
6796 const char *id,
56a13a49
ZJS
6797 char **tmp_dir,
6798 char **var_tmp_dir,
6799 int netns_storage_socket[2],
a70581ff 6800 int ipcns_storage_socket[2],
e8a565cb
YW
6801 ExecRuntime **ret) {
6802
6803 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6804 int r;
6805
e8a565cb 6806 assert(m);
613b411c
LP
6807 assert(id);
6808
a70581ff 6809 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6810
56a13a49 6811 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6812 if (r < 0)
6813 return r;
6814
63083706 6815 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6816 if (r < 0)
6817 return r;
e8a565cb 6818
56a13a49
ZJS
6819 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6820 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6821 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6822
6823 if (netns_storage_socket) {
56a13a49
ZJS
6824 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6825 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6826 }
6827
a70581ff
XR
6828 if (ipcns_storage_socket) {
6829 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6830 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6831 }
6832
e8a565cb
YW
6833 rt->manager = m;
6834
6835 if (ret)
6836 *ret = rt;
e8a565cb 6837 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6838 TAKE_PTR(rt);
e8a565cb
YW
6839 return 0;
6840}
6841
74aaf59b
LP
6842static int exec_runtime_make(
6843 Manager *m,
6844 const ExecContext *c,
6845 const char *id,
6846 ExecRuntime **ret) {
6847
56a13a49 6848 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
19ee48a6 6849 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
e8a565cb
YW
6850 int r;
6851
6852 assert(m);
6853 assert(c);
6854 assert(id);
6855
6856 /* It is not necessary to create ExecRuntime object. */
fde36d25 6857 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
74aaf59b 6858 *ret = NULL;
e8a565cb 6859 return 0;
74aaf59b 6860 }
e8a565cb 6861
efa2f3a1
TM
6862 if (c->private_tmp &&
6863 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6864 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6865 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6866 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6867 if (r < 0)
6868 return r;
6869 }
6870
fbbb9697 6871 if (exec_needs_network_namespace(c)) {
e8a565cb
YW
6872 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6873 return -errno;
6874 }
6875
fde36d25 6876 if (exec_needs_ipc_namespace(c)) {
a70581ff
XR
6877 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6878 return -errno;
6879 }
6880
6881 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6882 if (r < 0)
6883 return r;
6884
613b411c
LP
6885 return 1;
6886}
6887
e8a565cb
YW
6888int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6889 ExecRuntime *rt;
6890 int r;
613b411c 6891
e8a565cb
YW
6892 assert(m);
6893 assert(id);
6894 assert(ret);
6895
6896 rt = hashmap_get(m->exec_runtime_by_id, id);
6897 if (rt)
387f6955 6898 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6899 goto ref;
6900
74aaf59b
LP
6901 if (!create) {
6902 *ret = NULL;
e8a565cb 6903 return 0;
74aaf59b 6904 }
e8a565cb
YW
6905
6906 /* If not found, then create a new object. */
6907 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6908 if (r < 0)
e8a565cb 6909 return r;
74aaf59b
LP
6910 if (r == 0) {
6911 /* When r == 0, it is not necessary to create ExecRuntime object. */
6912 *ret = NULL;
6913 return 0;
6914 }
613b411c 6915
e8a565cb
YW
6916ref:
6917 /* increment reference counter. */
6918 rt->n_ref++;
6919 *ret = rt;
6920 return 1;
6921}
613b411c 6922
e8a565cb
YW
6923ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6924 if (!rt)
613b411c
LP
6925 return NULL;
6926
e8a565cb 6927 assert(rt->n_ref > 0);
613b411c 6928
e8a565cb
YW
6929 rt->n_ref--;
6930 if (rt->n_ref > 0)
f2341e0a
LP
6931 return NULL;
6932
e8a565cb 6933 return exec_runtime_free(rt, destroy);
613b411c
LP
6934}
6935
e8a565cb
YW
6936int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6937 ExecRuntime *rt;
e8a565cb
YW
6938
6939 assert(m);
613b411c
LP
6940 assert(f);
6941 assert(fds);
6942
90e74a66 6943 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6944 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6945
e8a565cb
YW
6946 if (rt->tmp_dir)
6947 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6948
e8a565cb
YW
6949 if (rt->var_tmp_dir)
6950 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6951
e8a565cb
YW
6952 if (rt->netns_storage_socket[0] >= 0) {
6953 int copy;
613b411c 6954
e8a565cb
YW
6955 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6956 if (copy < 0)
6957 return copy;
613b411c 6958
e8a565cb
YW
6959 fprintf(f, " netns-socket-0=%i", copy);
6960 }
613b411c 6961
e8a565cb
YW
6962 if (rt->netns_storage_socket[1] >= 0) {
6963 int copy;
613b411c 6964
e8a565cb
YW
6965 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6966 if (copy < 0)
6967 return copy;
613b411c 6968
e8a565cb
YW
6969 fprintf(f, " netns-socket-1=%i", copy);
6970 }
6971
a70581ff
XR
6972 if (rt->ipcns_storage_socket[0] >= 0) {
6973 int copy;
6974
6975 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6976 if (copy < 0)
6977 return copy;
6978
6979 fprintf(f, " ipcns-socket-0=%i", copy);
6980 }
6981
6982 if (rt->ipcns_storage_socket[1] >= 0) {
6983 int copy;
6984
6985 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6986 if (copy < 0)
6987 return copy;
6988
6989 fprintf(f, " ipcns-socket-1=%i", copy);
6990 }
6991
e8a565cb 6992 fputc('\n', f);
613b411c
LP
6993 }
6994
6995 return 0;
6996}
6997
e8a565cb
YW
6998int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6999 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
7000 ExecRuntime *rt;
613b411c
LP
7001 int r;
7002
e8a565cb
YW
7003 /* This is for the migration from old (v237 or earlier) deserialization text.
7004 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
7005 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
7006 * so or not from the serialized text, then we always creates a new object owned by this. */
7007
7008 assert(u);
613b411c
LP
7009 assert(key);
7010 assert(value);
7011
e8a565cb
YW
7012 /* Manager manages ExecRuntime objects by the unit id.
7013 * So, we omit the serialized text when the unit does not have id (yet?)... */
7014 if (isempty(u->id)) {
7015 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
7016 return 0;
7017 }
613b411c 7018
cbc165d1
ZJS
7019 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
7020 return log_oom();
e8a565cb
YW
7021
7022 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
7023 if (!rt) {
cbc165d1 7024 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 7025 return log_oom();
613b411c 7026
e8a565cb
YW
7027 rt = rt_create;
7028 }
7029
7030 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
7031 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
7032 return -ENOMEM;
613b411c
LP
7033
7034 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
7035 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
7036 return -ENOMEM;
613b411c
LP
7037
7038 } else if (streq(key, "netns-socket-0")) {
7039 int fd;
7040
e8a565cb 7041 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 7042 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 7043 return 0;
613b411c 7044 }
e8a565cb
YW
7045
7046 safe_close(rt->netns_storage_socket[0]);
7047 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
7048
613b411c
LP
7049 } else if (streq(key, "netns-socket-1")) {
7050 int fd;
7051
e8a565cb 7052 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 7053 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 7054 return 0;
613b411c 7055 }
e8a565cb
YW
7056
7057 safe_close(rt->netns_storage_socket[1]);
7058 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 7059
613b411c
LP
7060 } else
7061 return 0;
7062
e8a565cb
YW
7063 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
7064 if (rt_create) {
7065 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
7066 if (r < 0) {
3fe91079 7067 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
7068 return 0;
7069 }
613b411c 7070
e8a565cb 7071 rt_create->manager = u->manager;
613b411c 7072
e8a565cb 7073 /* Avoid cleanup */
56a13a49 7074 TAKE_PTR(rt_create);
e8a565cb 7075 }
98b47d54 7076
e8a565cb
YW
7077 return 1;
7078}
613b411c 7079
56a13a49
ZJS
7080int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
7081 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
7082 char *id = NULL;
a70581ff 7083 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 7084 const char *p, *v = ASSERT_PTR(value);
e8a565cb 7085 size_t n;
613b411c 7086
e8a565cb 7087 assert(m);
e8a565cb 7088 assert(fds);
98b47d54 7089
e8a565cb 7090 n = strcspn(v, " ");
2f82562b 7091 id = strndupa_safe(v, n);
e8a565cb
YW
7092 if (v[n] != ' ')
7093 goto finalize;
7094 p = v + n + 1;
7095
7096 v = startswith(p, "tmp-dir=");
7097 if (v) {
7098 n = strcspn(v, " ");
56a13a49
ZJS
7099 tmp_dir = strndup(v, n);
7100 if (!tmp_dir)
7101 return log_oom();
e8a565cb
YW
7102 if (v[n] != ' ')
7103 goto finalize;
7104 p = v + n + 1;
7105 }
7106
7107 v = startswith(p, "var-tmp-dir=");
7108 if (v) {
7109 n = strcspn(v, " ");
56a13a49
ZJS
7110 var_tmp_dir = strndup(v, n);
7111 if (!var_tmp_dir)
7112 return log_oom();
e8a565cb
YW
7113 if (v[n] != ' ')
7114 goto finalize;
7115 p = v + n + 1;
7116 }
7117
7118 v = startswith(p, "netns-socket-0=");
7119 if (v) {
7120 char *buf;
7121
7122 n = strcspn(v, " ");
2f82562b 7123 buf = strndupa_safe(v, n);
c413bb28 7124
a70581ff 7125 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
7126 if (r < 0)
7127 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 7128 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 7129 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
7130 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
7131 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
7132 if (v[n] != ' ')
7133 goto finalize;
7134 p = v + n + 1;
613b411c
LP
7135 }
7136
e8a565cb
YW
7137 v = startswith(p, "netns-socket-1=");
7138 if (v) {
7139 char *buf;
98b47d54 7140
e8a565cb 7141 n = strcspn(v, " ");
2f82562b 7142 buf = strndupa_safe(v, n);
a70581ff
XR
7143
7144 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
7145 if (r < 0)
7146 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
7147 if (!fdset_contains(fds, netns_fdpair[1]))
7148 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7149 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
7150 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
7151 if (v[n] != ' ')
7152 goto finalize;
7153 p = v + n + 1;
7154 }
7155
7156 v = startswith(p, "ipcns-socket-0=");
7157 if (v) {
7158 char *buf;
7159
7160 n = strcspn(v, " ");
2f82562b 7161 buf = strndupa_safe(v, n);
a70581ff
XR
7162
7163 r = safe_atoi(buf, &ipcns_fdpair[0]);
7164 if (r < 0)
7165 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
7166 if (!fdset_contains(fds, ipcns_fdpair[0]))
7167 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
7168 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
7169 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
7170 if (v[n] != ' ')
7171 goto finalize;
7172 p = v + n + 1;
7173 }
7174
7175 v = startswith(p, "ipcns-socket-1=");
7176 if (v) {
7177 char *buf;
7178
7179 n = strcspn(v, " ");
2f82562b 7180 buf = strndupa_safe(v, n);
a70581ff
XR
7181
7182 r = safe_atoi(buf, &ipcns_fdpair[1]);
7183 if (r < 0)
7184 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
7185 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 7186 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
7187 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
7188 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 7189 }
98b47d54 7190
e8a565cb 7191finalize:
a70581ff 7192 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 7193 if (r < 0)
56a13a49
ZJS
7194 return log_debug_errno(r, "Failed to add exec-runtime: %m");
7195 return 0;
e8a565cb 7196}
613b411c 7197
e8a565cb
YW
7198void exec_runtime_vacuum(Manager *m) {
7199 ExecRuntime *rt;
e8a565cb
YW
7200
7201 assert(m);
7202
7203 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
7204
90e74a66 7205 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
7206 if (rt->n_ref > 0)
7207 continue;
7208
7209 (void) exec_runtime_free(rt, false);
7210 }
613b411c
LP
7211}
7212
b9c04eaf
YW
7213void exec_params_clear(ExecParameters *p) {
7214 if (!p)
7215 return;
7216
c3f8a065
LP
7217 p->environment = strv_free(p->environment);
7218 p->fd_names = strv_free(p->fd_names);
7219 p->fds = mfree(p->fds);
7220 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7221}
7222
bb0c0d6f
LP
7223ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7224 if (!sc)
7225 return NULL;
7226
7227 free(sc->id);
7228 free(sc->data);
7229 return mfree(sc);
7230}
7231
43144be4
LP
7232ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7233 if (!lc)
7234 return NULL;
7235
7236 free(lc->id);
7237 free(lc->path);
7238 return mfree(lc);
7239}
7240
211a3d87
LB
7241void exec_directory_done(ExecDirectory *d) {
7242 if (!d)
7243 return;
7244
7245 for (size_t i = 0; i < d->n_items; i++) {
7246 free(d->items[i].path);
7247 strv_free(d->items[i].symlinks);
7248 }
7249
7250 d->items = mfree(d->items);
7251 d->n_items = 0;
7252 d->mode = 0755;
7253}
7254
564e5c98
YW
7255static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7256 assert(d);
7257 assert(path);
7258
7259 for (size_t i = 0; i < d->n_items; i++)
7260 if (path_equal(d->items[i].path, path))
7261 return &d->items[i];
7262
7263 return NULL;
7264}
7265
7266int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
211a3d87
LB
7267 _cleanup_strv_free_ char **s = NULL;
7268 _cleanup_free_ char *p = NULL;
564e5c98
YW
7269 ExecDirectoryItem *existing;
7270 int r;
211a3d87
LB
7271
7272 assert(d);
211a3d87
LB
7273 assert(path);
7274
564e5c98
YW
7275 existing = exec_directory_find(d, path);
7276 if (existing) {
7277 r = strv_extend(&existing->symlinks, symlink);
7278 if (r < 0)
7279 return r;
7280
7281 return 0; /* existing item is updated */
7282 }
7283
211a3d87
LB
7284 p = strdup(path);
7285 if (!p)
7286 return -ENOMEM;
7287
564e5c98
YW
7288 if (symlink) {
7289 s = strv_new(symlink);
211a3d87
LB
7290 if (!s)
7291 return -ENOMEM;
7292 }
7293
564e5c98 7294 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
211a3d87
LB
7295 return -ENOMEM;
7296
564e5c98 7297 d->items[d->n_items++] = (ExecDirectoryItem) {
211a3d87
LB
7298 .path = TAKE_PTR(p),
7299 .symlinks = TAKE_PTR(s),
7300 };
7301
564e5c98 7302 return 1; /* new item is added */
211a3d87
LB
7303}
7304
a2ab603c
YW
7305static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7306 assert(a);
7307 assert(b);
7308
7309 return path_compare(a->path, b->path);
7310}
7311
7312void exec_directory_sort(ExecDirectory *d) {
7313 assert(d);
7314
7315 /* Sort the exec directories to make always parent directories processed at first in
7316 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7317 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7318 * list. See also comments in setup_exec_directory() and issue #24783. */
7319
7320 if (d->n_items <= 1)
7321 return;
7322
7323 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7324
7325 for (size_t i = 1; i < d->n_items; i++)
7326 for (size_t j = 0; j < i; j++)
7327 if (path_startswith(d->items[i].path, d->items[j].path)) {
7328 d->items[i].only_create = true;
7329 break;
7330 }
211a3d87
LB
7331}
7332
bb0c0d6f 7333DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 7334DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 7335
80876c20
LP
7336static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7337 [EXEC_INPUT_NULL] = "null",
7338 [EXEC_INPUT_TTY] = "tty",
7339 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7340 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7341 [EXEC_INPUT_SOCKET] = "socket",
7342 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7343 [EXEC_INPUT_DATA] = "data",
2038c3f5 7344 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7345};
7346
8a0867d6
LP
7347DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7348
94f04347 7349static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7350 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7351 [EXEC_OUTPUT_NULL] = "null",
80876c20 7352 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7353 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7354 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7355 [EXEC_OUTPUT_JOURNAL] = "journal",
7356 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7357 [EXEC_OUTPUT_SOCKET] = "socket",
7358 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7359 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7360 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7361 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7362};
7363
7364DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7365
7366static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7367 [EXEC_UTMP_INIT] = "init",
7368 [EXEC_UTMP_LOGIN] = "login",
7369 [EXEC_UTMP_USER] = "user",
7370};
7371
7372DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7373
7374static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7375 [EXEC_PRESERVE_NO] = "no",
7376 [EXEC_PRESERVE_YES] = "yes",
7377 [EXEC_PRESERVE_RESTART] = "restart",
7378};
7379
7380DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7381
6b7b2ed9 7382/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7383static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7384 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7385 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7386 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7387 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7388 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7389};
7390
7391DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7392
211a3d87
LB
7393/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7394static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7395 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7396 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7397 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7398 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7399 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7400};
7401
7402DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7403
6b7b2ed9
LP
7404/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7405 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7406 * directories, specifically .timer units with their timestamp touch file. */
7407static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7408 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7409 [EXEC_DIRECTORY_STATE] = "state",
7410 [EXEC_DIRECTORY_CACHE] = "cache",
7411 [EXEC_DIRECTORY_LOGS] = "logs",
7412 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7413};
7414
7415DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7416
7417/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7418 * the service payload in. */
fb2042dd
YW
7419static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7420 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7421 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7422 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7423 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7424 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7425};
7426
7427DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7428
b1edf445
LP
7429static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7430 [EXEC_KEYRING_INHERIT] = "inherit",
7431 [EXEC_KEYRING_PRIVATE] = "private",
7432 [EXEC_KEYRING_SHARED] = "shared",
7433};
7434
7435DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);