]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #21217 from keszybz/debug-test-process-util
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
b1994387 44#include "bpf-lsm.h"
8dd4c05b 45#include "cap-list.h"
430f0182 46#include "capability-util.h"
fdb3deca 47#include "cgroup-setup.h"
f4351959 48#include "chase-symlinks.h"
bb0c0d6f 49#include "chown-recursive.h"
da681e1b 50#include "cpu-set-util.h"
43144be4 51#include "creds-util.h"
6a818c3c 52#include "data-fd-util.h"
f6a6225e 53#include "def.h"
686d13b9 54#include "env-file.h"
4d1a6904 55#include "env-util.h"
17df7223 56#include "errno-list.h"
8a62620e 57#include "escape.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
bb0c0d6f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
a1164ae3 66#include "label.h"
8dd4c05b
LP
67#include "log.h"
68#include "macro.h"
e8a565cb 69#include "manager.h"
2a341bb9 70#include "manager-dump.h"
0a970718 71#include "memory-util.h"
f5947a5e 72#include "missing_fs.h"
5bead76e 73#include "missing_ioprio.h"
8dd4c05b 74#include "mkdir.h"
21935150 75#include "mount-util.h"
bb0c0d6f 76#include "mountpoint-util.h"
8dd4c05b 77#include "namespace.h"
6bedfcbb 78#include "parse-util.h"
8dd4c05b 79#include "path-util.h"
0b452006 80#include "process-util.h"
d3dcf4e3 81#include "random-util.h"
78f22b97 82#include "rlimit-util.h"
8dd4c05b 83#include "rm-rf.h"
349cc4a5 84#if HAVE_SECCOMP
3ffd4af2
LP
85#include "seccomp-util.h"
86#endif
07d46372 87#include "securebits-util.h"
8dd4c05b 88#include "selinux-util.h"
24882e06 89#include "signal-util.h"
8dd4c05b 90#include "smack-util.h"
57b7a260 91#include "socket-util.h"
fd63e712 92#include "special.h"
949befd3 93#include "stat-util.h"
8b43440b 94#include "string-table.h"
07630cea 95#include "string-util.h"
8dd4c05b 96#include "strv.h"
7ccbd1ae 97#include "syslog-util.h"
8dd4c05b 98#include "terminal-util.h"
bb0c0d6f 99#include "tmpfile-util.h"
566b7d23 100#include "umask-util.h"
2d3b784d 101#include "unit-serialize.h"
b1d4f8e1 102#include "user-util.h"
8dd4c05b 103#include "utmp-wtmp.h"
5cb5a6ff 104
e056b01d 105#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 106#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 107
531dca78
LP
108#define SNDBUF_SIZE (8*1024*1024)
109
/* Moves the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. they directly follow
 * stdin/stdout/stderr. The array is updated in place with the new fd numbers. Returns 0 on success, or a
 * negative errno-style value if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Careful: the contents of the fds array are modified! */

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied, so we got a different number. Remember
                         * the first index where this happened, and redo the pass from there. */
                        if (dup_fd != target && retry_at < 0)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
149
/* Prepares the fds for being passed to a child: the first n_socket_fds entries get O_NONBLOCK set or
 * cleared according to 'nonblock', and FD_CLOEXEC is cleared on all n_socket_fds + n_storage_fds entries.
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                bool is_socket_fd = k < n_socket_fds;

                /* O_NONBLOCK is only adjusted for the socket activation fds */
                if (is_socket_fd) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always dropped, after all these fds are meant to survive into our
                 * children */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
182
1e22b5cd 183static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
184 assert(context);
185
1e22b5cd
LP
186 if (context->stdio_as_fds)
187 return NULL;
188
80876c20
LP
189 if (context->tty_path)
190 return context->tty_path;
191
192 return "/dev/console";
193}
194
1e22b5cd
LP
195static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
196 const char *path;
197
6ea832a2
LP
198 assert(context);
199
1e22b5cd 200 path = exec_context_tty_path(context);
6ea832a2 201
1e22b5cd
LP
202 if (context->tty_vhangup) {
203 if (p && p->stdin_fd >= 0)
204 (void) terminal_vhangup_fd(p->stdin_fd);
205 else if (path)
206 (void) terminal_vhangup(path);
207 }
6ea832a2 208
1e22b5cd
LP
209 if (context->tty_reset) {
210 if (p && p->stdin_fd >= 0)
211 (void) reset_terminal_fd(p->stdin_fd, true);
212 else if (path)
213 (void) reset_terminal(path);
214 }
215
216 if (context->tty_vt_disallocate && path)
217 (void) vt_disallocate(path);
6ea832a2
LP
218}
219
6af760f3
LP
220static bool is_terminal_input(ExecInput i) {
221 return IN_SET(i,
222 EXEC_INPUT_TTY,
223 EXEC_INPUT_TTY_FORCE,
224 EXEC_INPUT_TTY_FAIL);
225}
226
3a1286b6 227static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
228 return IN_SET(o,
229 EXEC_OUTPUT_TTY,
6af760f3
LP
230 EXEC_OUTPUT_KMSG_AND_CONSOLE,
231 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
232}
233
aac8c0c3
LP
234static bool is_kmsg_output(ExecOutput o) {
235 return IN_SET(o,
236 EXEC_OUTPUT_KMSG,
237 EXEC_OUTPUT_KMSG_AND_CONSOLE);
238}
239
6af760f3
LP
240static bool exec_context_needs_term(const ExecContext *c) {
241 assert(c);
242
243 /* Return true if the execution context suggests we should set $TERM to something useful. */
244
245 if (is_terminal_input(c->std_input))
246 return true;
247
248 if (is_terminal_output(c->std_output))
249 return true;
250
251 if (is_terminal_output(c->std_error))
252 return true;
253
254 return !!c->tty_path;
3a1286b6
MS
255}
256
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the resulting fd to number 'nfd'.
 * Returns -errno if the open() fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
268
91dd5f7c
LP
269static int connect_journal_socket(
270 int fd,
271 const char *log_namespace,
272 uid_t uid,
273 gid_t gid) {
274
f36a9d59
ZJS
275 union sockaddr_union sa;
276 socklen_t sa_len;
524daa8c
ZJS
277 uid_t olduid = UID_INVALID;
278 gid_t oldgid = GID_INVALID;
91dd5f7c 279 const char *j;
524daa8c
ZJS
280 int r;
281
91dd5f7c
LP
282 j = log_namespace ?
283 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
284 "/run/systemd/journal/stdout";
285 r = sockaddr_un_set_path(&sa.un, j);
286 if (r < 0)
287 return r;
f36a9d59 288 sa_len = r;
91dd5f7c 289
cad93f29 290 if (gid_is_valid(gid)) {
524daa8c
ZJS
291 oldgid = getgid();
292
92a17af9 293 if (setegid(gid) < 0)
524daa8c
ZJS
294 return -errno;
295 }
296
cad93f29 297 if (uid_is_valid(uid)) {
524daa8c
ZJS
298 olduid = getuid();
299
92a17af9 300 if (seteuid(uid) < 0) {
524daa8c
ZJS
301 r = -errno;
302 goto restore_gid;
303 }
304 }
305
f36a9d59 306 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
524daa8c
ZJS
307
308 /* If we fail to restore the uid or gid, things will likely
309 fail later on. This should only happen if an LSM interferes. */
310
cad93f29 311 if (uid_is_valid(uid))
524daa8c
ZJS
312 (void) seteuid(olduid);
313
314 restore_gid:
cad93f29 315 if (gid_is_valid(gid))
524daa8c
ZJS
316 (void) setegid(oldgid);
317
318 return r;
319}
320
fd1f9c89 321static int connect_logger_as(
34cf6c43 322 const Unit *unit,
fd1f9c89 323 const ExecContext *context,
af635cf3 324 const ExecParameters *params,
fd1f9c89
LP
325 ExecOutput output,
326 const char *ident,
fd1f9c89
LP
327 int nfd,
328 uid_t uid,
329 gid_t gid) {
330
2ac1ff68
EV
331 _cleanup_close_ int fd = -1;
332 int r;
071830ff
LP
333
334 assert(context);
af635cf3 335 assert(params);
80876c20
LP
336 assert(output < _EXEC_OUTPUT_MAX);
337 assert(ident);
338 assert(nfd >= 0);
071830ff 339
54fe0cdb
LP
340 fd = socket(AF_UNIX, SOCK_STREAM, 0);
341 if (fd < 0)
80876c20 342 return -errno;
071830ff 343
91dd5f7c 344 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
345 if (r < 0)
346 return r;
071830ff 347
2ac1ff68 348 if (shutdown(fd, SHUT_RD) < 0)
80876c20 349 return -errno;
071830ff 350
fd1f9c89 351 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 352
2ac1ff68 353 if (dprintf(fd,
62bca2c6 354 "%s\n"
80876c20
LP
355 "%s\n"
356 "%i\n"
54fe0cdb
LP
357 "%i\n"
358 "%i\n"
359 "%i\n"
4f4a1dbf 360 "%i\n",
c867611e 361 context->syslog_identifier ?: ident,
af635cf3 362 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
363 context->syslog_priority,
364 !!context->syslog_level_prefix,
f3dc6af2 365 false,
aac8c0c3 366 is_kmsg_output(output),
2ac1ff68
EV
367 is_terminal_output(output)) < 0)
368 return -errno;
80876c20 369
2ac1ff68 370 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 371}
2ac1ff68 372
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the fd to number 'nfd'.
 * Propagates the error of open_terminal() on failure. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 385
2038c3f5 386static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
387 union sockaddr_union sa;
388 socklen_t sa_len;
15a3e96f 389 _cleanup_close_ int fd = -1;
86fca584 390 int r;
071830ff 391
80876c20 392 assert(path);
071830ff 393
2038c3f5
LP
394 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
395 flags |= O_CREAT;
396
397 fd = open(path, flags|O_NOCTTY, mode);
398 if (fd >= 0)
15a3e96f 399 return TAKE_FD(fd);
071830ff 400
2038c3f5
LP
401 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
402 return -errno;
2038c3f5
LP
403
404 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
405
86fca584
ZJS
406 r = sockaddr_un_set_path(&sa.un, path);
407 if (r < 0)
408 return r == -EINVAL ? -ENXIO : r;
409 sa_len = r;
410
2038c3f5
LP
411 fd = socket(AF_UNIX, SOCK_STREAM, 0);
412 if (fd < 0)
413 return -errno;
414
86fca584 415 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 416 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 417 * indication that this wasn't an AF_UNIX socket after all */
071830ff 418
2038c3f5
LP
419 if ((flags & O_ACCMODE) == O_RDONLY)
420 r = shutdown(fd, SHUT_WR);
421 else if ((flags & O_ACCMODE) == O_WRONLY)
422 r = shutdown(fd, SHUT_RD);
423 else
86fca584 424 r = 0;
15a3e96f 425 if (r < 0)
2038c3f5 426 return -errno;
2038c3f5 427
15a3e96f 428 return TAKE_FD(fd);
80876c20 429}
071830ff 430
08f3be7a
LP
431static int fixup_input(
432 const ExecContext *context,
433 int socket_fd,
434 bool apply_tty_stdin) {
435
436 ExecInput std_input;
437
438 assert(context);
439
440 std_input = context->std_input;
1e3ad081
LP
441
442 if (is_terminal_input(std_input) && !apply_tty_stdin)
443 return EXEC_INPUT_NULL;
071830ff 444
03fd9c49 445 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
446 return EXEC_INPUT_NULL;
447
08f3be7a
LP
448 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
449 return EXEC_INPUT_NULL;
450
03fd9c49 451 return std_input;
4f2d528d
LP
452}
453
7966a916 454static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 455
7966a916 456 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
457 return EXEC_OUTPUT_INHERIT;
458
7966a916 459 return output;
4f2d528d
LP
460}
461
a34ceba6
LP
462static int setup_input(
463 const ExecContext *context,
464 const ExecParameters *params,
52c239d7 465 int socket_fd,
2caa38e9 466 const int named_iofds[static 3]) {
a34ceba6 467
4f2d528d
LP
468 ExecInput i;
469
470 assert(context);
a34ceba6 471 assert(params);
2caa38e9 472 assert(named_iofds);
a34ceba6
LP
473
474 if (params->stdin_fd >= 0) {
475 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
476 return -errno;
477
478 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
479 if (isatty(STDIN_FILENO)) {
480 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
481 (void) reset_terminal_fd(STDIN_FILENO, true);
482 }
a34ceba6
LP
483
484 return STDIN_FILENO;
485 }
4f2d528d 486
08f3be7a 487 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
488
489 switch (i) {
071830ff 490
80876c20
LP
491 case EXEC_INPUT_NULL:
492 return open_null_as(O_RDONLY, STDIN_FILENO);
493
494 case EXEC_INPUT_TTY:
495 case EXEC_INPUT_TTY_FORCE:
496 case EXEC_INPUT_TTY_FAIL: {
046a82c1 497 int fd;
071830ff 498
1e22b5cd 499 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
500 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
501 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
502 ACQUIRE_TERMINAL_WAIT,
3a43da28 503 USEC_INFINITY);
970edce6 504 if (fd < 0)
80876c20
LP
505 return fd;
506
046a82c1 507 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
508 }
509
4f2d528d 510 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
511 assert(socket_fd >= 0);
512
4f2d528d
LP
513 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
514
52c239d7 515 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
516 assert(named_iofds[STDIN_FILENO] >= 0);
517
52c239d7
LB
518 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
519 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
520
08f3be7a
LP
521 case EXEC_INPUT_DATA: {
522 int fd;
523
524 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
525 if (fd < 0)
526 return fd;
527
528 return move_fd(fd, STDIN_FILENO, false);
529 }
530
2038c3f5
LP
531 case EXEC_INPUT_FILE: {
532 bool rw;
533 int fd;
534
535 assert(context->stdio_file[STDIN_FILENO]);
536
537 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
538 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
539
540 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
541 if (fd < 0)
542 return fd;
543
544 return move_fd(fd, STDIN_FILENO, false);
545 }
546
80876c20 547 default:
04499a70 548 assert_not_reached();
80876c20
LP
549 }
550}
551
41fc585a
LP
552static bool can_inherit_stderr_from_stdout(
553 const ExecContext *context,
554 ExecOutput o,
555 ExecOutput e) {
556
557 assert(context);
558
559 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
560 * stderr fd */
561
562 if (e == EXEC_OUTPUT_INHERIT)
563 return true;
564 if (e != o)
565 return false;
566
567 if (e == EXEC_OUTPUT_NAMED_FD)
568 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
569
8d7dab1f 570 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
571 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
572
573 return true;
574}
575
a34ceba6 576static int setup_output(
34cf6c43 577 const Unit *unit,
a34ceba6
LP
578 const ExecContext *context,
579 const ExecParameters *params,
580 int fileno,
581 int socket_fd,
2caa38e9 582 const int named_iofds[static 3],
a34ceba6 583 const char *ident,
7bce046b
LP
584 uid_t uid,
585 gid_t gid,
586 dev_t *journal_stream_dev,
587 ino_t *journal_stream_ino) {
a34ceba6 588
4f2d528d
LP
589 ExecOutput o;
590 ExecInput i;
47c1d80d 591 int r;
4f2d528d 592
f2341e0a 593 assert(unit);
80876c20 594 assert(context);
a34ceba6 595 assert(params);
80876c20 596 assert(ident);
7bce046b
LP
597 assert(journal_stream_dev);
598 assert(journal_stream_ino);
80876c20 599
a34ceba6
LP
600 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
601
602 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
603 return -errno;
604
605 return STDOUT_FILENO;
606 }
607
608 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
609 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
610 return -errno;
611
612 return STDERR_FILENO;
613 }
614
08f3be7a 615 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 616 o = fixup_output(context->std_output, socket_fd);
4f2d528d 617
eb17e935
MS
618 if (fileno == STDERR_FILENO) {
619 ExecOutput e;
620 e = fixup_output(context->std_error, socket_fd);
80876c20 621
eb17e935
MS
622 /* This expects the input and output are already set up */
623
624 /* Don't change the stderr file descriptor if we inherit all
625 * the way and are not on a tty */
626 if (e == EXEC_OUTPUT_INHERIT &&
627 o == EXEC_OUTPUT_INHERIT &&
628 i == EXEC_INPUT_NULL &&
629 !is_terminal_input(context->std_input) &&
7966a916 630 getppid() != 1)
eb17e935
MS
631 return fileno;
632
633 /* Duplicate from stdout if possible */
41fc585a 634 if (can_inherit_stderr_from_stdout(context, o, e))
eb17e935 635 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 636
eb17e935 637 o = e;
80876c20 638
eb17e935 639 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
640 /* If input got downgraded, inherit the original value */
641 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 642 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 643
08f3be7a
LP
644 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
645 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 646 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 647
acb591e4
LP
648 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
649 if (getppid() != 1)
eb17e935 650 return fileno;
94f04347 651
eb17e935
MS
652 /* We need to open /dev/null here anew, to get the right access mode. */
653 return open_null_as(O_WRONLY, fileno);
071830ff 654 }
94f04347 655
eb17e935 656 switch (o) {
80876c20
LP
657
658 case EXEC_OUTPUT_NULL:
eb17e935 659 return open_null_as(O_WRONLY, fileno);
80876c20
LP
660
661 case EXEC_OUTPUT_TTY:
4f2d528d 662 if (is_terminal_input(i))
eb17e935 663 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
664
665 /* We don't reset the terminal if this is just about output */
1e22b5cd 666 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 667
9a6bca7a 668 case EXEC_OUTPUT_KMSG:
28dbc1e8 669 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
670 case EXEC_OUTPUT_JOURNAL:
671 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 672 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 673 if (r < 0) {
7966a916
ZJS
674 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
675 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 676 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
677 } else {
678 struct stat st;
679
680 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
681 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
682 * services to detect whether they are connected to the journal or not.
683 *
684 * If both stdout and stderr are connected to a stream then let's make sure to store the data
685 * about STDERR as that's usually the best way to do logging. */
7bce046b 686
ab2116b1
LP
687 if (fstat(fileno, &st) >= 0 &&
688 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
689 *journal_stream_dev = st.st_dev;
690 *journal_stream_ino = st.st_ino;
691 }
47c1d80d
MS
692 }
693 return r;
4f2d528d
LP
694
695 case EXEC_OUTPUT_SOCKET:
696 assert(socket_fd >= 0);
e75a9ed1 697
eb17e935 698 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 699
52c239d7 700 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
701 assert(named_iofds[fileno] >= 0);
702
52c239d7
LB
703 (void) fd_nonblock(named_iofds[fileno], false);
704 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
705
566b7d23 706 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
707 case EXEC_OUTPUT_FILE_APPEND:
708 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 709 bool rw;
566b7d23 710 int fd, flags;
2038c3f5
LP
711
712 assert(context->stdio_file[fileno]);
713
714 rw = context->std_input == EXEC_INPUT_FILE &&
715 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
716
717 if (rw)
718 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
719
566b7d23
ZD
720 flags = O_WRONLY;
721 if (o == EXEC_OUTPUT_FILE_APPEND)
722 flags |= O_APPEND;
8d7dab1f
LW
723 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
724 flags |= O_TRUNC;
566b7d23
ZD
725
726 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
727 if (fd < 0)
728 return fd;
729
566b7d23 730 return move_fd(fd, fileno, 0);
2038c3f5
LP
731 }
732
94f04347 733 default:
04499a70 734 assert_not_reached();
94f04347 735 }
071830ff
LP
736}
737
02a51aba 738static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 739 int r;
02a51aba
LP
740
741 assert(fd >= 0);
02a51aba 742
1ff74fb6 743 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
744 if (isatty(fd) < 1) {
745 if (IN_SET(errno, EINVAL, ENOTTY))
746 return 0; /* not a tty */
1ff74fb6 747
02a51aba 748 return -errno;
4b3b5bc7 749 }
02a51aba 750
4b3b5bc7 751 /* This might fail. What matters are the results. */
f2df231f 752 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
753 if (r < 0)
754 return r;
02a51aba 755
4b3b5bc7 756 return 1;
02a51aba
LP
757}
758
7d5ceb64 759static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
760 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
761 int r;
80876c20 762
80876c20
LP
763 assert(_saved_stdin);
764 assert(_saved_stdout);
765
af6da548
LP
766 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
767 if (saved_stdin < 0)
768 return -errno;
80876c20 769
af6da548 770 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
771 if (saved_stdout < 0)
772 return -errno;
80876c20 773
8854d795 774 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
775 if (fd < 0)
776 return fd;
80876c20 777
af6da548
LP
778 r = chown_terminal(fd, getuid());
779 if (r < 0)
3d18b167 780 return r;
02a51aba 781
3d18b167
LP
782 r = reset_terminal_fd(fd, true);
783 if (r < 0)
784 return r;
80876c20 785
2b33ab09 786 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 787 fd = -1;
2b33ab09
LP
788 if (r < 0)
789 return r;
80876c20
LP
790
791 *_saved_stdin = saved_stdin;
792 *_saved_stdout = saved_stdout;
793
3d18b167 794 saved_stdin = saved_stdout = -1;
80876c20 795
3d18b167 796 return 0;
80876c20
LP
797}
798
63d77c92 799static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
800 assert(err < 0);
801
802 if (err == -ETIMEDOUT)
63d77c92 803 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
804 else {
805 errno = -err;
63d77c92 806 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
807 }
808}
809
63d77c92 810static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 811 _cleanup_close_ int fd = -1;
80876c20 812
3b20f877 813 assert(vc);
80876c20 814
7d5ceb64 815 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 816 if (fd < 0)
3b20f877 817 return;
80876c20 818
63d77c92 819 write_confirm_error_fd(err, fd, u);
af6da548 820}
80876c20 821
/* Releases the terminal and puts the saved stdin/stdout fds back in place, where they are valid. The
 * saved fds are closed and invalidated in any case. Returns 0 on success, or the negative errno of the
 * last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
843
/* Possible outcomes of asking the user for confirmation before spawning a command */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
849
eedf223a 850static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 851 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 852 _cleanup_free_ char *e = NULL;
3b20f877 853 char c;
af6da548 854
3b20f877 855 /* For any internal errors, assume a positive response. */
7d5ceb64 856 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 857 if (r < 0) {
63d77c92 858 write_confirm_error(r, vc, u);
3b20f877
FB
859 return CONFIRM_EXECUTE;
860 }
af6da548 861
b0eb2944
FB
862 /* confirm_spawn might have been disabled while we were sleeping. */
863 if (manager_is_confirm_spawn_disabled(u->manager)) {
864 r = 1;
865 goto restore_stdio;
866 }
af6da548 867
2bcd3c26
FB
868 e = ellipsize(cmdline, 60, 100);
869 if (!e) {
870 log_oom();
871 r = CONFIRM_EXECUTE;
872 goto restore_stdio;
873 }
af6da548 874
d172b175 875 for (;;) {
539622bd 876 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 877 if (r < 0) {
63d77c92 878 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
879 r = CONFIRM_EXECUTE;
880 goto restore_stdio;
881 }
af6da548 882
d172b175 883 switch (c) {
b0eb2944
FB
884 case 'c':
885 printf("Resuming normal execution.\n");
886 manager_disable_confirm_spawn();
887 r = 1;
888 break;
dd6f9ac0
FB
889 case 'D':
890 unit_dump(u, stdout, " ");
891 continue; /* ask again */
d172b175
FB
892 case 'f':
893 printf("Failing execution.\n");
894 r = CONFIRM_PRETEND_FAILURE;
895 break;
896 case 'h':
b0eb2944
FB
897 printf(" c - continue, proceed without asking anymore\n"
898 " D - dump, show the state of the unit\n"
dd6f9ac0 899 " f - fail, don't execute the command and pretend it failed\n"
d172b175 900 " h - help\n"
eedf223a 901 " i - info, show a short summary of the unit\n"
56fde33a 902 " j - jobs, show jobs that are in progress\n"
d172b175
FB
903 " s - skip, don't execute the command and pretend it succeeded\n"
904 " y - yes, execute the command\n");
dd6f9ac0 905 continue; /* ask again */
eedf223a
FB
906 case 'i':
907 printf(" Description: %s\n"
908 " Unit: %s\n"
909 " Command: %s\n",
910 u->id, u->description, cmdline);
911 continue; /* ask again */
56fde33a
FB
912 case 'j':
913 manager_dump_jobs(u->manager, stdout, " ");
914 continue; /* ask again */
539622bd
FB
915 case 'n':
916 /* 'n' was removed in favor of 'f'. */
917 printf("Didn't understand 'n', did you mean 'f'?\n");
918 continue; /* ask again */
d172b175
FB
919 case 's':
920 printf("Skipping execution.\n");
921 r = CONFIRM_PRETEND_SUCCESS;
922 break;
923 case 'y':
924 r = CONFIRM_EXECUTE;
925 break;
926 default:
04499a70 927 assert_not_reached();
d172b175 928 }
3b20f877 929 break;
3b20f877 930 }
af6da548 931
3b20f877 932restore_stdio:
af6da548 933 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 934 return r;
80876c20
LP
935}
936
4d885bd3
DH
937static int get_fixed_user(const ExecContext *c, const char **user,
938 uid_t *uid, gid_t *gid,
939 const char **home, const char **shell) {
81a2b7ce 940 int r;
4d885bd3 941 const char *name;
81a2b7ce 942
4d885bd3 943 assert(c);
81a2b7ce 944
23deef88
LP
945 if (!c->user)
946 return 0;
947
4d885bd3
DH
948 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
949 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 950
23deef88 951 name = c->user;
fafff8f1 952 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
953 if (r < 0)
954 return r;
81a2b7ce 955
4d885bd3
DH
956 *user = name;
957 return 0;
958}
959
960static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
961 int r;
962 const char *name;
963
964 assert(c);
965
966 if (!c->group)
967 return 0;
968
969 name = c->group;
fafff8f1 970 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
971 if (r < 0)
972 return r;
973
974 *group = name;
975 return 0;
976}
977
cdc5d5c5
DH
978static int get_supplementary_groups(const ExecContext *c, const char *user,
979 const char *group, gid_t gid,
980 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
981 char **i;
982 int r, k = 0;
983 int ngroups_max;
984 bool keep_groups = false;
985 gid_t *groups = NULL;
986 _cleanup_free_ gid_t *l_gids = NULL;
987
988 assert(c);
989
bbeea271
DH
990 /*
991 * If user is given, then lookup GID and supplementary groups list.
992 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
993 * here and as early as possible so we keep the list of supplementary
994 * groups of the caller.
bbeea271
DH
995 */
996 if (user && gid_is_valid(gid) && gid != 0) {
997 /* First step, initialize groups from /etc/groups */
998 if (initgroups(user, gid) < 0)
999 return -errno;
1000
1001 keep_groups = true;
1002 }
1003
ac6e8be6 1004 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1005 return 0;
1006
366ddd25
DH
1007 /*
1008 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1009 * be positive, otherwise fail.
1010 */
1011 errno = 0;
1012 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1013 if (ngroups_max <= 0)
1014 return errno_or_else(EOPNOTSUPP);
366ddd25 1015
4d885bd3
DH
1016 l_gids = new(gid_t, ngroups_max);
1017 if (!l_gids)
1018 return -ENOMEM;
81a2b7ce 1019
4d885bd3
DH
1020 if (keep_groups) {
1021 /*
1022 * Lookup the list of groups that the user belongs to, we
1023 * avoid NSS lookups here too for gid=0.
1024 */
1025 k = ngroups_max;
1026 if (getgrouplist(user, gid, l_gids, &k) < 0)
1027 return -EINVAL;
1028 } else
1029 k = 0;
81a2b7ce 1030
4d885bd3
DH
1031 STRV_FOREACH(i, c->supplementary_groups) {
1032 const char *g;
81a2b7ce 1033
4d885bd3
DH
1034 if (k >= ngroups_max)
1035 return -E2BIG;
81a2b7ce 1036
4d885bd3 1037 g = *i;
fafff8f1 1038 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1039 if (r < 0)
1040 return r;
81a2b7ce 1041
4d885bd3
DH
1042 k++;
1043 }
81a2b7ce 1044
4d885bd3
DH
1045 /*
1046 * Sets ngids to zero to drop all supplementary groups, happens
1047 * when we are under root and SupplementaryGroups= is empty.
1048 */
1049 if (k == 0) {
1050 *ngids = 0;
1051 return 0;
1052 }
81a2b7ce 1053
4d885bd3
DH
1054 /* Otherwise get the final list of supplementary groups */
1055 groups = memdup(l_gids, sizeof(gid_t) * k);
1056 if (!groups)
1057 return -ENOMEM;
1058
1059 *supplementary_gids = groups;
1060 *ngids = k;
1061
1062 groups = NULL;
1063
1064 return 0;
1065}
1066
/* Applies the group credentials of the process: first the supplementary group list (only when
 * one was passed, i.e. ngids > 0), then the real, effective and saved GID (only when gid is
 * valid). Returns 0 on success, the negative result of maybe_setgroups() or -errno of
 * setresgid() on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* A non-empty SupplementaryGroups= list is installed first */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* No valid GID means there is nothing further to switch */
        if (!gid_is_valid(gid))
                return 0;

        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1085
dbdc4098
TK
/* Updates the securebits of the calling process: the bits selected by 'mask' are cleared, the
 * ones in 'bits' are set, all others are kept as reported by PR_GET_SECUREBITS. Returns 1 if the
 * value was changed through PR_SET_SECUREBITS, 0 if it already matched, and -errno if either
 * prctl() call failed. */
static int set_securebits(int bits, int mask) {
        int before, wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        wanted = (before & ~mask) | bits;
        if (wanted != before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        /* Nothing to change */
        return 0;
}
1099
81a2b7ce 1100static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1101 assert(context);
dbdc4098 1102 int r;
81a2b7ce 1103
4d885bd3
DH
1104 if (!uid_is_valid(uid))
1105 return 0;
1106
479050b3 1107 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1108 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1109 * required, so we also need keep-caps in this case.
1110 */
81a2b7ce 1111
dbdc4098 1112 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1113
1114 /* First step: If we need to keep capabilities but
1115 * drop privileges we need to make sure we keep our
cbb21cca 1116 * caps, while we drop privileges. */
693ced48 1117 if (uid != 0) {
dbdc4098
TK
1118 /* Add KEEP_CAPS to the securebits */
1119 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1120 if (r < 0)
1121 return r;
693ced48 1122 }
81a2b7ce
LP
1123 }
1124
479050b3 1125 /* Second step: actually set the uids */
81a2b7ce
LP
1126 if (setresuid(uid, uid, uid) < 0)
1127 return -errno;
1128
1129 /* At this point we should have all necessary capabilities but
1130 are otherwise a normal user. However, the caps might got
1131 corrupted due to the setresuid() so we need clean them up
1132 later. This is done outside of this call. */
1133
1134 return 0;
1135}
1136
#if HAVE_PAM

/* PAM conversation callback used by setup_pam(). Conversations are not supported: all four
 * arguments are ignored and PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1151
5b6319dc
LP
/* Opens a PAM session for the service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") that stays around to clean the session up again.
 *
 *   name       PAM service name, passed to pam_start()
 *   user       user name, passed to pam_start()
 *   uid, gid   credentials the helper process switches to before waiting
 *   tty        TTY path to register as PAM_TTY; if NULL the TTY name of stdin is used, if any
 *   env        in: environment exported to PAM via pam_putenv(); out: on success replaced by the
 *              list returned by pam_getenvlist()
 *   fds, n_fds file descriptors that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success. On failure a negative errno-style
 * value is returned; PAM errors are reported as -EPERM. Without PAM support compiled in this is
 * a NOP returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we were passed to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() returns 0 on success and a positive error number on
                                 * failure; it never returns a negative value and does not set
                                 * errno, hence look at the return value directly. */
                                r = sigwait(&ss, &sig);
                                if (r == 0)
                                        break;
                                if (r != EINTR)
                                        goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment PAM gave us back to the caller */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1375
5d6b1584
LP
/* Renames the current process to "(<name>)", where <name> is derived from the last component of
 * 'path'. If that component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char label[11];
        const char *base, *tail;
        size_t len, n;

        /* The result has to fit into 10 characters (i.e. the length of "/sbin/init") so that it
         * looks pretty in /bin/ps: two parentheses plus at most 8 characters of the name. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* If the name is too long keep its end, which is usually the more interesting part since
         * the beginning might just be "systemd-". */
        len = strlen(base);
        n = len > 8 ? 8 : len;
        tail = base + (len - n);

        label[0] = '(';
        memcpy(label + 1, tail, n);
        label[n + 1] = ')';
        label[n + 2] = '\0';

        rename_process(label);
}
1406
469830d1
LP
1407static bool context_has_address_families(const ExecContext *c) {
1408 assert(c);
1409
6b000af4 1410 return c->address_families_allow_list ||
469830d1
LP
1411 !set_isempty(c->address_families);
1412}
1413
1414static bool context_has_syscall_filters(const ExecContext *c) {
1415 assert(c);
1416
6b000af4 1417 return c->syscall_allow_list ||
8cfa775f 1418 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1419}
1420
9df2cdd8
TM
1421static bool context_has_syscall_logs(const ExecContext *c) {
1422 assert(c);
1423
1424 return c->syscall_log_allow_list ||
1425 !hashmap_isempty(c->syscall_log);
1426}
1427
469830d1
LP
1428static bool context_has_no_new_privileges(const ExecContext *c) {
1429 assert(c);
1430
1431 if (c->no_new_privileges)
1432 return true;
1433
1434 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435 return false;
1436
1437 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1438 return c->lock_personality ||
469830d1 1439 c->memory_deny_write_execute ||
0538d2a8 1440 c->private_devices ||
fc64760d 1441 c->protect_clock ||
0538d2a8 1442 c->protect_hostname ||
469830d1
LP
1443 c->protect_kernel_tunables ||
1444 c->protect_kernel_modules ||
84703040 1445 c->protect_kernel_logs ||
0538d2a8
YW
1446 context_has_address_families(c) ||
1447 exec_context_restrict_namespaces_set(c) ||
1448 c->restrict_realtime ||
1449 c->restrict_suid_sgid ||
78e864e5 1450 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1451 context_has_syscall_filters(c) ||
1452 context_has_syscall_logs(c);
469830d1
LP
1453}
1454
bb0c0d6f
LP
1455static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457 assert(context);
1458
1459 return !hashmap_isempty(context->set_credentials) ||
43144be4 1460 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1461}
1462
349cc4a5 1463#if HAVE_SECCOMP
17df7223 1464
83f12b27 1465static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1466
1467 if (is_seccomp_available())
1468 return false;
1469
f673b62d 1470 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1471 return true;
83f12b27
FS
1472}
1473
165a31c0 1474static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1475 uint32_t negative_action, default_action, action;
165a31c0 1476 int r;
8351ceae 1477
469830d1 1478 assert(u);
c0467cf3 1479 assert(c);
8351ceae 1480
469830d1 1481 if (!context_has_syscall_filters(c))
83f12b27
FS
1482 return 0;
1483
469830d1
LP
1484 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485 return 0;
e9642be2 1486
005bfaf1 1487 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1488
6b000af4 1489 if (c->syscall_allow_list) {
469830d1
LP
1490 default_action = negative_action;
1491 action = SCMP_ACT_ALLOW;
7c66bae2 1492 } else {
469830d1
LP
1493 default_action = SCMP_ACT_ALLOW;
1494 action = negative_action;
57183d11 1495 }
8351ceae 1496
165a31c0 1497 if (needs_ambient_hack) {
6b000af4 1498 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1499 if (r < 0)
1500 return r;
1501 }
1502
b54f36c6 1503 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1504}
1505
9df2cdd8
TM
1506static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507#ifdef SCMP_ACT_LOG
1508 uint32_t default_action, action;
1509#endif
1510
1511 assert(u);
1512 assert(c);
1513
1514 if (!context_has_syscall_logs(c))
1515 return 0;
1516
1517#ifdef SCMP_ACT_LOG
1518 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519 return 0;
1520
1521 if (c->syscall_log_allow_list) {
1522 /* Log nothing but the ones listed */
1523 default_action = SCMP_ACT_ALLOW;
1524 action = SCMP_ACT_LOG;
1525 } else {
1526 /* Log everything but the ones listed */
1527 default_action = SCMP_ACT_LOG;
1528 action = SCMP_ACT_ALLOW;
1529 }
1530
1531 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532#else
1533 /* old libseccomp */
1534 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535 return 0;
1536#endif
1537}
1538
469830d1
LP
1539static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540 assert(u);
4298d0b5
LP
1541 assert(c);
1542
469830d1 1543 if (set_isempty(c->syscall_archs))
83f12b27
FS
1544 return 0;
1545
469830d1
LP
1546 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547 return 0;
4298d0b5 1548
469830d1
LP
1549 return seccomp_restrict_archs(c->syscall_archs);
1550}
4298d0b5 1551
469830d1
LP
1552static int apply_address_families(const Unit* u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
4298d0b5 1555
469830d1
LP
1556 if (!context_has_address_families(c))
1557 return 0;
4298d0b5 1558
469830d1
LP
1559 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560 return 0;
4298d0b5 1561
6b000af4 1562 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1563}
4298d0b5 1564
83f12b27 1565static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1566 assert(u);
f3e43635
TM
1567 assert(c);
1568
469830d1 1569 if (!c->memory_deny_write_execute)
83f12b27
FS
1570 return 0;
1571
469830d1
LP
1572 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573 return 0;
f3e43635 1574
469830d1 1575 return seccomp_memory_deny_write_execute();
f3e43635
TM
1576}
1577
83f12b27 1578static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1579 assert(u);
f4170c67
LP
1580 assert(c);
1581
469830d1 1582 if (!c->restrict_realtime)
83f12b27
FS
1583 return 0;
1584
469830d1
LP
1585 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586 return 0;
f4170c67 1587
469830d1 1588 return seccomp_restrict_realtime();
f4170c67
LP
1589}
1590
f69567cb
LP
1591static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592 assert(u);
1593 assert(c);
1594
1595 if (!c->restrict_suid_sgid)
1596 return 0;
1597
1598 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599 return 0;
1600
1601 return seccomp_restrict_suid_sgid();
1602}
1603
59e856c7 1604static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1605 assert(u);
59eeb84b
LP
1606 assert(c);
1607
1608 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609 * let's protect even those systems where this is left on in the kernel. */
1610
469830d1 1611 if (!c->protect_kernel_tunables)
59eeb84b
LP
1612 return 0;
1613
469830d1
LP
1614 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615 return 0;
59eeb84b 1616
469830d1 1617 return seccomp_protect_sysctl();
59eeb84b
LP
1618}
1619
59e856c7 1620static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1621 assert(u);
502d704e
DH
1622 assert(c);
1623
25a8d8a0 1624 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1625
469830d1
LP
1626 if (!c->protect_kernel_modules)
1627 return 0;
1628
502d704e
DH
1629 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630 return 0;
1631
b54f36c6 1632 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1633}
1634
84703040
KK
1635static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636 assert(u);
1637 assert(c);
1638
1639 if (!c->protect_kernel_logs)
1640 return 0;
1641
1642 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643 return 0;
1644
1645 return seccomp_protect_syslog();
1646}
1647
daf8f72b 1648static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1649 assert(u);
1650 assert(c);
1651
1652 if (!c->protect_clock)
1653 return 0;
1654
1655 if (skip_seccomp_unavailable(u, "ProtectClock="))
1656 return 0;
1657
1658 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659}
1660
59e856c7 1661static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1662 assert(u);
ba128bb8
LP
1663 assert(c);
1664
8f81a5f6 1665 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1666
469830d1
LP
1667 if (!c->private_devices)
1668 return 0;
1669
ba128bb8
LP
1670 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671 return 0;
1672
b54f36c6 1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1674}
1675
34cf6c43 1676static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1677 assert(u);
add00535
LP
1678 assert(c);
1679
1680 if (!exec_context_restrict_namespaces_set(c))
1681 return 0;
1682
1683 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684 return 0;
1685
1686 return seccomp_restrict_namespaces(c->restrict_namespaces);
1687}
1688
b1994387
ILG
1689#if HAVE_LIBBPF
1690static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
1691 if (lsm_bpf_supported())
1692 return false;
1693
1694 log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
1695 return true;
1696}
1697
1698static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1699 assert(u);
1700 assert(c);
1701
1702 if (!exec_context_restrict_filesystems_set(c))
1703 return 0;
1704
1705 if (skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
1706 return 0;
1707
1708 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1709}
1710#endif
1711
78e864e5 1712static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1713 unsigned long personality;
1714 int r;
78e864e5
TM
1715
1716 assert(u);
1717 assert(c);
1718
1719 if (!c->lock_personality)
1720 return 0;
1721
1722 if (skip_seccomp_unavailable(u, "LockPersonality="))
1723 return 0;
1724
e8132d63
LP
1725 personality = c->personality;
1726
1727 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1728 if (personality == PERSONALITY_INVALID) {
1729
1730 r = opinionated_personality(&personality);
1731 if (r < 0)
1732 return r;
1733 }
78e864e5
TM
1734
1735 return seccomp_lock_personality(personality);
1736}
1737
c0467cf3 1738#endif
8351ceae 1739
daf8f72b 1740static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1741 assert(u);
1742 assert(c);
1743
1744 if (!c->protect_hostname)
1745 return 0;
1746
1747 if (ns_type_supported(NAMESPACE_UTS)) {
1748 if (unshare(CLONE_NEWUTS) < 0) {
1749 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1750 *ret_exit_status = EXIT_NAMESPACE;
1751 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1752 }
1753
1754 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1755 }
1756 } else
1757 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1758
1759#if HAVE_SECCOMP
8f3e342f
ZJS
1760 int r;
1761
daf8f72b
LP
1762 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1763 return 0;
1764
1765 r = seccomp_protect_hostname();
1766 if (r < 0) {
1767 *ret_exit_status = EXIT_SECCOMP;
1768 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1769 }
1770#endif
1771
1772 return 0;
1773}
1774
3042bbeb 1775static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1776 assert(idle_pipe);
1777
54eb2300
LP
1778 idle_pipe[1] = safe_close(idle_pipe[1]);
1779 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1780
1781 if (idle_pipe[0] >= 0) {
1782 int r;
1783
1784 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1785
1786 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1787 ssize_t n;
1788
31a7eb86 1789 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1790 n = write(idle_pipe[3], "x", 1);
1791 if (n > 0)
cd972d69 1792 /* Wait for systemd to react to the signal above. */
54756dce 1793 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1794 }
1795
54eb2300 1796 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1797
1798 }
1799
54eb2300 1800 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1801}
1802
fb2042dd
YW
1803static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1804
7cae38c4 1805static int build_environment(
34cf6c43 1806 const Unit *u,
9fa95f85 1807 const ExecContext *c,
1e22b5cd 1808 const ExecParameters *p,
da6053d0 1809 size_t n_fds,
7cae38c4
LP
1810 const char *home,
1811 const char *username,
1812 const char *shell,
7bce046b
LP
1813 dev_t journal_stream_dev,
1814 ino_t journal_stream_ino,
7cae38c4
LP
1815 char ***ret) {
1816
1817 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1818 size_t n_env = 0;
7cae38c4
LP
1819 char *x;
1820
4b58153d 1821 assert(u);
7cae38c4 1822 assert(c);
7c1cb6f1 1823 assert(p);
7cae38c4
LP
1824 assert(ret);
1825
dc4e2940 1826#define N_ENV_VARS 17
8d5bb13d 1827 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1828 if (!our_env)
1829 return -ENOMEM;
1830
1831 if (n_fds > 0) {
8dd4c05b
LP
1832 _cleanup_free_ char *joined = NULL;
1833
df0ff127 1834 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1835 return -ENOMEM;
1836 our_env[n_env++] = x;
1837
da6053d0 1838 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1839 return -ENOMEM;
1840 our_env[n_env++] = x;
8dd4c05b 1841
1e22b5cd 1842 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1843 if (!joined)
1844 return -ENOMEM;
1845
605405c6 1846 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1847 if (!x)
1848 return -ENOMEM;
1849 our_env[n_env++] = x;
7cae38c4
LP
1850 }
1851
b08af3b1 1852 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1853 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1854 return -ENOMEM;
1855 our_env[n_env++] = x;
1856
1e22b5cd 1857 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1858 return -ENOMEM;
1859 our_env[n_env++] = x;
1860 }
1861
fd63e712
LP
1862 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1863 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1864 * check the database directly. */
ac647978 1865 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1866 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1867 if (!x)
1868 return -ENOMEM;
1869 our_env[n_env++] = x;
1870 }
1871
7cae38c4 1872 if (home) {
b910cc72 1873 x = strjoin("HOME=", home);
7cae38c4
LP
1874 if (!x)
1875 return -ENOMEM;
7bbead1d 1876
4ff361cc 1877 path_simplify(x + 5);
7cae38c4
LP
1878 our_env[n_env++] = x;
1879 }
1880
1881 if (username) {
b910cc72 1882 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886
b910cc72 1887 x = strjoin("USER=", username);
7cae38c4
LP
1888 if (!x)
1889 return -ENOMEM;
1890 our_env[n_env++] = x;
1891 }
1892
1893 if (shell) {
b910cc72 1894 x = strjoin("SHELL=", shell);
7cae38c4
LP
1895 if (!x)
1896 return -ENOMEM;
7bbead1d 1897
4ff361cc 1898 path_simplify(x + 6);
7cae38c4
LP
1899 our_env[n_env++] = x;
1900 }
1901
4b58153d
LP
1902 if (!sd_id128_is_null(u->invocation_id)) {
1903 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1904 return -ENOMEM;
1905
1906 our_env[n_env++] = x;
1907 }
1908
6af760f3
LP
1909 if (exec_context_needs_term(c)) {
1910 const char *tty_path, *term = NULL;
1911
1912 tty_path = exec_context_tty_path(c);
1913
e8cf09b2
LP
1914 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1915 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1916 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1917
e8cf09b2 1918 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1919 term = getenv("TERM");
e8cf09b2 1920
6af760f3
LP
1921 if (!term)
1922 term = default_term_for_tty(tty_path);
7cae38c4 1923
b910cc72 1924 x = strjoin("TERM=", term);
7cae38c4
LP
1925 if (!x)
1926 return -ENOMEM;
1927 our_env[n_env++] = x;
1928 }
1929
7bce046b
LP
1930 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1931 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1932 return -ENOMEM;
1933
1934 our_env[n_env++] = x;
1935 }
1936
91dd5f7c
LP
1937 if (c->log_namespace) {
1938 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1939 if (!x)
1940 return -ENOMEM;
1941
1942 our_env[n_env++] = x;
1943 }
1944
5b10116e 1945 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1946 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1947 const char *n;
1948
1949 if (!p->prefix[t])
1950 continue;
1951
211a3d87 1952 if (c->directories[t].n_items == 0)
fb2042dd
YW
1953 continue;
1954
1955 n = exec_directory_env_name_to_string(t);
1956 if (!n)
1957 continue;
1958
211a3d87
LB
1959 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1960 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1961
211a3d87
LB
1962 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1963 if (!prefixed)
1964 return -ENOMEM;
1965
1966 if (!strextend_with_separator(&joined, ":", prefixed))
1967 return -ENOMEM;
1968 }
fb2042dd
YW
1969
1970 x = strjoin(n, "=", joined);
1971 if (!x)
1972 return -ENOMEM;
1973
1974 our_env[n_env++] = x;
1975 }
1976
bb0c0d6f
LP
1977 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1978 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1979 if (!x)
1980 return -ENOMEM;
1981
1982 our_env[n_env++] = x;
1983 }
1984
dc4e2940
YW
1985 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1986 return -ENOMEM;
1987
1988 our_env[n_env++] = x;
1989
7cae38c4 1990 our_env[n_env++] = NULL;
8d5bb13d
LP
1991 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1992#undef N_ENV_VARS
7cae38c4 1993
ae2a15bc 1994 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1995
1996 return 0;
1997}
1998
b4c14404
FB
1999static int build_pass_environment(const ExecContext *c, char ***ret) {
2000 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2001 size_t n_env = 0;
b4c14404
FB
2002 char **i;
2003
2004 STRV_FOREACH(i, c->pass_environment) {
2005 _cleanup_free_ char *x = NULL;
2006 char *v;
2007
2008 v = getenv(*i);
2009 if (!v)
2010 continue;
605405c6 2011 x = strjoin(*i, "=", v);
b4c14404
FB
2012 if (!x)
2013 return -ENOMEM;
00819cc1 2014
319a4f4b 2015 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2016 return -ENOMEM;
00819cc1 2017
1cc6c93a 2018 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2019 pass_env[n_env] = NULL;
b4c14404
FB
2020 }
2021
ae2a15bc 2022 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2023
2024 return 0;
2025}
2026
5e8deb94 2027bool exec_needs_mount_namespace(
8b44a3d2
LP
2028 const ExecContext *context,
2029 const ExecParameters *params,
4657abb5 2030 const ExecRuntime *runtime) {
8b44a3d2
LP
2031
2032 assert(context);
8b44a3d2 2033
915e6d16
LP
2034 if (context->root_image)
2035 return true;
2036
2a624c36
AP
2037 if (!strv_isempty(context->read_write_paths) ||
2038 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2039 !strv_isempty(context->inaccessible_paths) ||
2040 !strv_isempty(context->exec_paths) ||
2041 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2042 return true;
2043
42b1d8e0 2044 if (context->n_bind_mounts > 0)
d2d6c096
LP
2045 return true;
2046
2abd4e38
YW
2047 if (context->n_temporary_filesystems > 0)
2048 return true;
2049
b3d13314
LB
2050 if (context->n_mount_images > 0)
2051 return true;
2052
93f59701
LB
2053 if (context->n_extension_images > 0)
2054 return true;
2055
37ed15d7 2056 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2057 return true;
2058
2059 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2060 return true;
2061
8b44a3d2 2062 if (context->private_devices ||
228af36f 2063 context->private_mounts ||
8b44a3d2 2064 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2065 context->protect_home != PROTECT_HOME_NO ||
2066 context->protect_kernel_tunables ||
c575770b 2067 context->protect_kernel_modules ||
94a7b275 2068 context->protect_kernel_logs ||
4e399953
LP
2069 context->protect_control_groups ||
2070 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2071 context->proc_subset != PROC_SUBSET_ALL ||
2072 context->private_ipc ||
2073 context->ipc_namespace_path)
8b44a3d2
LP
2074 return true;
2075
37c56f89 2076 if (context->root_directory) {
5e98086d 2077 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2078 return true;
2079
5b10116e 2080 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2081 if (params && !params->prefix[t])
37c56f89
YW
2082 continue;
2083
211a3d87 2084 if (context->directories[t].n_items > 0)
37c56f89
YW
2085 return true;
2086 }
2087 }
5d997827 2088
42b1d8e0 2089 if (context->dynamic_user &&
211a3d87
LB
2090 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2091 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2092 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2093 return true;
2094
91dd5f7c
LP
2095 if (context->log_namespace)
2096 return true;
2097
8b44a3d2
LP
2098 return false;
2099}
2100
5749f855 2101static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2102 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2103 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2104 _cleanup_close_ int unshare_ready_fd = -1;
2105 _cleanup_(sigkill_waitp) pid_t pid = 0;
2106 uint64_t c = 1;
d251207d
LP
2107 ssize_t n;
2108 int r;
2109
5749f855
AZ
2110 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2111 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2112 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2113 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2114 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2115 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2116 * continues execution normally.
2117 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2118 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2119
5749f855
AZ
2120 /* Can only set up multiple mappings with CAP_SETUID. */
2121 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2122 r = asprintf(&uid_map,
5749f855 2123 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2124 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2125 ouid, ouid, uid, uid);
2126 else
2127 r = asprintf(&uid_map,
2128 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2129 ouid, ouid);
d251207d 2130
5749f855
AZ
2131 if (r < 0)
2132 return -ENOMEM;
2133
2134 /* Can only set up multiple mappings with CAP_SETGID. */
2135 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2136 r = asprintf(&gid_map,
5749f855 2137 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2138 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2139 ogid, ogid, gid, gid);
2140 else
2141 r = asprintf(&gid_map,
2142 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2143 ogid, ogid);
2144
2145 if (r < 0)
2146 return -ENOMEM;
d251207d
LP
2147
2148 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2149 * namespace. */
2150 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2151 if (unshare_ready_fd < 0)
2152 return -errno;
2153
2154 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2155 * failed. */
2156 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2157 return -errno;
2158
4c253ed1
LP
2159 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2160 if (r < 0)
2161 return r;
2162 if (r == 0) {
d251207d
LP
2163 _cleanup_close_ int fd = -1;
2164 const char *a;
2165 pid_t ppid;
2166
2167 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2168 * here, after the parent opened its own user namespace. */
2169
2170 ppid = getppid();
2171 errno_pipe[0] = safe_close(errno_pipe[0]);
2172
2173 /* Wait until the parent unshared the user namespace */
2174 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2175 r = -errno;
2176 goto child_fail;
2177 }
2178
2179 /* Disable the setgroups() system call in the child user namespace, for good. */
2180 a = procfs_file_alloca(ppid, "setgroups");
2181 fd = open(a, O_WRONLY|O_CLOEXEC);
2182 if (fd < 0) {
2183 if (errno != ENOENT) {
2184 r = -errno;
2185 goto child_fail;
2186 }
2187
2188 /* If the file is missing the kernel is too old, let's continue anyway. */
2189 } else {
2190 if (write(fd, "deny\n", 5) < 0) {
2191 r = -errno;
2192 goto child_fail;
2193 }
2194
2195 fd = safe_close(fd);
2196 }
2197
2198 /* First write the GID map */
2199 a = procfs_file_alloca(ppid, "gid_map");
2200 fd = open(a, O_WRONLY|O_CLOEXEC);
2201 if (fd < 0) {
2202 r = -errno;
2203 goto child_fail;
2204 }
2205 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2206 r = -errno;
2207 goto child_fail;
2208 }
2209 fd = safe_close(fd);
2210
2211 /* The write the UID map */
2212 a = procfs_file_alloca(ppid, "uid_map");
2213 fd = open(a, O_WRONLY|O_CLOEXEC);
2214 if (fd < 0) {
2215 r = -errno;
2216 goto child_fail;
2217 }
2218 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2219 r = -errno;
2220 goto child_fail;
2221 }
2222
2223 _exit(EXIT_SUCCESS);
2224
2225 child_fail:
2226 (void) write(errno_pipe[1], &r, sizeof(r));
2227 _exit(EXIT_FAILURE);
2228 }
2229
2230 errno_pipe[1] = safe_close(errno_pipe[1]);
2231
2232 if (unshare(CLONE_NEWUSER) < 0)
2233 return -errno;
2234
2235 /* Let the child know that the namespace is ready now */
2236 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2237 return -errno;
2238
2239 /* Try to read an error code from the child */
2240 n = read(errno_pipe[0], &r, sizeof(r));
2241 if (n < 0)
2242 return -errno;
2243 if (n == sizeof(r)) { /* an error code was sent to us */
2244 if (r < 0)
2245 return r;
2246 return -EIO;
2247 }
2248 if (n != 0) /* on success we should have read 0 bytes */
2249 return -EIO;
2250
8f03de53 2251 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2252 if (r < 0)
2253 return r;
2e87a1fd 2254 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2255 return -EIO;
2256
2257 return 0;
2258}
2259
494d0247
YW
2260static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2261 if (!context->dynamic_user)
2262 return false;
2263
2264 if (type == EXEC_DIRECTORY_CONFIGURATION)
2265 return false;
2266
2267 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2268 return false;
2269
2270 return true;
2271}
2272
211a3d87
LB
2273static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2274 _cleanup_free_ char *src_abs = NULL;
2275 char **dst;
2276 int r;
2277
2278 assert(source);
2279
2280 src_abs = path_join(root, source);
2281 if (!src_abs)
2282 return -ENOMEM;
2283
2284 STRV_FOREACH(dst, symlinks) {
2285 _cleanup_free_ char *dst_abs = NULL;
2286
2287 dst_abs = path_join(root, *dst);
2288 if (!dst_abs)
2289 return -ENOMEM;
2290
2291 r = mkdir_parents_label(dst_abs, 0755);
2292 if (r < 0)
2293 return r;
2294
2295 r = symlink_idempotent(src_abs, dst_abs, true);
2296 if (r < 0)
2297 return r;
2298 }
2299
2300 return 0;
2301}
2302
3536f49e 2303static int setup_exec_directory(
07689d5d
LP
2304 const ExecContext *context,
2305 const ExecParameters *params,
2306 uid_t uid,
3536f49e 2307 gid_t gid,
3536f49e 2308 ExecDirectoryType type,
211a3d87 2309 bool needs_mount_namespace,
3536f49e 2310 int *exit_status) {
07689d5d 2311
72fd1768 2312 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2313 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2314 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2315 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2316 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2317 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2318 };
07689d5d
LP
2319 int r;
2320
2321 assert(context);
2322 assert(params);
72fd1768 2323 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2324 assert(exit_status);
07689d5d 2325
3536f49e
YW
2326 if (!params->prefix[type])
2327 return 0;
2328
8679efde 2329 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2330 if (!uid_is_valid(uid))
2331 uid = 0;
2332 if (!gid_is_valid(gid))
2333 gid = 0;
2334 }
2335
211a3d87 2336 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2337 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2338
211a3d87 2339 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2340 if (!p) {
2341 r = -ENOMEM;
2342 goto fail;
2343 }
07689d5d 2344
23a7448e
YW
2345 r = mkdir_parents_label(p, 0755);
2346 if (r < 0)
3536f49e 2347 goto fail;
23a7448e 2348
494d0247 2349 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2350 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2351 * case we want to avoid leaving a directory around fully accessible that is owned by
2352 * a dynamic user whose UID is later on reused. To lock this down we use the same
2353 * trick used by container managers to prohibit host users to get access to files of
2354 * the same UID in containers: we place everything inside a directory that has an
2355 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2356 * for unprivileged host code. We then use fs namespacing to make this directory
2357 * permeable for the service itself.
6c47cd7d 2358 *
3f5b1508
LP
2359 * Specifically: for a service which wants a special directory "foo/" we first create
2360 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2361 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2362 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2363 * unprivileged host users can't look into it. Inside of the namespace of the unit
2364 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2365 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2366 * for the service and making sure it only gets access to the dirs it needs but no
2367 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2368 *
3f5b1508
LP
2369 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2370 * to be owned by the service itself.
2371 *
2372 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2373 * for sharing files or sockets with other services. */
6c47cd7d 2374
4ede9802
LP
2375 pp = path_join(params->prefix[type], "private");
2376 if (!pp) {
6c47cd7d
LP
2377 r = -ENOMEM;
2378 goto fail;
2379 }
2380
2381 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2382 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2383 if (r < 0)
2384 goto fail;
2385
211a3d87 2386 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2387 r = -ENOMEM;
2388 goto fail;
2389 }
2390
2391 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2392 r = mkdir_parents_label(pp, 0755);
2393 if (r < 0)
2394 goto fail;
2395
949befd3
LP
2396 if (is_dir(p, false) > 0 &&
2397 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2398
2399 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2400 * it over. Most likely the service has been upgraded from one that didn't use
2401 * DynamicUser=1, to one that does. */
2402
cf52c45d
LP
2403 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2404 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2405 exec_directory_type_to_string(type), p, pp);
2406
949befd3
LP
2407 if (rename(p, pp) < 0) {
2408 r = -errno;
2409 goto fail;
2410 }
2411 } else {
2412 /* Otherwise, create the actual directory for the service */
2413
2414 r = mkdir_label(pp, context->directories[type].mode);
2415 if (r < 0 && r != -EEXIST)
2416 goto fail;
2417 }
6c47cd7d 2418
df61e79a
LB
2419 /* And link it up from the original place. Note that if a mount namespace is going to be
2420 * used, then this symlink remains on the host, and a new one for the child namespace will
2421 * be created later. */
6c9c51e5 2422 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2423 if (r < 0)
2424 goto fail;
2425
6c47cd7d 2426 } else {
5c6d40d1
LP
2427 _cleanup_free_ char *target = NULL;
2428
2429 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2430 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2431 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2432
2433 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2434 * by DynamicUser=1 (see above)?
2435 *
2436 * We do this for all directory types except for ConfigurationDirectory=,
2437 * since they all support the private/ symlink logic at least in some
2438 * configurations, see above. */
5c6d40d1 2439
578dc69f
YW
2440 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2441 if (r < 0)
2442 goto fail;
2443
211a3d87 2444 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2445 if (!q) {
2446 r = -ENOMEM;
2447 goto fail;
2448 }
2449
578dc69f
YW
2450 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2451 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2452 if (r < 0)
2453 goto fail;
2454
2455 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2456
2457 /* Hmm, apparently DynamicUser= was once turned on for this service,
2458 * but is no longer. Let's move the directory back up. */
2459
cf52c45d
LP
2460 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2461 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2462 exec_directory_type_to_string(type), q, p);
2463
5c6d40d1
LP
2464 if (unlink(p) < 0) {
2465 r = -errno;
2466 goto fail;
2467 }
2468
2469 if (rename(q, p) < 0) {
2470 r = -errno;
2471 goto fail;
2472 }
2473 }
2474 }
2475
6c47cd7d 2476 r = mkdir_label(p, context->directories[type].mode);
d484580c 2477 if (r < 0) {
d484580c
LP
2478 if (r != -EEXIST)
2479 goto fail;
2480
206e9864
LP
2481 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2482 struct stat st;
2483
2484 /* Don't change the owner/access mode of the configuration directory,
2485 * as in the common case it is not written to by a service, and shall
2486 * not be writable. */
2487
2488 if (stat(p, &st) < 0) {
2489 r = -errno;
2490 goto fail;
2491 }
2492
2493 /* Still complain if the access mode doesn't match */
2494 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2495 log_warning("%s \'%s\' already exists but the mode is different. "
2496 "(File system: %o %sMode: %o)",
211a3d87 2497 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2498 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2499
6cff72eb 2500 continue;
206e9864 2501 }
6cff72eb 2502 }
a1164ae3 2503 }
07689d5d 2504
206e9864 2505 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2506 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2507 * current UID/GID ownership.) */
2508 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2509 if (r < 0)
2510 goto fail;
c71b2eb7 2511
607b358e
LP
2512 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2513 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2514 * assignments to exist. */
607b358e 2515 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2516 if (r < 0)
3536f49e 2517 goto fail;
07689d5d
LP
2518 }
2519
211a3d87
LB
2520 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2521 * they are set up later, to allow configuring empty var/run/etc. */
2522 if (!needs_mount_namespace)
2523 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2524 r = create_many_symlinks(params->prefix[type],
2525 context->directories[type].items[i].path,
2526 context->directories[type].items[i].symlinks);
2527 if (r < 0)
2528 goto fail;
2529 }
2530
07689d5d 2531 return 0;
3536f49e
YW
2532
2533fail:
2534 *exit_status = exit_status_table[type];
3536f49e 2535 return r;
07689d5d
LP
2536}
2537
bb0c0d6f
LP
2538static int write_credential(
2539 int dfd,
2540 const char *id,
2541 const void *data,
2542 size_t size,
2543 uid_t uid,
2544 bool ownership_ok) {
2545
2546 _cleanup_(unlink_and_freep) char *tmp = NULL;
2547 _cleanup_close_ int fd = -1;
2548 int r;
2549
2550 r = tempfn_random_child("", "cred", &tmp);
2551 if (r < 0)
2552 return r;
2553
2554 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2555 if (fd < 0) {
2556 tmp = mfree(tmp);
2557 return -errno;
2558 }
2559
43144be4 2560 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2561 if (r < 0)
2562 return r;
2563
2564 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2565 return -errno;
2566
2567 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2568 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2569 if (r < 0) {
2570 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2571 return r;
2572
2573 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2574 * to express: that the user gets read access and nothing
2575 * else. But if the backing fs can't support that (e.g. ramfs)
2576 * then we can use file ownership instead. But that's only safe if
2577 * we can then re-mount the whole thing read-only, so that the
2578 * user can no longer chmod() the file to gain write access. */
2579 return r;
2580
f5fbe71d 2581 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2582 return -errno;
2583 }
2584 }
2585
2586 if (renameat(dfd, tmp, dfd, id) < 0)
2587 return -errno;
2588
2589 tmp = mfree(tmp);
2590 return 0;
2591}
2592
bb0c0d6f
LP
2593static int acquire_credentials(
2594 const ExecContext *context,
2595 const ExecParameters *params,
d3dcf4e3 2596 const char *unit,
bb0c0d6f
LP
2597 const char *p,
2598 uid_t uid,
2599 bool ownership_ok) {
2600
43144be4 2601 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2602 _cleanup_close_ int dfd = -1;
43144be4 2603 ExecLoadCredential *lc;
bb0c0d6f 2604 ExecSetCredential *sc;
bb0c0d6f
LP
2605 int r;
2606
2607 assert(context);
2608 assert(p);
2609
2610 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2611 if (dfd < 0)
2612 return -errno;
2613
43144be4
LP
2614 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2615 HASHMAP_FOREACH(lc, context->load_credentials) {
2616 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
bb0c0d6f 2617 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2618 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2619 bool missing_ok = true;
bb0c0d6f
LP
2620 const char *source;
2621 size_t size, add;
2622
43144be4 2623 if (path_is_absolute(lc->path)) {
bb0c0d6f 2624 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
43144be4 2625 source = lc->path;
bb0c0d6f 2626 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2627
2628 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2629 * via the source socket address in case we read off an AF_UNIX socket. */
43144be4 2630 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
d3dcf4e3
LP
2631 return -ENOMEM;
2632
fc682be2
LP
2633 missing_ok = false;
2634
bb0c0d6f
LP
2635 } else if (params->received_credentials) {
2636 /* If this is a relative path, take it relative to the credentials we received
2637 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2638 * on a credential store, i.e. this is guaranteed to be regular files. */
43144be4 2639 j = path_join(params->received_credentials, lc->path);
bb0c0d6f
LP
2640 if (!j)
2641 return -ENOMEM;
2642
2643 source = j;
2644 } else
2645 source = NULL;
2646
2647 if (source)
43144be4
LP
2648 r = read_full_file_full(
2649 AT_FDCWD, source,
2650 UINT64_MAX,
2651 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2652 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2653 bindname,
2654 &data, &size);
bb0c0d6f
LP
2655 else
2656 r = -ENOENT;
43144be4 2657 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
fc682be2
LP
2658 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2659 * will get clear errors if we don't pass such a missing credential on as they
2660 * themselves will get ENOENT when trying to read them, which should not be much
2661 * worse than when we handle the error here and make it fatal.
2662 *
43144be4
LP
2663 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2664 * we are fine, too. */
2665 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
bb0c0d6f 2666 continue;
fc682be2 2667 }
bb0c0d6f 2668 if (r < 0)
43144be4
LP
2669 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2670
2671 if (lc->encrypted) {
2672 _cleanup_free_ void *plaintext = NULL;
2673 size_t plaintext_size = 0;
2674
2675 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2676 if (r < 0)
2677 return r;
bb0c0d6f 2678
43144be4
LP
2679 free_and_replace(data, plaintext);
2680 size = plaintext_size;
2681 }
2682
2683 add = strlen(lc->id) + size;
bb0c0d6f
LP
2684 if (add > left)
2685 return -E2BIG;
2686
43144be4 2687 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
bb0c0d6f
LP
2688 if (r < 0)
2689 return r;
2690
2691 left -= add;
2692 }
2693
43144be4
LP
2694 /* First we use the literally specified credentials. Note that they might be overridden again below,
2695 * and thus act as a "default" if the same credential is specified multiple times */
2696 HASHMAP_FOREACH(sc, context->set_credentials) {
2697 _cleanup_(erase_and_freep) void *plaintext = NULL;
2698 const char *data;
2699 size_t size, add;
2700
2701 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2702 continue;
2703 if (errno != ENOENT)
2704 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2705
2706 if (sc->encrypted) {
2707 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2708 if (r < 0)
2709 return r;
2710
2711 data = plaintext;
2712 } else {
2713 data = sc->data;
2714 size = sc->size;
2715 }
2716
2717 add = strlen(sc->id) + size;
2718 if (add > left)
2719 return -E2BIG;
2720
2721 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2722 if (r < 0)
2723 return r;
2724
2725
2726 left -= add;
2727 }
2728
bb0c0d6f
LP
2729 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2730 return -errno;
2731
2732 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2733 * accessible */
2734
2735 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2736 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2737 if (r < 0) {
2738 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2739 return r;
2740
2741 if (!ownership_ok)
2742 return r;
2743
f5fbe71d 2744 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2745 return -errno;
2746 }
2747 }
2748
2749 return 0;
2750}
2751
2752static int setup_credentials_internal(
2753 const ExecContext *context,
2754 const ExecParameters *params,
d3dcf4e3 2755 const char *unit,
bb0c0d6f
LP
2756 const char *final, /* This is where the credential store shall eventually end up at */
2757 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2758 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2759 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2760 uid_t uid) {
2761
2762 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2763 * if we mounted something; false if we definitely can't mount anything */
2764 bool final_mounted;
2765 const char *where;
2766
2767 assert(context);
2768 assert(final);
2769 assert(workspace);
2770
2771 if (reuse_workspace) {
2772 r = path_is_mount_point(workspace, NULL, 0);
2773 if (r < 0)
2774 return r;
2775 if (r > 0)
2776 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2777 else
2778 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2779 } else
2780 workspace_mounted = -1; /* ditto */
2781
2782 r = path_is_mount_point(final, NULL, 0);
2783 if (r < 0)
2784 return r;
2785 if (r > 0) {
2786 /* If the final place already has something mounted, we use that. If the workspace also has
2787 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2788 * different). */
2789 final_mounted = true;
2790
2791 if (workspace_mounted < 0) {
2792 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2793 * the final version to the workspace, and make it writable, so that we can make
2794 * changes */
2795
21935150
LP
2796 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2797 if (r < 0)
2798 return r;
bb0c0d6f 2799
21935150
LP
2800 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2801 if (r < 0)
2802 return r;
bb0c0d6f
LP
2803
2804 workspace_mounted = true;
2805 }
2806 } else
2807 final_mounted = false;
2808
2809 if (workspace_mounted < 0) {
2810 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2811 for (int try = 0;; try++) {
2812
2813 if (try == 0) {
2814 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
2815 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2816 if (r >= 0) {
bb0c0d6f
LP
2817 workspace_mounted = true;
2818 break;
2819 }
2820
2821 } else if (try == 1) {
2822 _cleanup_free_ char *opts = NULL;
2823
43144be4 2824 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
2825 return -ENOMEM;
2826
2827 /* Fall back to "tmpfs" otherwise */
21935150
LP
2828 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2829 if (r >= 0) {
bb0c0d6f
LP
2830 workspace_mounted = true;
2831 break;
2832 }
2833
2834 } else {
2835 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
2836 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2837 if (r < 0) {
2838 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2839 return r;
bb0c0d6f
LP
2840
2841 if (must_mount) /* If we it's not OK to use the plain directory
2842 * fallback, propagate all errors too */
21935150 2843 return r;
bb0c0d6f
LP
2844
2845 /* If we lack privileges to bind mount stuff, then let's gracefully
2846 * proceed for compat with container envs, and just use the final dir
2847 * as is. */
2848
2849 workspace_mounted = false;
2850 break;
2851 }
2852
2853 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
2854 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2855 if (r < 0)
2856 return r;
bb0c0d6f
LP
2857
2858 workspace_mounted = true;
2859 break;
2860 }
2861 }
2862 }
2863
2864 assert(!must_mount || workspace_mounted > 0);
2865 where = workspace_mounted ? workspace : final;
2866
e3a0a862
CG
2867 (void) label_fix_container(where, final, 0);
2868
d3dcf4e3 2869 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
2870 if (r < 0)
2871 return r;
2872
2873 if (workspace_mounted) {
2874 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
2875 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2876 if (r < 0)
2877 return r;
bb0c0d6f
LP
2878
2879 /* And mount it to the final place, read-only */
21935150
LP
2880 if (final_mounted)
2881 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2882 else
2883 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2884 if (r < 0)
2885 return r;
bb0c0d6f
LP
2886 } else {
2887 _cleanup_free_ char *parent = NULL;
2888
2889 /* If we do not have our own mount put used the plain directory fallback, then we need to
2890 * open access to the top-level credential directory and the per-service directory now */
2891
2892 parent = dirname_malloc(final);
2893 if (!parent)
2894 return -ENOMEM;
2895 if (chmod(parent, 0755) < 0)
2896 return -errno;
2897 }
2898
2899 return 0;
2900}
2901
2902static int setup_credentials(
2903 const ExecContext *context,
2904 const ExecParameters *params,
2905 const char *unit,
2906 uid_t uid) {
2907
2908 _cleanup_free_ char *p = NULL, *q = NULL;
2909 const char *i;
2910 int r;
2911
2912 assert(context);
2913 assert(params);
2914
2915 if (!exec_context_has_credentials(context))
2916 return 0;
2917
2918 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2919 return -EINVAL;
2920
2921 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2922 * and the subdir we mount over with a read-only file system readable by the service's user */
2923 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2924 if (!q)
2925 return -ENOMEM;
2926
2927 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2928 if (r < 0 && r != -EEXIST)
2929 return r;
2930
2931 p = path_join(q, unit);
2932 if (!p)
2933 return -ENOMEM;
2934
2935 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2936 if (r < 0 && r != -EEXIST)
2937 return r;
2938
2939 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2940 if (r < 0) {
2941 _cleanup_free_ char *t = NULL, *u = NULL;
2942
2943 /* If this is not a privilege or support issue then propagate the error */
2944 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2945 return r;
2946
2947 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2948 * it into place, so that users can't access half-initialized credential stores. */
2949 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2950 if (!t)
2951 return -ENOMEM;
2952
2953 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2954 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2955 * after it is fully set up */
2956 u = path_join(t, unit);
2957 if (!u)
2958 return -ENOMEM;
2959
2960 FOREACH_STRING(i, t, u) {
2961 r = mkdir_label(i, 0700);
2962 if (r < 0 && r != -EEXIST)
2963 return r;
2964 }
2965
2966 r = setup_credentials_internal(
2967 context,
2968 params,
d3dcf4e3 2969 unit,
bb0c0d6f
LP
2970 p, /* final mount point */
2971 u, /* temporary workspace to overmount */
2972 true, /* reuse the workspace if it is already a mount */
2973 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2974 uid);
2975
2976 (void) rmdir(u); /* remove the workspace again if we can. */
2977
2978 if (r < 0)
2979 return r;
2980
2981 } else if (r == 0) {
2982
2983 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2984 * we can use the same directory for all cases, after turning off propagation. Question
2985 * though is: where do we turn off propagation exactly, and where do we place the workspace
2986 * directory? We need some place that is guaranteed to be a mount point in the host, and
2987 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2988 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2989 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2990 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2991 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2992 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2993 * propagation on the former, and then overmount the latter.
2994 *
2995 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2996 * for this purpose, but there are few other candidates that work equally well for us, and
2997 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 2998 * that no one else sees this should be OK to do. */
bb0c0d6f 2999
21935150
LP
3000 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3001 if (r < 0)
bb0c0d6f
LP
3002 goto child_fail;
3003
3004 r = setup_credentials_internal(
3005 context,
3006 params,
d3dcf4e3 3007 unit,
bb0c0d6f
LP
3008 p, /* final mount point */
3009 "/dev/shm", /* temporary workspace to overmount */
3010 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3011 true, /* insist that something is mounted, do not allow fallback to plain directory */
3012 uid);
3013 if (r < 0)
3014 goto child_fail;
3015
3016 _exit(EXIT_SUCCESS);
3017
3018 child_fail:
3019 _exit(EXIT_FAILURE);
3020 }
3021
3022 return 0;
3023}
3024
92b423b9 3025#if ENABLE_SMACK
cefc33ae
LP
3026static int setup_smack(
3027 const ExecContext *context,
b83d5050 3028 int executable_fd) {
cefc33ae
LP
3029 int r;
3030
3031 assert(context);
b83d5050 3032 assert(executable_fd >= 0);
cefc33ae 3033
cefc33ae
LP
3034 if (context->smack_process_label) {
3035 r = mac_smack_apply_pid(0, context->smack_process_label);
3036 if (r < 0)
3037 return r;
3038 }
3039#ifdef SMACK_DEFAULT_PROCESS_LABEL
3040 else {
3041 _cleanup_free_ char *exec_label = NULL;
3042
b83d5050 3043 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
4c701096 3044 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
cefc33ae
LP
3045 return r;
3046
3047 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
3048 if (r < 0)
3049 return r;
3050 }
cefc33ae
LP
3051#endif
3052
3053 return 0;
3054}
92b423b9 3055#endif
cefc33ae 3056
6c47cd7d
LP
3057static int compile_bind_mounts(
3058 const ExecContext *context,
3059 const ExecParameters *params,
3060 BindMount **ret_bind_mounts,
da6053d0 3061 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3062 char ***ret_empty_directories) {
3063
3064 _cleanup_strv_free_ char **empty_directories = NULL;
3065 BindMount *bind_mounts;
5b10116e 3066 size_t n, h = 0;
6c47cd7d
LP
3067 int r;
3068
3069 assert(context);
3070 assert(params);
3071 assert(ret_bind_mounts);
3072 assert(ret_n_bind_mounts);
3073 assert(ret_empty_directories);
3074
3075 n = context->n_bind_mounts;
5b10116e 3076 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3077 if (!params->prefix[t])
3078 continue;
3079
211a3d87 3080 n += context->directories[t].n_items;
6c47cd7d
LP
3081 }
3082
3083 if (n <= 0) {
3084 *ret_bind_mounts = NULL;
3085 *ret_n_bind_mounts = 0;
3086 *ret_empty_directories = NULL;
3087 return 0;
3088 }
3089
3090 bind_mounts = new(BindMount, n);
3091 if (!bind_mounts)
3092 return -ENOMEM;
3093
5b10116e 3094 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3095 BindMount *item = context->bind_mounts + i;
3096 char *s, *d;
3097
3098 s = strdup(item->source);
3099 if (!s) {
3100 r = -ENOMEM;
3101 goto finish;
3102 }
3103
3104 d = strdup(item->destination);
3105 if (!d) {
3106 free(s);
3107 r = -ENOMEM;
3108 goto finish;
3109 }
3110
3111 bind_mounts[h++] = (BindMount) {
3112 .source = s,
3113 .destination = d,
3114 .read_only = item->read_only,
3115 .recursive = item->recursive,
3116 .ignore_enoent = item->ignore_enoent,
3117 };
3118 }
3119
5b10116e 3120 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3121 if (!params->prefix[t])
3122 continue;
3123
211a3d87 3124 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3125 continue;
3126
494d0247 3127 if (exec_directory_is_private(context, t) &&
74e12520 3128 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3129 char *private_root;
3130
3131 /* So this is for a dynamic user, and we need to make sure the process can access its own
3132 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3133 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3134
657ee2d8 3135 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3136 if (!private_root) {
3137 r = -ENOMEM;
3138 goto finish;
3139 }
3140
3141 r = strv_consume(&empty_directories, private_root);
a635a7ae 3142 if (r < 0)
6c47cd7d 3143 goto finish;
6c47cd7d
LP
3144 }
3145
211a3d87 3146 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3147 char *s, *d;
3148
494d0247 3149 if (exec_directory_is_private(context, t))
211a3d87 3150 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3151 else
211a3d87 3152 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3153 if (!s) {
3154 r = -ENOMEM;
3155 goto finish;
3156 }
3157
494d0247 3158 if (exec_directory_is_private(context, t) &&
74e12520 3159 exec_context_with_rootfs(context))
5609f688
YW
3160 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3161 * directory is not created on the root directory. So, let's bind-mount the directory
3162 * on the 'non-private' place. */
211a3d87 3163 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3164 else
3165 d = strdup(s);
6c47cd7d
LP
3166 if (!d) {
3167 free(s);
3168 r = -ENOMEM;
3169 goto finish;
3170 }
3171
3172 bind_mounts[h++] = (BindMount) {
3173 .source = s,
3174 .destination = d,
3175 .read_only = false,
9ce4e4b0 3176 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3177 .recursive = true,
3178 .ignore_enoent = false,
3179 };
3180 }
3181 }
3182
3183 assert(h == n);
3184
3185 *ret_bind_mounts = bind_mounts;
3186 *ret_n_bind_mounts = n;
ae2a15bc 3187 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3188
3189 return (int) n;
3190
3191finish:
3192 bind_mount_free_many(bind_mounts, h);
3193 return r;
3194}
3195
df61e79a
LB
3196/* ret_symlinks will contain a list of pairs src:dest that describes
3197 * the symlinks to create later on. For example, the symlinks needed
3198 * to safely give private directories to DynamicUser=1 users. */
3199static int compile_symlinks(
3200 const ExecContext *context,
3201 const ExecParameters *params,
3202 char ***ret_symlinks) {
3203
3204 _cleanup_strv_free_ char **symlinks = NULL;
3205 int r;
3206
3207 assert(context);
3208 assert(params);
3209 assert(ret_symlinks);
3210
3211 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3212 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3213 _cleanup_free_ char *private_path = NULL, *path = NULL;
3214 char **symlink;
df61e79a 3215
211a3d87
LB
3216 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3217 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3218
211a3d87
LB
3219 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3220 dst_abs = path_join(params->prefix[dt], *symlink);
3221 if (!src_abs || !dst_abs)
3222 return -ENOMEM;
df61e79a 3223
211a3d87
LB
3224 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3225 if (r < 0)
3226 return r;
3227 }
3228
3229 if (!exec_directory_is_private(context, dt))
3230 continue;
3231
3232 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3233 if (!private_path)
3234 return -ENOMEM;
3235
211a3d87 3236 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3237 if (!path)
3238 return -ENOMEM;
3239
3240 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3241 if (r < 0)
3242 return r;
3243 }
3244 }
3245
3246 *ret_symlinks = TAKE_PTR(symlinks);
3247
3248 return 0;
3249}
3250
4e677599
LP
3251static bool insist_on_sandboxing(
3252 const ExecContext *context,
3253 const char *root_dir,
3254 const char *root_image,
3255 const BindMount *bind_mounts,
3256 size_t n_bind_mounts) {
3257
4e677599
LP
3258 assert(context);
3259 assert(n_bind_mounts == 0 || bind_mounts);
3260
3261 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3262 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3263 * rearrange stuff in a way we cannot ignore gracefully. */
3264
3265 if (context->n_temporary_filesystems > 0)
3266 return true;
3267
3268 if (root_dir || root_image)
3269 return true;
3270
b3d13314
LB
3271 if (context->n_mount_images > 0)
3272 return true;
3273
4e677599
LP
3274 if (context->dynamic_user)
3275 return true;
3276
3277 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3278 * essential. */
5b10116e 3279 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3280 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3281 return true;
3282
91dd5f7c
LP
3283 if (context->log_namespace)
3284 return true;
3285
4e677599
LP
3286 return false;
3287}
3288
6818c54c 3289static int apply_mount_namespace(
34cf6c43 3290 const Unit *u,
9f71ba8d 3291 ExecCommandFlags command_flags,
6818c54c
LP
3292 const ExecContext *context,
3293 const ExecParameters *params,
7cc5ef5f
ZJS
3294 const ExecRuntime *runtime,
3295 char **error_path) {
6818c54c 3296
df61e79a 3297 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3298 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3299 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3300 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3301 NamespaceInfo ns_info;
165a31c0 3302 bool needs_sandboxing;
6c47cd7d 3303 BindMount *bind_mounts = NULL;
da6053d0 3304 size_t n_bind_mounts = 0;
6818c54c 3305 int r;
93c6bb51 3306
2b3c1b9e
DH
3307 assert(context);
3308
915e6d16
LP
3309 if (params->flags & EXEC_APPLY_CHROOT) {
3310 root_image = context->root_image;
3311
3312 if (!root_image)
3313 root_dir = context->root_directory;
3314 }
93c6bb51 3315
6c47cd7d
LP
3316 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3317 if (r < 0)
3318 return r;
3319
211a3d87 3320 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3321 r = compile_symlinks(context, params, &symlinks);
3322 if (r < 0)
3323 return r;
3324
9f71ba8d 3325 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3326 if (needs_sandboxing) {
3327 /* The runtime struct only contains the parent of the private /tmp,
3328 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3329 * that is sticky, and that's the one we want to use here.
3330 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3331
3332 if (context->private_tmp && runtime) {
56a13a49
ZJS
3333 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3334 tmp_dir = runtime->tmp_dir;
3335 else if (runtime->tmp_dir)
3336 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3337
3338 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3339 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3340 else if (runtime->var_tmp_dir)
56a13a49 3341 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3342 }
3343
b5a33299
YW
3344 ns_info = (NamespaceInfo) {
3345 .ignore_protect_paths = false,
3346 .private_dev = context->private_devices,
3347 .protect_control_groups = context->protect_control_groups,
3348 .protect_kernel_tunables = context->protect_kernel_tunables,
3349 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3350 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3351 .protect_hostname = context->protect_hostname,
5e98086d 3352 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3353 .private_mounts = context->private_mounts,
52b3d652
LP
3354 .protect_home = context->protect_home,
3355 .protect_system = context->protect_system,
4e399953
LP
3356 .protect_proc = context->protect_proc,
3357 .proc_subset = context->proc_subset,
80271a44 3358 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3359 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3360 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3361 };
ecf63c91 3362 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3363 /*
3364 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3365 * sandbox info, otherwise enforce it, don't ignore protected paths and
3366 * fail if we are enable to apply the sandbox inside the mount namespace.
3367 */
3368 ns_info = (NamespaceInfo) {
3369 .ignore_protect_paths = true,
3370 };
3371 else
3372 ns_info = (NamespaceInfo) {};
b5a33299 3373
37ed15d7
FB
3374 if (context->mount_flags == MS_SHARED)
3375 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3376
a631cbfa
LP
3377 if (exec_context_has_credentials(context) &&
3378 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3379 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3380 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3381 if (!creds_path) {
3382 r = -ENOMEM;
3383 goto finalize;
3384 }
bbb4e7f3
LP
3385 }
3386
5e8deb94
LB
3387 if (MANAGER_IS_SYSTEM(u->manager)) {
3388 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3389 if (!propagate_dir) {
3390 r = -ENOMEM;
3391 goto finalize;
3392 }
3393
5e8deb94 3394 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3395 if (!incoming_dir) {
3396 r = -ENOMEM;
3397 goto finalize;
3398 }
5e8deb94
LB
3399 }
3400
18d73705 3401 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3402 &ns_info, context->read_write_paths,
165a31c0
LP
3403 needs_sandboxing ? context->read_only_paths : NULL,
3404 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3405 needs_sandboxing ? context->exec_paths : NULL,
3406 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3407 empty_directories,
df61e79a 3408 symlinks,
6c47cd7d
LP
3409 bind_mounts,
3410 n_bind_mounts,
2abd4e38
YW
3411 context->temporary_filesystems,
3412 context->n_temporary_filesystems,
b3d13314
LB
3413 context->mount_images,
3414 context->n_mount_images,
56a13a49
ZJS
3415 tmp_dir,
3416 var_tmp_dir,
bbb4e7f3 3417 creds_path,
91dd5f7c 3418 context->log_namespace,
915e6d16 3419 context->mount_flags,
d4d55b0d
LB
3420 context->root_hash, context->root_hash_size, context->root_hash_path,
3421 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3422 context->root_verity,
93f59701
LB
3423 context->extension_images,
3424 context->n_extension_images,
5e8deb94
LB
3425 propagate_dir,
3426 incoming_dir,
3bdc25a4 3427 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3428 error_path);
93c6bb51 3429
1beab8b0 3430 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3431 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3432 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3433 * completely different execution environment. */
aca835ed 3434 if (r == -ENOANO) {
4e677599
LP
3435 if (insist_on_sandboxing(
3436 context,
3437 root_dir, root_image,
3438 bind_mounts,
3439 n_bind_mounts)) {
3440 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3441 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3442 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3443
3444 r = -EOPNOTSUPP;
3445 } else {
aca835ed 3446 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3447 r = 0;
aca835ed 3448 }
93c6bb51
DH
3449 }
3450
8062e643 3451finalize:
4e677599 3452 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3453 return r;
3454}
3455
915e6d16
LP
3456static int apply_working_directory(
3457 const ExecContext *context,
3458 const ExecParameters *params,
3459 const char *home,
376fecf6 3460 int *exit_status) {
915e6d16 3461
6732edab 3462 const char *d, *wd;
2b3c1b9e
DH
3463
3464 assert(context);
376fecf6 3465 assert(exit_status);
2b3c1b9e 3466
6732edab
LP
3467 if (context->working_directory_home) {
3468
376fecf6
LP
3469 if (!home) {
3470 *exit_status = EXIT_CHDIR;
6732edab 3471 return -ENXIO;
376fecf6 3472 }
6732edab 3473
2b3c1b9e 3474 wd = home;
6732edab 3475
14eb3285
LP
3476 } else
3477 wd = empty_to_root(context->working_directory);
e7f1e7c6 3478
fa97f630 3479 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3480 d = wd;
fa97f630 3481 else
3b0e5bb5 3482 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3483
376fecf6
LP
3484 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3485 *exit_status = EXIT_CHDIR;
2b3c1b9e 3486 return -errno;
376fecf6 3487 }
e7f1e7c6
DH
3488
3489 return 0;
3490}
3491
fa97f630
JB
3492static int apply_root_directory(
3493 const ExecContext *context,
3494 const ExecParameters *params,
3495 const bool needs_mount_ns,
3496 int *exit_status) {
3497
3498 assert(context);
3499 assert(exit_status);
3500
5b10116e 3501 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3502 if (!needs_mount_ns && context->root_directory)
3503 if (chroot(context->root_directory) < 0) {
3504 *exit_status = EXIT_CHROOT;
3505 return -errno;
3506 }
fa97f630
JB
3507
3508 return 0;
3509}
3510
b1edf445 3511static int setup_keyring(
34cf6c43 3512 const Unit *u,
b1edf445
LP
3513 const ExecContext *context,
3514 const ExecParameters *p,
3515 uid_t uid, gid_t gid) {
3516
74dd6b51 3517 key_serial_t keyring;
e64c2d0b
DJL
3518 int r = 0;
3519 uid_t saved_uid;
3520 gid_t saved_gid;
74dd6b51
LP
3521
3522 assert(u);
b1edf445 3523 assert(context);
74dd6b51
LP
3524 assert(p);
3525
3526 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3527 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3528 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3529 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3530 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3531 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3532
b1edf445
LP
3533 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3534 return 0;
3535
e64c2d0b
DJL
3536 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3537 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3538 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3539 * & group is just as nasty as acquiring a reference to the user keyring. */
3540
3541 saved_uid = getuid();
3542 saved_gid = getgid();
3543
3544 if (gid_is_valid(gid) && gid != saved_gid) {
3545 if (setregid(gid, -1) < 0)
3546 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3547 }
3548
3549 if (uid_is_valid(uid) && uid != saved_uid) {
3550 if (setreuid(uid, -1) < 0) {
3551 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3552 goto out;
3553 }
3554 }
3555
74dd6b51
LP
3556 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3557 if (keyring == -1) {
3558 if (errno == ENOSYS)
8002fb97 3559 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3560 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3561 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3562 else if (errno == EDQUOT)
8002fb97 3563 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3564 else
e64c2d0b 3565 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3566
e64c2d0b 3567 goto out;
74dd6b51
LP
3568 }
3569
e64c2d0b
DJL
3570 /* When requested link the user keyring into the session keyring. */
3571 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3572
3573 if (keyctl(KEYCTL_LINK,
3574 KEY_SPEC_USER_KEYRING,
3575 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3576 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3577 goto out;
3578 }
3579 }
3580
3581 /* Restore uid/gid back */
3582 if (uid_is_valid(uid) && uid != saved_uid) {
3583 if (setreuid(saved_uid, -1) < 0) {
3584 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3585 goto out;
3586 }
3587 }
3588
3589 if (gid_is_valid(gid) && gid != saved_gid) {
3590 if (setregid(saved_gid, -1) < 0)
3591 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3592 }
3593
3594 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3595 if (!sd_id128_is_null(u->invocation_id)) {
3596 key_serial_t key;
3597
3598 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3599 if (key == -1)
8002fb97 3600 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3601 else {
3602 if (keyctl(KEYCTL_SETPERM, key,
3603 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3604 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3605 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3606 }
3607 }
3608
e64c2d0b 3609out:
37b22b3b 3610 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3611 /* no extra logging, as only the first already reported error matters */
3612 if (getuid() != saved_uid)
3613 (void) setreuid(saved_uid, -1);
b1edf445 3614
e64c2d0b
DJL
3615 if (getgid() != saved_gid)
3616 (void) setregid(saved_gid, -1);
b1edf445 3617
e64c2d0b 3618 return r;
74dd6b51
LP
3619}
3620
/* Appends the valid (non-negative) file descriptors of a two-element socket pair to 'array', in order,
 * advancing the element counter *n for every descriptor stored. The caller must make sure that 'array'
 * has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                (*n)++;
        }
}
3631
a34ceba6
LP
3632static int close_remaining_fds(
3633 const ExecParameters *params,
34cf6c43
YW
3634 const ExecRuntime *runtime,
3635 const DynamicCreds *dcreds,
00d9ef85 3636 int user_lookup_fd,
a34ceba6 3637 int socket_fd,
5b8d1f6b 3638 const int *fds, size_t n_fds) {
a34ceba6 3639
da6053d0 3640 size_t n_dont_close = 0;
00d9ef85 3641 int dont_close[n_fds + 12];
a34ceba6
LP
3642
3643 assert(params);
3644
3645 if (params->stdin_fd >= 0)
3646 dont_close[n_dont_close++] = params->stdin_fd;
3647 if (params->stdout_fd >= 0)
3648 dont_close[n_dont_close++] = params->stdout_fd;
3649 if (params->stderr_fd >= 0)
3650 dont_close[n_dont_close++] = params->stderr_fd;
3651
3652 if (socket_fd >= 0)
3653 dont_close[n_dont_close++] = socket_fd;
3654 if (n_fds > 0) {
3655 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3656 n_dont_close += n_fds;
3657 }
3658
a70581ff 3659 if (runtime) {
29206d46 3660 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3661 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3662 }
29206d46
LP
3663
3664 if (dcreds) {
3665 if (dcreds->user)
3666 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3667 if (dcreds->group)
3668 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3669 }
3670
00d9ef85
LP
3671 if (user_lookup_fd >= 0)
3672 dont_close[n_dont_close++] = user_lookup_fd;
3673
a34ceba6
LP
3674 return close_all_fds(dont_close, n_dont_close);
3675}
3676
00d9ef85
LP
3677static int send_user_lookup(
3678 Unit *unit,
3679 int user_lookup_fd,
3680 uid_t uid,
3681 gid_t gid) {
3682
3683 assert(unit);
3684
3685 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3686 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3687 * specified. */
3688
3689 if (user_lookup_fd < 0)
3690 return 0;
3691
3692 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3693 return 0;
3694
3695 if (writev(user_lookup_fd,
3696 (struct iovec[]) {
e6a7ec4b
LP
3697 IOVEC_INIT(&uid, sizeof(uid)),
3698 IOVEC_INIT(&gid, sizeof(gid)),
3699 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3700 return -errno;
3701
3702 return 0;
3703}
3704
6732edab
LP
3705static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3706 int r;
3707
3708 assert(c);
3709 assert(home);
3710 assert(buf);
3711
3712 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3713
3714 if (*home)
3715 return 0;
3716
3717 if (!c->working_directory_home)
3718 return 0;
3719
6732edab
LP
3720 r = get_home_dir(buf);
3721 if (r < 0)
3722 return r;
3723
3724 *home = *buf;
3725 return 1;
3726}
3727
da50b85a
LP
3728static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3729 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3730 int r;
3731
3732 assert(c);
3733 assert(p);
3734 assert(ret);
3735
3736 assert(c->dynamic_user);
3737
3738 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3739 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3740 * directories. */
3741
5b10116e 3742 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3743 if (t == EXEC_DIRECTORY_CONFIGURATION)
3744 continue;
3745
3746 if (!p->prefix[t])
3747 continue;
3748
211a3d87 3749 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3750 char *e;
3751
494d0247 3752 if (exec_directory_is_private(c, t))
211a3d87 3753 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3754 else
211a3d87 3755 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3756 if (!e)
3757 return -ENOMEM;
3758
3759 r = strv_consume(&list, e);
3760 if (r < 0)
3761 return r;
3762 }
3763 }
3764
ae2a15bc 3765 *ret = TAKE_PTR(list);
da50b85a
LP
3766
3767 return 0;
3768}
3769
78f93209
LP
3770static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3771 bool using_subcgroup;
3772 char *p;
3773
3774 assert(params);
3775 assert(ret);
3776
3777 if (!params->cgroup_path)
3778 return -EINVAL;
3779
3780 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3781 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3782 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3783 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3784 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3785 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3786 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3787 * flag, which is only passed for the former statements, not for the latter. */
3788
3789 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3790 if (using_subcgroup)
657ee2d8 3791 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3792 else
3793 p = strdup(params->cgroup_path);
3794 if (!p)
3795 return -ENOMEM;
3796
3797 *ret = p;
3798 return using_subcgroup;
3799}
3800
e2b2fb7f
MS
3801static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3802 _cleanup_(cpu_set_reset) CPUSet s = {};
3803 int r;
3804
3805 assert(c);
3806 assert(ret);
3807
3808 if (!c->numa_policy.nodes.set) {
3809 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3810 return 0;
3811 }
3812
3813 r = numa_to_cpu_set(&c->numa_policy, &s);
3814 if (r < 0)
3815 return r;
3816
3817 cpu_set_reset(ret);
3818
3819 return cpu_set_add_all(ret, &s);
3820}
3821
3822bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3823 assert(c);
3824
3825 return c->cpu_affinity_from_numa;
3826}
3827
1da37e58
ZJS
3828static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3829 int r;
3830
3831 assert(fds);
3832 assert(n_fds);
3833 assert(*n_fds < fds_size);
3834 assert(ret_fd);
3835
3836 if (fd < 0) {
3837 *ret_fd = -1;
3838 return 0;
3839 }
3840
3841 if (fd < 3 + (int) *n_fds) {
3842 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3843 * the fds we pass to the process (or which are closed only during execve). */
3844
3845 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3846 if (r < 0)
3847 return -errno;
3848
3849 CLOSE_AND_REPLACE(fd, r);
3850 }
3851
3852 *ret_fd = fds[*n_fds] = fd;
3853 (*n_fds) ++;
3854 return 1;
3855}
3856
ff0af2a1 3857static int exec_child(
f2341e0a 3858 Unit *unit,
34cf6c43 3859 const ExecCommand *command,
ff0af2a1
LP
3860 const ExecContext *context,
3861 const ExecParameters *params,
3862 ExecRuntime *runtime,
29206d46 3863 DynamicCreds *dcreds,
ff0af2a1 3864 int socket_fd,
2caa38e9 3865 const int named_iofds[static 3],
4c47affc 3866 int *fds,
da6053d0 3867 size_t n_socket_fds,
25b583d7 3868 size_t n_storage_fds,
ff0af2a1 3869 char **files_env,
00d9ef85 3870 int user_lookup_fd,
12145637 3871 int *exit_status) {
d35fbf6b 3872
8c35c10d 3873 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3874 int r, ngids = 0, exec_fd;
4d885bd3
DH
3875 _cleanup_free_ gid_t *supplementary_gids = NULL;
3876 const char *username = NULL, *groupname = NULL;
5686391b 3877 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3878 const char *home = NULL, *shell = NULL;
7ca69792 3879 char **final_argv = NULL;
7bce046b
LP
3880 dev_t journal_stream_dev = 0;
3881 ino_t journal_stream_ino = 0;
5749f855 3882 bool userns_set_up = false;
165a31c0
LP
3883 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3884 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3885 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3886 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3887#if HAVE_SELINUX
7f59dd35 3888 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3889 bool use_selinux = false;
ecfbc84f 3890#endif
f9fa32f0 3891#if ENABLE_SMACK
43b1f709 3892 bool use_smack = false;
ecfbc84f 3893#endif
349cc4a5 3894#if HAVE_APPARMOR
43b1f709 3895 bool use_apparmor = false;
ecfbc84f 3896#endif
5749f855
AZ
3897 uid_t saved_uid = getuid();
3898 gid_t saved_gid = getgid();
fed1e721
LP
3899 uid_t uid = UID_INVALID;
3900 gid_t gid = GID_INVALID;
1da37e58
ZJS
3901 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3902 n_keep_fds; /* total number of fds not to close */
165a31c0 3903 int secure_bits;
afb11bf1
DG
3904 _cleanup_free_ gid_t *gids_after_pam = NULL;
3905 int ngids_after_pam = 0;
034c6ed7 3906
f2341e0a 3907 assert(unit);
5cb5a6ff
LP
3908 assert(command);
3909 assert(context);
d35fbf6b 3910 assert(params);
ff0af2a1 3911 assert(exit_status);
d35fbf6b
DM
3912
3913 rename_process_from_path(command->path);
3914
9c274488
LP
3915 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3916 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3917 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3918 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3919 SIGNALS_IGNORE);
d35fbf6b
DM
3920
3921 if (context->ignore_sigpipe)
9c274488 3922 (void) ignore_signals(SIGPIPE);
d35fbf6b 3923
ff0af2a1
LP
3924 r = reset_signal_mask();
3925 if (r < 0) {
3926 *exit_status = EXIT_SIGNAL_MASK;
12145637 3927 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3928 }
034c6ed7 3929
d35fbf6b
DM
3930 if (params->idle_pipe)
3931 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3932
2c027c62
LP
3933 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3934 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3935 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3936 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3937
d35fbf6b 3938 log_forget_fds();
2c027c62 3939 log_set_open_when_needed(true);
4f2d528d 3940
40a80078
LP
3941 /* In case anything used libc syslog(), close this here, too */
3942 closelog();
3943
b1994387 3944 int keep_fds[n_fds + 3];
1da37e58
ZJS
3945 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3946 n_keep_fds = n_fds;
3947
3948 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3949 if (r < 0) {
3950 *exit_status = EXIT_FDS;
3951 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3952 }
3953
b1994387
ILG
3954#if HAVE_LIBBPF
3955 if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
3956 int bpf_map_fd = -1;
3957
3958 bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
3959 if (bpf_map_fd < 0) {
3960 *exit_status = EXIT_FDS;
3961 return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
3962 }
3963
3964 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
3965 if (r < 0) {
3966 *exit_status = EXIT_FDS;
3967 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3968 }
3969 }
3970#endif
3971
1da37e58 3972 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3973 if (r < 0) {
3974 *exit_status = EXIT_FDS;
12145637 3975 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3976 }
3977
0af07108
ZJS
3978 if (!context->same_pgrp &&
3979 setsid() < 0) {
3980 *exit_status = EXIT_SETSID;
3981 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3982 }
9e2f7c11 3983
1e22b5cd 3984 exec_context_tty_reset(context, params);
d35fbf6b 3985
c891efaf 3986 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 3987 const char *vc = params->confirm_spawn;
3b20f877
FB
3988 _cleanup_free_ char *cmdline = NULL;
3989
8a62620e 3990 cmdline = quote_command_line(command->argv);
3b20f877 3991 if (!cmdline) {
0460aa5c 3992 *exit_status = EXIT_MEMORY;
12145637 3993 return log_oom();
3b20f877 3994 }
d35fbf6b 3995
eedf223a 3996 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
3997 if (r != CONFIRM_EXECUTE) {
3998 if (r == CONFIRM_PRETEND_SUCCESS) {
3999 *exit_status = EXIT_SUCCESS;
4000 return 0;
4001 }
ff0af2a1 4002 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4003 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4004 "Execution cancelled by the user");
d35fbf6b
DM
4005 }
4006 }
1a63a750 4007
d521916d
LP
4008 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4009 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4010 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4011 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4012 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4013 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4014 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4015 *exit_status = EXIT_MEMORY;
4016 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4017 }
4018
29206d46 4019 if (context->dynamic_user && dcreds) {
da50b85a 4020 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4021
d521916d 4022 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4023 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4024 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4025 *exit_status = EXIT_USER;
12145637 4026 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4027 }
4028
da50b85a
LP
4029 r = compile_suggested_paths(context, params, &suggested_paths);
4030 if (r < 0) {
4031 *exit_status = EXIT_MEMORY;
4032 return log_oom();
4033 }
4034
4035 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4036 if (r < 0) {
4037 *exit_status = EXIT_USER;
d85ff944
YW
4038 if (r == -EILSEQ)
4039 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4040 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4041 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4042 }
524daa8c 4043
70dd455c 4044 if (!uid_is_valid(uid)) {
29206d46 4045 *exit_status = EXIT_USER;
d85ff944 4046 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4047 }
4048
4049 if (!gid_is_valid(gid)) {
4050 *exit_status = EXIT_USER;
d85ff944 4051 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4052 }
5bc7452b 4053
29206d46
LP
4054 if (dcreds->user)
4055 username = dcreds->user->name;
4056
4057 } else {
4d885bd3
DH
4058 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4059 if (r < 0) {
4060 *exit_status = EXIT_USER;
12145637 4061 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4062 }
5bc7452b 4063
4d885bd3
DH
4064 r = get_fixed_group(context, &groupname, &gid);
4065 if (r < 0) {
4066 *exit_status = EXIT_GROUP;
12145637 4067 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4068 }
cdc5d5c5 4069 }
29206d46 4070
cdc5d5c5
DH
4071 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4072 r = get_supplementary_groups(context, username, groupname, gid,
4073 &supplementary_gids, &ngids);
4074 if (r < 0) {
4075 *exit_status = EXIT_GROUP;
12145637 4076 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4077 }
5bc7452b 4078
00d9ef85
LP
4079 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4080 if (r < 0) {
4081 *exit_status = EXIT_USER;
12145637 4082 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4083 }
4084
4085 user_lookup_fd = safe_close(user_lookup_fd);
4086
6732edab
LP
4087 r = acquire_home(context, uid, &home, &home_buffer);
4088 if (r < 0) {
4089 *exit_status = EXIT_CHDIR;
12145637 4090 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4091 }
4092
d35fbf6b
DM
4093 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4094 * must be sure to drop O_NONBLOCK */
4095 if (socket_fd >= 0)
a34ceba6 4096 (void) fd_nonblock(socket_fd, false);
acbb0225 4097
4c70a4a7
MS
4098 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4099 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4100 if (params->cgroup_path) {
4101 _cleanup_free_ char *p = NULL;
4102
4103 r = exec_parameters_get_cgroup_path(params, &p);
4104 if (r < 0) {
4105 *exit_status = EXIT_CGROUP;
4106 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4107 }
4108
4109 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4110 if (r < 0) {
4111 *exit_status = EXIT_CGROUP;
4112 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4113 }
4114 }
4115
a8d08f39 4116 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4117 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4118 if (r < 0) {
4119 *exit_status = EXIT_NETWORK;
4120 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4121 }
4122 }
4123
a70581ff
XR
4124 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4125 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4126 if (r < 0) {
4127 *exit_status = EXIT_NAMESPACE;
4128 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4129 }
4130 }
4131
52c239d7 4132 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4133 if (r < 0) {
4134 *exit_status = EXIT_STDIN;
12145637 4135 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4136 }
034c6ed7 4137
52c239d7 4138 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4139 if (r < 0) {
4140 *exit_status = EXIT_STDOUT;
12145637 4141 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4142 }
4143
52c239d7 4144 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4145 if (r < 0) {
4146 *exit_status = EXIT_STDERR;
12145637 4147 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4148 }
4149
d35fbf6b 4150 if (context->oom_score_adjust_set) {
9f8168eb
LP
4151 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4152 * prohibit write access to this file, and we shouldn't trip up over that. */
4153 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4154 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4155 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4156 else if (r < 0) {
ff0af2a1 4157 *exit_status = EXIT_OOM_ADJUST;
12145637 4158 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4159 }
d35fbf6b
DM
4160 }
4161
ad21e542
ZJS
4162 if (context->coredump_filter_set) {
4163 r = set_coredump_filter(context->coredump_filter);
4164 if (ERRNO_IS_PRIVILEGE(r))
4165 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4166 else if (r < 0)
4167 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4168 }
4169
39090201
DJL
4170 if (context->nice_set) {
4171 r = setpriority_closest(context->nice);
4172 if (r < 0)
4173 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4174 }
613b411c 4175
d35fbf6b
DM
4176 if (context->cpu_sched_set) {
4177 struct sched_param param = {
4178 .sched_priority = context->cpu_sched_priority,
4179 };
4180
ff0af2a1
LP
4181 r = sched_setscheduler(0,
4182 context->cpu_sched_policy |
4183 (context->cpu_sched_reset_on_fork ?
4184 SCHED_RESET_ON_FORK : 0),
4185 &param);
4186 if (r < 0) {
4187 *exit_status = EXIT_SETSCHEDULER;
12145637 4188 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4189 }
d35fbf6b 4190 }
fc9b2a84 4191
e2b2fb7f
MS
4192 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4193 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4194 const CPUSet *cpu_set;
4195
4196 if (context->cpu_affinity_from_numa) {
4197 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4198 if (r < 0) {
4199 *exit_status = EXIT_CPUAFFINITY;
4200 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4201 }
4202
4203 cpu_set = &converted_cpu_set;
4204 } else
4205 cpu_set = &context->cpu_set;
4206
4207 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4208 *exit_status = EXIT_CPUAFFINITY;
12145637 4209 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4210 }
e2b2fb7f 4211 }
034c6ed7 4212
b070c7c0
MS
4213 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4214 r = apply_numa_policy(&context->numa_policy);
4215 if (r == -EOPNOTSUPP)
33fe9e3f 4216 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4217 else if (r < 0) {
4218 *exit_status = EXIT_NUMA_POLICY;
4219 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4220 }
4221 }
4222
d35fbf6b
DM
4223 if (context->ioprio_set)
4224 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4225 *exit_status = EXIT_IOPRIO;
12145637 4226 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4227 }
da726a4d 4228
d35fbf6b
DM
4229 if (context->timer_slack_nsec != NSEC_INFINITY)
4230 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4231 *exit_status = EXIT_TIMERSLACK;
12145637 4232 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4233 }
9eba9da4 4234
21022b9d
LP
4235 if (context->personality != PERSONALITY_INVALID) {
4236 r = safe_personality(context->personality);
4237 if (r < 0) {
ff0af2a1 4238 *exit_status = EXIT_PERSONALITY;
12145637 4239 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4240 }
21022b9d 4241 }
94f04347 4242
33331d11
VB
4243 if (context->utmp_id) {
4244 const char *line = context->tty_path ?
4245 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4246 NULL;
df0ff127 4247 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4248 line,
023a4f67
LP
4249 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4250 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4251 USER_PROCESS,
6a93917d 4252 username);
33331d11 4253 }
d35fbf6b 4254
08f67696 4255 if (uid_is_valid(uid)) {
ff0af2a1
LP
4256 r = chown_terminal(STDIN_FILENO, uid);
4257 if (r < 0) {
4258 *exit_status = EXIT_STDIN;
12145637 4259 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4260 }
d35fbf6b 4261 }
8e274523 4262
4e1dfa45 4263 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4264 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4265 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4266 * touch a single hierarchy too. */
584b8688 4267 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4268 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4269 if (r < 0) {
4270 *exit_status = EXIT_CGROUP;
12145637 4271 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4272 }
d35fbf6b 4273 }
034c6ed7 4274
211a3d87
LB
4275 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4276
5b10116e 4277 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4278 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4279 if (r < 0)
4280 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4281 }
94f04347 4282
bb0c0d6f
LP
4283 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4284 r = setup_credentials(context, params, unit->id, uid);
4285 if (r < 0) {
4286 *exit_status = EXIT_CREDENTIALS;
4287 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4288 }
4289 }
4290
7bce046b 4291 r = build_environment(
fd63e712 4292 unit,
7bce046b
LP
4293 context,
4294 params,
4295 n_fds,
4296 home,
4297 username,
4298 shell,
4299 journal_stream_dev,
4300 journal_stream_ino,
4301 &our_env);
2065ca69
JW
4302 if (r < 0) {
4303 *exit_status = EXIT_MEMORY;
12145637 4304 return log_oom();
2065ca69
JW
4305 }
4306
4307 r = build_pass_environment(context, &pass_env);
4308 if (r < 0) {
4309 *exit_status = EXIT_MEMORY;
12145637 4310 return log_oom();
2065ca69
JW
4311 }
4312
8c35c10d 4313 /* The PATH variable is set to the default path in params->environment.
4314 * However, this is overridden if user specified fields have PATH set.
4315 * The intention is to also override PATH if the user does
4316 * not specify PATH and the user has specified ExecSearchPath
4317 */
4318
4319 if (!strv_isempty(context->exec_search_path)) {
4320 _cleanup_free_ char *joined = NULL;
4321
4322 joined = strv_join(context->exec_search_path, ":");
4323 if (!joined) {
4324 *exit_status = EXIT_MEMORY;
4325 return log_oom();
4326 }
4327
4328 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4329 if (r < 0) {
4330 *exit_status = EXIT_MEMORY;
4331 return log_oom();
4332 }
4333 }
4334
4ab3d29f 4335 accum_env = strv_env_merge(params->environment,
2065ca69 4336 our_env,
8c35c10d 4337 joined_exec_search_path,
2065ca69
JW
4338 pass_env,
4339 context->environment,
44e5d006 4340 files_env);
2065ca69
JW
4341 if (!accum_env) {
4342 *exit_status = EXIT_MEMORY;
12145637 4343 return log_oom();
2065ca69 4344 }
1280503b 4345 accum_env = strv_env_clean(accum_env);
2065ca69 4346
096424d1 4347 (void) umask(context->umask);
b213e1c1 4348
b1edf445 4349 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4350 if (r < 0) {
4351 *exit_status = EXIT_KEYRING;
12145637 4352 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4353 }
4354
165a31c0 4355 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4356 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4357
165a31c0
LP
4358 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4359 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4360
165a31c0
LP
4361 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4362 if (needs_ambient_hack)
4363 needs_setuid = false;
4364 else
4365 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4366
4367 if (needs_sandboxing) {
7f18ef0a
FK
4368 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4369 * present. The actual MAC context application will happen later, as late as possible, to avoid
4370 * impacting our own code paths. */
4371
349cc4a5 4372#if HAVE_SELINUX
43b1f709 4373 use_selinux = mac_selinux_use();
7f18ef0a 4374#endif
f9fa32f0 4375#if ENABLE_SMACK
43b1f709 4376 use_smack = mac_smack_use();
7f18ef0a 4377#endif
349cc4a5 4378#if HAVE_APPARMOR
43b1f709 4379 use_apparmor = mac_apparmor_use();
7f18ef0a 4380#endif
165a31c0 4381 }
7f18ef0a 4382
ce932d2d
LP
4383 if (needs_sandboxing) {
4384 int which_failed;
4385
4386 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4387 * is set here. (See below.) */
4388
4389 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4390 if (r < 0) {
4391 *exit_status = EXIT_LIMITS;
4392 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4393 }
4394 }
4395
0af07108 4396 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4397 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4398 * wins here. (See above.) */
4399
1da37e58 4400 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4401 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4402 if (r < 0) {
4403 *exit_status = EXIT_PAM;
4404 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4405 }
ac45f971 4406
0af07108
ZJS
4407 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4408 if (ngids_after_pam < 0) {
4409 *exit_status = EXIT_MEMORY;
4410 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4411 }
b213e1c1 4412 }
5749f855 4413
0af07108 4414 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4415 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4416 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4417 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4418
4419 userns_set_up = true;
4420 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4421 if (r < 0) {
4422 *exit_status = EXIT_USER;
4423 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4424 }
4425 }
4426
a8d08f39
LP
4427 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4428
6e2d7c4f 4429 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4430 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4431 if (r == -EPERM)
4432 log_unit_warning_errno(unit, r,
4433 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4434 else if (r < 0) {
6e2d7c4f
MS
4435 *exit_status = EXIT_NETWORK;
4436 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4437 }
a8d08f39
LP
4438 } else if (context->network_namespace_path) {
4439 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4440 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4441 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4442 } else
4443 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4444 }
169c1bda 4445
a70581ff
XR
4446 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4447
4448 if (ns_type_supported(NAMESPACE_IPC)) {
4449 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4450 if (r == -EPERM)
4451 log_unit_warning_errno(unit, r,
4452 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4453 else if (r < 0) {
4454 *exit_status = EXIT_NAMESPACE;
4455 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4456 }
4457 } else if (context->ipc_namespace_path) {
4458 *exit_status = EXIT_NAMESPACE;
4459 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4460 "IPCNamespacePath= is not supported, refusing.");
4461 } else
4462 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4463 }
4464
ee818b89 4465 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4466 _cleanup_free_ char *error_path = NULL;
4467
9f71ba8d 4468 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4469 if (r < 0) {
4470 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4471 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4472 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4473 }
d35fbf6b 4474 }
81a2b7ce 4475
daf8f72b
LP
4476 if (needs_sandboxing) {
4477 r = apply_protect_hostname(unit, context, exit_status);
4478 if (r < 0)
4479 return r;
aecd5ac6
TM
4480 }
4481
5749f855
AZ
4482 /* Drop groups as early as possible.
4483 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4484 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4485 if (needs_setuid) {
afb11bf1
DG
4486 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4487 int ngids_to_enforce = 0;
4488
4489 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4490 ngids,
4491 gids_after_pam,
4492 ngids_after_pam,
4493 &gids_to_enforce);
4494 if (ngids_to_enforce < 0) {
4495 *exit_status = EXIT_MEMORY;
4496 return log_unit_error_errno(unit,
4497 ngids_to_enforce,
4498 "Failed to merge group lists. Group membership might be incorrect: %m");
4499 }
4500
4501 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4502 if (r < 0) {
4503 *exit_status = EXIT_GROUP;
12145637 4504 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4505 }
165a31c0 4506 }
096424d1 4507
5749f855
AZ
4508 /* If the user namespace was not set up above, try to do it now.
4509 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4510 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4511 * case of mount namespaces being less privileged when the mount point list is copied from a
4512 * different user namespace). */
9008e1ac 4513
5749f855
AZ
4514 if (needs_sandboxing && context->private_users && !userns_set_up) {
4515 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4516 if (r < 0) {
4517 *exit_status = EXIT_USER;
4518 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4519 }
4520 }
4521
9f71ba8d
ZJS
4522 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4523 * shall execute. */
4524
4525 _cleanup_free_ char *executable = NULL;
b83d5050 4526 _cleanup_close_ int executable_fd = -1;
8c35c10d 4527 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4528 if (r < 0) {
4529 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4530 log_unit_struct_errno(unit, LOG_INFO, r,
4531 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4532 LOG_UNIT_INVOCATION_ID(unit),
4533 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4534 command->path),
4535 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4536 return 0;
4537 }
4538
4539 *exit_status = EXIT_EXEC;
c2503e35
RH
4540
4541 return log_unit_struct_errno(unit, LOG_INFO, r,
4542 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4543 LOG_UNIT_INVOCATION_ID(unit),
4544 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4545 command->path),
4546 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4547 }
4548
b83d5050
ZJS
4549 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4550 if (r < 0) {
4551 *exit_status = EXIT_FDS;
4552 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4553 }
4554
9f71ba8d 4555#if HAVE_SELINUX
49590d67
MS
4556 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4557 int fd = -1;
4558
4559 if (socket_fd >= 0)
4560 fd = socket_fd;
4561 else if (params->n_socket_fds == 1)
4562 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4563 * use context from that fd to compute the label. */
4564 fd = params->fds[0];
4565
4566 if (fd >= 0) {
4567 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
2ad2925d 4568 if (r < 0 && !context->selinux_context_ignore) {
49590d67
MS
4569 *exit_status = EXIT_SELINUX_CONTEXT;
4570 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4571 }
9f71ba8d
ZJS
4572 }
4573 }
4574#endif
4575
165a31c0 4576 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4577 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4578 * however if we have it as we want to keep it open until the final execve(). */
4579
1da37e58 4580 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4581 if (r >= 0)
4582 r = shift_fds(fds, n_fds);
4583 if (r >= 0)
25b583d7 4584 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4585 if (r < 0) {
4586 *exit_status = EXIT_FDS;
12145637 4587 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4588 }
e66cf1a3 4589
5686391b
LP
4590 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4591 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4592 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4593 * came this far. */
4594
165a31c0 4595 secure_bits = context->secure_bits;
e66cf1a3 4596
165a31c0
LP
4597 if (needs_sandboxing) {
4598 uint64_t bset;
e66cf1a3 4599
ce932d2d
LP
4600 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4601 * requested. (Note this is placed after the general resource limit initialization, see
4602 * above, in order to take precedence.) */
f4170c67
LP
4603 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4604 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4605 *exit_status = EXIT_LIMITS;
12145637 4606 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4607 }
4608 }
4609
37ac2744
JB
4610#if ENABLE_SMACK
4611 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4612 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4613 if (use_smack) {
b83d5050 4614 r = setup_smack(context, executable_fd);
29ff6247 4615 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4616 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4617 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4618 }
4619 }
4620#endif
4621
165a31c0
LP
4622 bset = context->capability_bounding_set;
4623 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4624 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4625 * instead of us doing that */
4626 if (needs_ambient_hack)
4627 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4628 (UINT64_C(1) << CAP_SETUID) |
4629 (UINT64_C(1) << CAP_SETGID);
4630
4631 if (!cap_test_all(bset)) {
4632 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4633 if (r < 0) {
4634 *exit_status = EXIT_CAPABILITIES;
12145637 4635 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4636 }
4c2630eb 4637 }
3b8bddde 4638
16fcb191
TK
4639 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4640 * keep-caps set.
4641 * To be able to raise the ambient capabilities after setresuid() they have to be
4642 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4643 * After setresuid() the ambient capabilities can be raised as they are present in
4644 * the permitted and inheritable set. However it is possible that someone wants to
4645 * set ambient capabilities without changing the user, so we also set the ambient
4646 * capabilities here.
4647 * The requested ambient capabilities are raised in the inheritable set if the
4648 * second argument is true. */
943800f4 4649 if (!needs_ambient_hack) {
755d4b67
IP
4650 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4651 if (r < 0) {
4652 *exit_status = EXIT_CAPABILITIES;
12145637 4653 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4654 }
755d4b67 4655 }
165a31c0 4656 }
755d4b67 4657
fa97f630
JB
4658 /* chroot to root directory first, before we lose the ability to chroot */
4659 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4660 if (r < 0)
4661 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4662
165a31c0 4663 if (needs_setuid) {
08f67696 4664 if (uid_is_valid(uid)) {
ff0af2a1
LP
4665 r = enforce_user(context, uid);
4666 if (r < 0) {
4667 *exit_status = EXIT_USER;
12145637 4668 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4669 }
165a31c0
LP
4670
4671 if (!needs_ambient_hack &&
4672 context->capability_ambient_set != 0) {
755d4b67 4673
16fcb191 4674 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4675 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4676 if (r < 0) {
4677 *exit_status = EXIT_CAPABILITIES;
12145637 4678 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4679 }
755d4b67 4680 }
5b6319dc 4681 }
165a31c0 4682 }
d35fbf6b 4683
56ef8db9
JB
4684 /* Apply working directory here, because the working directory might be on NFS and only the user running
4685 * this service might have the correct privilege to change to the working directory */
fa97f630 4686 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4687 if (r < 0)
4688 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4689
165a31c0 4690 if (needs_sandboxing) {
37ac2744 4691 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4692 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4693 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4694 * are restricted. */
4695
349cc4a5 4696#if HAVE_SELINUX
43b1f709 4697 if (use_selinux) {
5cd9cd35
LP
4698 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4699
4700 if (exec_context) {
4701 r = setexeccon(exec_context);
2ad2925d 4702 if (r < 0 && !context->selinux_context_ignore) {
5cd9cd35 4703 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 4704 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
4705 }
4706 }
4707 }
4708#endif
4709
349cc4a5 4710#if HAVE_APPARMOR
43b1f709 4711 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4712 r = aa_change_onexec(context->apparmor_profile);
4713 if (r < 0 && !context->apparmor_profile_ignore) {
4714 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4715 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4716 }
4717 }
4718#endif
4719
165a31c0 4720 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4721 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4722 * CAP_SETPCAP. */
4723 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4724 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4725 * effective set here.
4726 * The effective set is overwritten during execve with the following values:
4727 * - ambient set (for non-root processes)
4728 * - (inheritable | bounding) set (for root processes)
4729 *
4730 * Hence there is no security impact to raise it in the effective set before execve
4731 */
4732 r = capability_gain_cap_setpcap(NULL);
4733 if (r < 0) {
4734 *exit_status = EXIT_CAPABILITIES;
4735 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4736 }
755d4b67 4737 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4738 *exit_status = EXIT_SECUREBITS;
12145637 4739 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4740 }
dbdc4098 4741 }
5b6319dc 4742
59eeb84b 4743 if (context_has_no_new_privileges(context))
d35fbf6b 4744 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4745 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4746 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4747 }
4748
349cc4a5 4749#if HAVE_SECCOMP
469830d1
LP
4750 r = apply_address_families(unit, context);
4751 if (r < 0) {
4752 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4753 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4754 }
04aa0cb9 4755
469830d1
LP
4756 r = apply_memory_deny_write_execute(unit, context);
4757 if (r < 0) {
4758 *exit_status = EXIT_SECCOMP;
12145637 4759 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4760 }
f4170c67 4761
469830d1
LP
4762 r = apply_restrict_realtime(unit, context);
4763 if (r < 0) {
4764 *exit_status = EXIT_SECCOMP;
12145637 4765 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4766 }
4767
f69567cb
LP
4768 r = apply_restrict_suid_sgid(unit, context);
4769 if (r < 0) {
4770 *exit_status = EXIT_SECCOMP;
4771 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4772 }
4773
add00535
LP
4774 r = apply_restrict_namespaces(unit, context);
4775 if (r < 0) {
4776 *exit_status = EXIT_SECCOMP;
12145637 4777 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4778 }
4779
469830d1
LP
4780 r = apply_protect_sysctl(unit, context);
4781 if (r < 0) {
4782 *exit_status = EXIT_SECCOMP;
12145637 4783 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4784 }
4785
469830d1
LP
4786 r = apply_protect_kernel_modules(unit, context);
4787 if (r < 0) {
4788 *exit_status = EXIT_SECCOMP;
12145637 4789 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4790 }
4791
84703040
KK
4792 r = apply_protect_kernel_logs(unit, context);
4793 if (r < 0) {
4794 *exit_status = EXIT_SECCOMP;
4795 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4796 }
4797
fc64760d
KK
4798 r = apply_protect_clock(unit, context);
4799 if (r < 0) {
4800 *exit_status = EXIT_SECCOMP;
4801 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4802 }
4803
469830d1
LP
4804 r = apply_private_devices(unit, context);
4805 if (r < 0) {
4806 *exit_status = EXIT_SECCOMP;
12145637 4807 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4808 }
4809
4810 r = apply_syscall_archs(unit, context);
4811 if (r < 0) {
4812 *exit_status = EXIT_SECCOMP;
12145637 4813 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4814 }
4815
78e864e5
TM
4816 r = apply_lock_personality(unit, context);
4817 if (r < 0) {
4818 *exit_status = EXIT_SECCOMP;
12145637 4819 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4820 }
4821
9df2cdd8
TM
4822 r = apply_syscall_log(unit, context);
4823 if (r < 0) {
4824 *exit_status = EXIT_SECCOMP;
4825 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4826 }
4827
5cd9cd35
LP
4828 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4829 * by the filter as little as possible. */
165a31c0 4830 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4831 if (r < 0) {
4832 *exit_status = EXIT_SECCOMP;
12145637 4833 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4834 }
4835#endif
b1994387
ILG
4836
4837#if HAVE_LIBBPF
4838 r = apply_restrict_filesystems(unit, context);
4839 if (r < 0) {
4840 *exit_status = EXIT_BPF;
4841 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
4842 }
4843#endif
4844
d35fbf6b 4845 }
034c6ed7 4846
00819cc1
LP
4847 if (!strv_isempty(context->unset_environment)) {
4848 char **ee = NULL;
4849
4850 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4851 if (!ee) {
4852 *exit_status = EXIT_MEMORY;
12145637 4853 return log_oom();
00819cc1
LP
4854 }
4855
130d3d22 4856 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4857 }
4858
7ca69792
AZ
4859 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4860 replaced_argv = replace_env_argv(command->argv, accum_env);
4861 if (!replaced_argv) {
4862 *exit_status = EXIT_MEMORY;
4863 return log_oom();
4864 }
4865 final_argv = replaced_argv;
4866 } else
4867 final_argv = command->argv;
034c6ed7 4868
f1d34068 4869 if (DEBUG_LOGGING) {
c2b2df60 4870 _cleanup_free_ char *line = NULL;
81a2b7ce 4871
8a62620e
ZJS
4872 line = quote_command_line(final_argv);
4873 if (!line) {
4874 *exit_status = EXIT_MEMORY;
4875 return log_oom();
4876 }
4877
4878 log_unit_struct(unit, LOG_DEBUG,
4879 "EXECUTABLE=%s", executable,
4880 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 4881 }
dd305ec9 4882
5686391b
LP
4883 if (exec_fd >= 0) {
4884 uint8_t hot = 1;
4885
4886 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4887 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4888
4889 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4890 *exit_status = EXIT_EXEC;
4891 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4892 }
4893 }
4894
a6d9111c 4895 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4896
4897 if (exec_fd >= 0) {
4898 uint8_t hot = 0;
4899
4900 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4901 * that POLLHUP on it no longer means execve() succeeded. */
4902
4903 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4904 *exit_status = EXIT_EXEC;
4905 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4906 }
4907 }
12145637 4908
ff0af2a1 4909 *exit_status = EXIT_EXEC;
9f71ba8d 4910 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4911}
81a2b7ce 4912
34cf6c43 4913static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4914static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4915
f2341e0a
LP
4916int exec_spawn(Unit *unit,
4917 ExecCommand *command,
d35fbf6b
DM
4918 const ExecContext *context,
4919 const ExecParameters *params,
4920 ExecRuntime *runtime,
29206d46 4921 DynamicCreds *dcreds,
d35fbf6b 4922 pid_t *ret) {
8351ceae 4923
ee39ca20 4924 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4925 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4926 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4927 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4928 _cleanup_free_ char *line = NULL;
d35fbf6b 4929 pid_t pid;
8351ceae 4930
f2341e0a 4931 assert(unit);
d35fbf6b
DM
4932 assert(command);
4933 assert(context);
4934 assert(ret);
4935 assert(params);
25b583d7 4936 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4937
d35fbf6b
DM
4938 if (context->std_input == EXEC_INPUT_SOCKET ||
4939 context->std_output == EXEC_OUTPUT_SOCKET ||
4940 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4941
d85ff944
YW
4942 if (params->n_socket_fds > 1)
4943 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4944
d85ff944
YW
4945 if (params->n_socket_fds == 0)
4946 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4947
d35fbf6b
DM
4948 socket_fd = params->fds[0];
4949 } else {
4950 socket_fd = -1;
4951 fds = params->fds;
9b141911 4952 n_socket_fds = params->n_socket_fds;
25b583d7 4953 n_storage_fds = params->n_storage_fds;
d35fbf6b 4954 }
94f04347 4955
34cf6c43 4956 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4957 if (r < 0)
4958 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4959
f2341e0a 4960 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4961 if (r < 0)
f2341e0a 4962 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4963
8a62620e 4964 line = quote_command_line(command->argv);
d35fbf6b
DM
4965 if (!line)
4966 return log_oom();
fab56fc5 4967
9f71ba8d
ZJS
4968 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4969 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4970 mac_selinux_maybe_reload();
4971
c2503e35
RH
4972 log_unit_struct(unit, LOG_DEBUG,
4973 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4974 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4975 the mount namespace in the child, but we want to log
4976 from the parent, so we need to use the (possibly
4977 inaccurate) path here. */
4978 LOG_UNIT_INVOCATION_ID(unit));
12145637 4979
78f93209
LP
4980 if (params->cgroup_path) {
4981 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4982 if (r < 0)
4983 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4984 if (r > 0) { /* We are using a child cgroup */
4985 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4986 if (r < 0)
4987 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
4988
4989 /* Normally we would not propagate the oomd xattrs to children but since we created this
4990 * sub-cgroup internally we should do it. */
4991 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
4992 }
4993 }
4994
d35fbf6b
DM
4995 pid = fork();
4996 if (pid < 0)
74129a12 4997 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
4998
4999 if (pid == 0) {
12145637 5000 int exit_status = EXIT_SUCCESS;
ff0af2a1 5001
f2341e0a
LP
5002 r = exec_child(unit,
5003 command,
ff0af2a1
LP
5004 context,
5005 params,
5006 runtime,
29206d46 5007 dcreds,
ff0af2a1 5008 socket_fd,
52c239d7 5009 named_iofds,
4c47affc 5010 fds,
9b141911 5011 n_socket_fds,
25b583d7 5012 n_storage_fds,
ff0af2a1 5013 files_env,
00d9ef85 5014 unit->manager->user_lookup_fds[1],
12145637
LP
5015 &exit_status);
5016
e1714f02
ZJS
5017 if (r < 0) {
5018 const char *status =
5019 exit_status_to_string(exit_status,
e04ed6db 5020 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5021
c2503e35
RH
5022 log_unit_struct_errno(unit, LOG_ERR, r,
5023 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5024 LOG_UNIT_INVOCATION_ID(unit),
5025 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5026 status, command->path),
5027 "EXECUTABLE=%s", command->path);
e1714f02 5028 }
4c2630eb 5029
ff0af2a1 5030 _exit(exit_status);
034c6ed7
LP
5031 }
5032
f2341e0a 5033 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5034
78f93209
LP
5035 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5036 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5037 * process will be killed too). */
5038 if (subcgroup_path)
5039 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5040
b58b4116 5041 exec_status_start(&command->exec_status, pid);
9fb86720 5042
034c6ed7 5043 *ret = pid;
5cb5a6ff
LP
5044 return 0;
5045}
5046
034c6ed7
LP
5047void exec_context_init(ExecContext *c) {
5048 assert(c);
5049
4c12626c 5050 c->umask = 0022;
5bead76e 5051 c->ioprio = ioprio_prio_value(IOPRIO_CLASS_BE, 0);
94f04347 5052 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5053 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5054 c->syslog_level_prefix = true;
353e12c2 5055 c->ignore_sigpipe = true;
3a43da28 5056 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5057 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5058 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5059 c->directories[t].mode = 0755;
12213aed 5060 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5061 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
5062 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5063 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5064 c->log_level_max = -1;
005bfaf1
TM
5065#if HAVE_SECCOMP
5066 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5067#endif
b070c7c0 5068 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5069}
5070
613b411c 5071void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5072 assert(c);
5073
6796073e
LP
5074 c->environment = strv_free(c->environment);
5075 c->environment_files = strv_free(c->environment_files);
b4c14404 5076 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5077 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5078
31ce987c 5079 rlimit_free_all(c->rlimit);
034c6ed7 5080
5b10116e 5081 for (size_t l = 0; l < 3; l++) {
52c239d7 5082 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5083 c->stdio_file[l] = mfree(c->stdio_file[l]);
5084 }
52c239d7 5085
a1e58e8e
LP
5086 c->working_directory = mfree(c->working_directory);
5087 c->root_directory = mfree(c->root_directory);
915e6d16 5088 c->root_image = mfree(c->root_image);
18d73705 5089 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5090 c->root_hash = mfree(c->root_hash);
5091 c->root_hash_size = 0;
5092 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5093 c->root_hash_sig = mfree(c->root_hash_sig);
5094 c->root_hash_sig_size = 0;
5095 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5096 c->root_verity = mfree(c->root_verity);
93f59701 5097 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
5098 c->tty_path = mfree(c->tty_path);
5099 c->syslog_identifier = mfree(c->syslog_identifier);
5100 c->user = mfree(c->user);
5101 c->group = mfree(c->group);
034c6ed7 5102
6796073e 5103 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5104
a1e58e8e 5105 c->pam_name = mfree(c->pam_name);
5b6319dc 5106
2a624c36
AP
5107 c->read_only_paths = strv_free(c->read_only_paths);
5108 c->read_write_paths = strv_free(c->read_write_paths);
5109 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5110 c->exec_paths = strv_free(c->exec_paths);
5111 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5112 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5113
d2d6c096 5114 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5115 c->bind_mounts = NULL;
5116 c->n_bind_mounts = 0;
2abd4e38
YW
5117 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5118 c->temporary_filesystems = NULL;
5119 c->n_temporary_filesystems = 0;
b3d13314 5120 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5121
0985c7c4 5122 cpu_set_reset(&c->cpu_set);
b070c7c0 5123 numa_policy_reset(&c->numa_policy);
86a3475b 5124
a1e58e8e
LP
5125 c->utmp_id = mfree(c->utmp_id);
5126 c->selinux_context = mfree(c->selinux_context);
5127 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5128 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5129
b1994387
ILG
5130 c->restrict_filesystems = set_free(c->restrict_filesystems);
5131
8cfa775f 5132 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5133 c->syscall_archs = set_free(c->syscall_archs);
5134 c->address_families = set_free(c->address_families);
e66cf1a3 5135
5b10116e 5136 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5137 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5138
5139 c->log_level_max = -1;
5140
5141 exec_context_free_log_extra_fields(c);
08f3be7a 5142
5ac1530e
ZJS
5143 c->log_ratelimit_interval_usec = 0;
5144 c->log_ratelimit_burst = 0;
90fc172e 5145
08f3be7a
LP
5146 c->stdin_data = mfree(c->stdin_data);
5147 c->stdin_data_size = 0;
a8d08f39
LP
5148
5149 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5150 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5151
5152 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5153
43144be4 5154 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5155 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5156}
5157
34cf6c43 5158int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5159 assert(c);
5160
5161 if (!runtime_prefix)
5162 return 0;
5163
211a3d87 5164 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5165 _cleanup_free_ char *p = NULL;
e66cf1a3 5166
494d0247 5167 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5168 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5169 else
211a3d87 5170 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5171 if (!p)
5172 return -ENOMEM;
5173
7bc4bf4a
LP
5174 /* We execute this synchronously, since we need to be sure this is gone when we start the
5175 * service next. */
c6878637 5176 (void) rm_rf(p, REMOVE_ROOT);
211a3d87
LB
5177
5178 char **symlink;
5179 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5180 _cleanup_free_ char *symlink_abs = NULL;
5181
5182 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5183 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5184 else
5185 symlink_abs = path_join(runtime_prefix, *symlink);
5186 if (!symlink_abs)
5187 return -ENOMEM;
5188
5189 (void) unlink(symlink_abs);
5190 }
5191
e66cf1a3
LP
5192 }
5193
5194 return 0;
5cb5a6ff
LP
5195}
5196
bb0c0d6f
LP
5197int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5198 _cleanup_free_ char *p = NULL;
5199
5200 assert(c);
5201
5202 if (!runtime_prefix || !unit)
5203 return 0;
5204
5205 p = path_join(runtime_prefix, "credentials", unit);
5206 if (!p)
5207 return -ENOMEM;
5208
5209 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5210 * unmount it, and afterwards remove the mount point */
5211 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5212 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5213
5214 return 0;
5215}
5216
34cf6c43 5217static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5218 assert(c);
5219
a1e58e8e 5220 c->path = mfree(c->path);
6796073e 5221 c->argv = strv_free(c->argv);
43d0fcbd
LP
5222}
5223
da6053d0 5224void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5225 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5226 exec_command_done(c+i);
5227}
5228
f1acf85a 5229ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5230 ExecCommand *i;
5231
5232 while ((i = c)) {
71fda00f 5233 LIST_REMOVE(command, c, i);
43d0fcbd 5234 exec_command_done(i);
5cb5a6ff
LP
5235 free(i);
5236 }
f1acf85a
ZJS
5237
5238 return NULL;
5cb5a6ff
LP
5239}
5240
da6053d0 5241void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5242 for (size_t i = 0; i < n; i++)
f1acf85a 5243 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5244}
5245
6a1d4d9f 5246void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5247 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5248 exec_status_reset(&c[i].exec_status);
5249}
5250
5251void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5252 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5253 ExecCommand *z;
5254
5255 LIST_FOREACH(command, z, c[i])
5256 exec_status_reset(&z->exec_status);
5257 }
5258}
5259
039f0e70 5260typedef struct InvalidEnvInfo {
34cf6c43 5261 const Unit *unit;
039f0e70
LP
5262 const char *path;
5263} InvalidEnvInfo;
5264
5265static void invalid_env(const char *p, void *userdata) {
5266 InvalidEnvInfo *info = userdata;
5267
f2341e0a 5268 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5269}
5270
52c239d7
LB
5271const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5272 assert(c);
5273
5274 switch (fd_index) {
5073ff6b 5275
52c239d7
LB
5276 case STDIN_FILENO:
5277 if (c->std_input != EXEC_INPUT_NAMED_FD)
5278 return NULL;
5073ff6b 5279
52c239d7 5280 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5281
52c239d7
LB
5282 case STDOUT_FILENO:
5283 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5284 return NULL;
5073ff6b 5285
52c239d7 5286 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5287
52c239d7
LB
5288 case STDERR_FILENO:
5289 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5290 return NULL;
5073ff6b 5291
52c239d7 5292 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5293
52c239d7
LB
5294 default:
5295 return NULL;
5296 }
5297}
5298
2caa38e9
LP
5299static int exec_context_named_iofds(
5300 const ExecContext *c,
5301 const ExecParameters *p,
5302 int named_iofds[static 3]) {
5303
5b10116e 5304 size_t targets;
56fbd561 5305 const char* stdio_fdname[3];
da6053d0 5306 size_t n_fds;
52c239d7
LB
5307
5308 assert(c);
5309 assert(p);
2caa38e9 5310 assert(named_iofds);
52c239d7
LB
5311
5312 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5313 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5314 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5315
5b10116e 5316 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5317 stdio_fdname[i] = exec_context_fdname(c, i);
5318
4c47affc
FB
5319 n_fds = p->n_storage_fds + p->n_socket_fds;
5320
5b10116e 5321 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5322 if (named_iofds[STDIN_FILENO] < 0 &&
5323 c->std_input == EXEC_INPUT_NAMED_FD &&
5324 stdio_fdname[STDIN_FILENO] &&
5325 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5326
52c239d7
LB
5327 named_iofds[STDIN_FILENO] = p->fds[i];
5328 targets--;
56fbd561
ZJS
5329
5330 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5331 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5332 stdio_fdname[STDOUT_FILENO] &&
5333 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5334
52c239d7
LB
5335 named_iofds[STDOUT_FILENO] = p->fds[i];
5336 targets--;
56fbd561
ZJS
5337
5338 } else if (named_iofds[STDERR_FILENO] < 0 &&
5339 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5340 stdio_fdname[STDERR_FILENO] &&
5341 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5342
52c239d7
LB
5343 named_iofds[STDERR_FILENO] = p->fds[i];
5344 targets--;
5345 }
5346
56fbd561 5347 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5348}
5349
34cf6c43 5350static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5351 char **i, **r = NULL;
5352
5353 assert(c);
5354 assert(l);
5355
5356 STRV_FOREACH(i, c->environment_files) {
5357 char *fn;
52511fae 5358 int k;
8c7be95e
LP
5359 bool ignore = false;
5360 char **p;
7fd1b19b 5361 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5362
5363 fn = *i;
5364
5365 if (fn[0] == '-') {
5366 ignore = true;
313cefa1 5367 fn++;
8c7be95e
LP
5368 }
5369
5370 if (!path_is_absolute(fn)) {
8c7be95e
LP
5371 if (ignore)
5372 continue;
5373
5374 strv_free(r);
5375 return -EINVAL;
5376 }
5377
2bef10ab 5378 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5379 k = safe_glob(fn, 0, &pglob);
5380 if (k < 0) {
2bef10ab
PL
5381 if (ignore)
5382 continue;
8c7be95e 5383
2bef10ab 5384 strv_free(r);
d8c92e8b 5385 return k;
2bef10ab 5386 }
8c7be95e 5387
d8c92e8b
ZJS
5388 /* When we don't match anything, -ENOENT should be returned */
5389 assert(pglob.gl_pathc > 0);
5390
5b10116e 5391 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5392 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5393 if (k < 0) {
5394 if (ignore)
5395 continue;
8c7be95e 5396
2bef10ab 5397 strv_free(r);
2bef10ab 5398 return k;
e9c1ea9d 5399 }
ebc05a09 5400 /* Log invalid environment variables with filename */
039f0e70
LP
5401 if (p) {
5402 InvalidEnvInfo info = {
f2341e0a 5403 .unit = unit,
039f0e70
LP
5404 .path = pglob.gl_pathv[n]
5405 };
5406
5407 p = strv_env_clean_with_callback(p, invalid_env, &info);
5408 }
8c7be95e 5409
234519ae 5410 if (!r)
2bef10ab
PL
5411 r = p;
5412 else {
5413 char **m;
8c7be95e 5414
4ab3d29f 5415 m = strv_env_merge(r, p);
2bef10ab
PL
5416 strv_free(r);
5417 strv_free(p);
c84a9488 5418 if (!m)
2bef10ab 5419 return -ENOMEM;
2bef10ab
PL
5420
5421 r = m;
5422 }
8c7be95e
LP
5423 }
5424 }
5425
5426 *l = r;
5427
5428 return 0;
5429}
5430
6ac8fdc9 5431static bool tty_may_match_dev_console(const char *tty) {
7b912648 5432 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5433
1e22b5cd
LP
5434 if (!tty)
5435 return true;
5436
a119ec7c 5437 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5438
5439 /* trivial identity? */
5440 if (streq(tty, "console"))
5441 return true;
5442
7b912648
LP
5443 if (resolve_dev_console(&resolved) < 0)
5444 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5445
5446 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5447 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5448}
5449
6c0ae739
LP
5450static bool exec_context_may_touch_tty(const ExecContext *ec) {
5451 assert(ec);
1e22b5cd 5452
6c0ae739 5453 return ec->tty_reset ||
1e22b5cd
LP
5454 ec->tty_vhangup ||
5455 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5456 is_terminal_input(ec->std_input) ||
5457 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5458 is_terminal_output(ec->std_error);
5459}
5460
5461bool exec_context_may_touch_console(const ExecContext *ec) {
5462
5463 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5464 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5465}
5466
15ae422b
LP
5467static void strv_fprintf(FILE *f, char **l) {
5468 char **g;
5469
5470 assert(f);
5471
5472 STRV_FOREACH(g, l)
5473 fprintf(f, " %s", *g);
5474}
5475
ddc155b2
TM
/* Writes a "<prefix><name>: item item ..." line for the string list. Nothing is written if the list
 * is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5487
34cf6c43 5488void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5291f26d 5489 char **e, **d;
add00535 5490 int r;
9eba9da4 5491
5cb5a6ff
LP
5492 assert(c);
5493 assert(f);
5494
4ad49000 5495 prefix = strempty(prefix);
5cb5a6ff
LP
5496
5497 fprintf(f,
94f04347
LP
5498 "%sUMask: %04o\n"
5499 "%sWorkingDirectory: %s\n"
451a074f 5500 "%sRootDirectory: %s\n"
15ae422b 5501 "%sNonBlocking: %s\n"
64747e2d 5502 "%sPrivateTmp: %s\n"
7f112f50 5503 "%sPrivateDevices: %s\n"
59eeb84b 5504 "%sProtectKernelTunables: %s\n"
e66a2f65 5505 "%sProtectKernelModules: %s\n"
84703040 5506 "%sProtectKernelLogs: %s\n"
fc64760d 5507 "%sProtectClock: %s\n"
59eeb84b 5508 "%sProtectControlGroups: %s\n"
d251207d
LP
5509 "%sPrivateNetwork: %s\n"
5510 "%sPrivateUsers: %s\n"
1b8689f9
LP
5511 "%sProtectHome: %s\n"
5512 "%sProtectSystem: %s\n"
5d997827 5513 "%sMountAPIVFS: %s\n"
f3e43635 5514 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5515 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5516 "%sRestrictRealtime: %s\n"
f69567cb 5517 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5518 "%sKeyringMode: %s\n"
4e399953
LP
5519 "%sProtectHostname: %s\n"
5520 "%sProtectProc: %s\n"
5521 "%sProcSubset: %s\n",
5cb5a6ff 5522 prefix, c->umask,
14eb3285
LP
5523 prefix, empty_to_root(c->working_directory),
5524 prefix, empty_to_root(c->root_directory),
15ae422b 5525 prefix, yes_no(c->non_blocking),
64747e2d 5526 prefix, yes_no(c->private_tmp),
7f112f50 5527 prefix, yes_no(c->private_devices),
59eeb84b 5528 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5529 prefix, yes_no(c->protect_kernel_modules),
84703040 5530 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5531 prefix, yes_no(c->protect_clock),
59eeb84b 5532 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5533 prefix, yes_no(c->private_network),
5534 prefix, yes_no(c->private_users),
1b8689f9
LP
5535 prefix, protect_home_to_string(c->protect_home),
5536 prefix, protect_system_to_string(c->protect_system),
5e98086d 5537 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5538 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5539 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5540 prefix, yes_no(c->restrict_realtime),
f69567cb 5541 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5542 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5543 prefix, yes_no(c->protect_hostname),
5544 prefix, protect_proc_to_string(c->protect_proc),
5545 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5546
915e6d16
LP
5547 if (c->root_image)
5548 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5549
18d73705
LB
5550 if (c->root_image_options) {
5551 MountOptions *o;
5552
5553 fprintf(f, "%sRootImageOptions:", prefix);
5554 LIST_FOREACH(mount_options, o, c->root_image_options)
5555 if (!isempty(o->options))
9ece6444
LB
5556 fprintf(f, " %s:%s",
5557 partition_designator_to_string(o->partition_designator),
5558 o->options);
18d73705
LB
5559 fprintf(f, "\n");
5560 }
5561
0389f4fa
LB
5562 if (c->root_hash) {
5563 _cleanup_free_ char *encoded = NULL;
5564 encoded = hexmem(c->root_hash, c->root_hash_size);
5565 if (encoded)
5566 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5567 }
5568
5569 if (c->root_hash_path)
5570 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5571
d4d55b0d
LB
5572 if (c->root_hash_sig) {
5573 _cleanup_free_ char *encoded = NULL;
5574 ssize_t len;
5575 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5576 if (len)
5577 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5578 }
5579
5580 if (c->root_hash_sig_path)
5581 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5582
0389f4fa
LB
5583 if (c->root_verity)
5584 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5585
8c7be95e
LP
5586 STRV_FOREACH(e, c->environment)
5587 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5588
5589 STRV_FOREACH(e, c->environment_files)
5590 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5591
b4c14404
FB
5592 STRV_FOREACH(e, c->pass_environment)
5593 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5594
00819cc1
LP
5595 STRV_FOREACH(e, c->unset_environment)
5596 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5597
53f47dfc
YW
5598 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5599
5b10116e 5600 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5601 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5602
211a3d87
LB
5603 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5604 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5605
5606 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5607 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5608 }
3536f49e 5609 }
c2bbd90b 5610
5291f26d 5611 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5612
fb33a393 5613 if (c->nice_set)
5291f26d 5614 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5615
dd6c17b1 5616 if (c->oom_score_adjust_set)
5291f26d 5617 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5618
ad21e542 5619 if (c->coredump_filter_set)
5291f26d 5620 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5621
5b10116e 5622 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5623 if (c->rlimit[i]) {
4c3a2b84 5624 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5625 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5626 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5627 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5628 }
94f04347 5629
f8b69d1d 5630 if (c->ioprio_set) {
1756a011 5631 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5632
5bead76e 5633 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5634 if (r >= 0)
5635 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5636
5bead76e 5637 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5638 }
94f04347 5639
f8b69d1d 5640 if (c->cpu_sched_set) {
1756a011 5641 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5642
837df140
YW
5643 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5644 if (r >= 0)
5645 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5646
94f04347 5647 fprintf(f,
38b48754
LP
5648 "%sCPUSchedulingPriority: %i\n"
5649 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5650 prefix, c->cpu_sched_priority,
5651 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5652 }
94f04347 5653
0985c7c4 5654 if (c->cpu_set.set) {
e7fca352
MS
5655 _cleanup_free_ char *affinity = NULL;
5656
5657 affinity = cpu_set_to_range_string(&c->cpu_set);
5658 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5659 }
5660
b070c7c0
MS
5661 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5662 _cleanup_free_ char *nodes = NULL;
5663
5664 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5665 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5666 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5667 }
5668
3a43da28 5669 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5670 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5671
5672 fprintf(f,
80876c20
LP
5673 "%sStandardInput: %s\n"
5674 "%sStandardOutput: %s\n"
5675 "%sStandardError: %s\n",
5676 prefix, exec_input_to_string(c->std_input),
5677 prefix, exec_output_to_string(c->std_output),
5678 prefix, exec_output_to_string(c->std_error));
5679
befc4a80
LP
5680 if (c->std_input == EXEC_INPUT_NAMED_FD)
5681 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5682 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5683 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5684 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5685 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5686
5687 if (c->std_input == EXEC_INPUT_FILE)
5688 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5689 if (c->std_output == EXEC_OUTPUT_FILE)
5690 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5691 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5692 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5693 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5694 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5695 if (c->std_error == EXEC_OUTPUT_FILE)
5696 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5697 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5698 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5699 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5700 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5701
80876c20
LP
5702 if (c->tty_path)
5703 fprintf(f,
6ea832a2
LP
5704 "%sTTYPath: %s\n"
5705 "%sTTYReset: %s\n"
5706 "%sTTYVHangup: %s\n"
5707 "%sTTYVTDisallocate: %s\n",
5708 prefix, c->tty_path,
5709 prefix, yes_no(c->tty_reset),
5710 prefix, yes_no(c->tty_vhangup),
5711 prefix, yes_no(c->tty_vt_disallocate));
94f04347 5712
9f6444eb 5713 if (IN_SET(c->std_output,
9f6444eb
LP
5714 EXEC_OUTPUT_KMSG,
5715 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5716 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5717 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5718 IN_SET(c->std_error,
9f6444eb
LP
5719 EXEC_OUTPUT_KMSG,
5720 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5721 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5722 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5723
5ce70e5b 5724 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5725
837df140
YW
5726 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5727 if (r >= 0)
5728 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5729
837df140
YW
5730 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5731 if (r >= 0)
5732 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5733 }
94f04347 5734
d3070fbd
LP
5735 if (c->log_level_max >= 0) {
5736 _cleanup_free_ char *t = NULL;
5737
5738 (void) log_level_to_string_alloc(c->log_level_max, &t);
5739
5740 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5741 }
5742
5291f26d 5743 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5744 fprintf(f,
5745 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5746 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5747
5ac1530e
ZJS
5748 if (c->log_ratelimit_burst > 0)
5749 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5750
5b10116e
ZJS
5751 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5752 fprintf(f, "%sLogExtraFields: ", prefix);
5753 fwrite(c->log_extra_fields[j].iov_base,
5754 1, c->log_extra_fields[j].iov_len,
5755 f);
5756 fputc('\n', f);
d3070fbd
LP
5757 }
5758
91dd5f7c
LP
5759 if (c->log_namespace)
5760 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5761
07d46372
YW
5762 if (c->secure_bits) {
5763 _cleanup_free_ char *str = NULL;
5764
5765 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5766 if (r >= 0)
5767 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5768 }
94f04347 5769
a103496c 5770 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5771 _cleanup_free_ char *str = NULL;
94f04347 5772
dd1f5bd0
YW
5773 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5774 if (r >= 0)
5775 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5776 }
5777
5778 if (c->capability_ambient_set != 0) {
dd1f5bd0 5779 _cleanup_free_ char *str = NULL;
755d4b67 5780
dd1f5bd0
YW
5781 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5782 if (r >= 0)
5783 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5784 }
5785
5786 if (c->user)
f2d3769a 5787 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5788 if (c->group)
f2d3769a 5789 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5790
29206d46
LP
5791 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5792
ddc155b2 5793 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5794
5b6319dc 5795 if (c->pam_name)
f2d3769a 5796 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5797
ddc155b2
TM
5798 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5799 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5800 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5801 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5802 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 5803 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 5804
5b10116e
ZJS
5805 for (size_t i = 0; i < c->n_bind_mounts; i++)
5806 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5807 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5808 c->bind_mounts[i].ignore_enoent ? "-": "",
5809 c->bind_mounts[i].source,
5810 c->bind_mounts[i].destination,
5811 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5812
5b10116e
ZJS
5813 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5814 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5815
5b10116e
ZJS
5816 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5817 t->path,
5818 isempty(t->options) ? "" : ":",
5819 strempty(t->options));
5820 }
2abd4e38 5821
169c1bda
LP
5822 if (c->utmp_id)
5823 fprintf(f,
5824 "%sUtmpIdentifier: %s\n",
5825 prefix, c->utmp_id);
7b52a628
MS
5826
5827 if (c->selinux_context)
5828 fprintf(f,
5f8640fb
LP
5829 "%sSELinuxContext: %s%s\n",
5830 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5831
80c21aea
WC
5832 if (c->apparmor_profile)
5833 fprintf(f,
5834 "%sAppArmorProfile: %s%s\n",
5835 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5836
5837 if (c->smack_process_label)
5838 fprintf(f,
5839 "%sSmackProcessLabel: %s%s\n",
5840 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5841
050f7277 5842 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5843 fprintf(f,
5844 "%sPersonality: %s\n",
5845 prefix, strna(personality_to_string(c->personality)));
5846
78e864e5
TM
5847 fprintf(f,
5848 "%sLockPersonality: %s\n",
5849 prefix, yes_no(c->lock_personality));
5850
17df7223 5851 if (c->syscall_filter) {
349cc4a5 5852#if HAVE_SECCOMP
8cfa775f 5853 void *id, *val;
17df7223 5854 bool first = true;
351a19b1 5855#endif
17df7223
LP
5856
5857 fprintf(f,
57183d11 5858 "%sSystemCallFilter: ",
17df7223
LP
5859 prefix);
5860
6b000af4 5861 if (!c->syscall_allow_list)
17df7223
LP
5862 fputc('~', f);
5863
349cc4a5 5864#if HAVE_SECCOMP
90e74a66 5865 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5866 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5867 const char *errno_name = NULL;
5868 int num = PTR_TO_INT(val);
17df7223
LP
5869
5870 if (first)
5871 first = false;
5872 else
5873 fputc(' ', f);
5874
57183d11 5875 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5876 fputs(strna(name), f);
8cfa775f
YW
5877
5878 if (num >= 0) {
005bfaf1 5879 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5880 if (errno_name)
5881 fprintf(f, ":%s", errno_name);
5882 else
5883 fprintf(f, ":%d", num);
5884 }
17df7223 5885 }
351a19b1 5886#endif
17df7223
LP
5887
5888 fputc('\n', f);
5889 }
5890
57183d11 5891 if (c->syscall_archs) {
349cc4a5 5892#if HAVE_SECCOMP
57183d11
LP
5893 void *id;
5894#endif
5895
5896 fprintf(f,
5897 "%sSystemCallArchitectures:",
5898 prefix);
5899
349cc4a5 5900#if HAVE_SECCOMP
90e74a66 5901 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5902 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5903#endif
5904 fputc('\n', f);
5905 }
5906
add00535
LP
5907 if (exec_context_restrict_namespaces_set(c)) {
5908 _cleanup_free_ char *s = NULL;
5909
86c2a9f1 5910 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5911 if (r >= 0)
5912 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5913 prefix, strna(s));
add00535
LP
5914 }
5915
b1994387
ILG
5916#if HAVE_LIBBPF
5917 if (exec_context_restrict_filesystems_set(c))
5918 SET_FOREACH(e, c->restrict_filesystems)
5919 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
5920#endif
5921
a8d08f39
LP
5922 if (c->network_namespace_path)
5923 fprintf(f,
5924 "%sNetworkNamespacePath: %s\n",
5925 prefix, c->network_namespace_path);
5926
3df90f24 5927 if (c->syscall_errno > 0) {
005bfaf1 5928#if HAVE_SECCOMP
3df90f24 5929 const char *errno_name;
005bfaf1 5930#endif
3df90f24
YW
5931
5932 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5933
005bfaf1
TM
5934#if HAVE_SECCOMP
5935 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5936 if (errno_name)
005bfaf1 5937 fputs(errno_name, f);
3df90f24 5938 else
005bfaf1
TM
5939 fprintf(f, "%d", c->syscall_errno);
5940#endif
5941 fputc('\n', f);
3df90f24 5942 }
b3d13314 5943
5b10116e 5944 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5945 MountOptions *o;
5946
79e20ceb 5947 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5948 c->mount_images[i].ignore_enoent ? "-": "",
5949 c->mount_images[i].source,
79e20ceb 5950 c->mount_images[i].destination);
427353f6 5951 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5952 fprintf(f, ":%s:%s",
427353f6 5953 partition_designator_to_string(o->partition_designator),
79e20ceb 5954 strempty(o->options));
427353f6
LB
5955 fprintf(f, "\n");
5956 }
93f59701
LB
5957
5958 for (size_t i = 0; i < c->n_extension_images; i++) {
5959 MountOptions *o;
5960
5961 fprintf(f, "%sExtensionImages: %s%s", prefix,
5962 c->extension_images[i].ignore_enoent ? "-": "",
5963 c->extension_images[i].source);
5964 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5965 fprintf(f, ":%s:%s",
5966 partition_designator_to_string(o->partition_designator),
5967 strempty(o->options));
5968 fprintf(f, "\n");
5969 }
5cb5a6ff
LP
5970}
5971
34cf6c43 5972bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
5973 assert(c);
5974
61233823 5975 /* Returns true if the process forked off would run under
a931ad47
LP
5976 * an unchanged UID or as root. */
5977
5978 if (!c->user)
5979 return true;
5980
5981 if (streq(c->user, "root") || streq(c->user, "0"))
5982 return true;
5983
5984 return false;
5985}
5986
34cf6c43 5987int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
5988 int p;
5989
5990 assert(c);
5991
5992 if (c->ioprio_set)
5993 return c->ioprio;
5994
5995 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5996 if (p < 0)
5bead76e 5997 return ioprio_prio_value(IOPRIO_CLASS_BE, 4);
7f452159
LP
5998
5999 return p;
6000}
6001
5e98086d
ZJS
6002bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6003 assert(c);
6004
61198784 6005 /* Explicit setting wins */
5e98086d
ZJS
6006 if (c->mount_apivfs_set)
6007 return c->mount_apivfs;
6008
61198784 6009 /* Default to "yes" if root directory or image are specified */
74e12520 6010 if (exec_context_with_rootfs(c))
61198784
ZJS
6011 return true;
6012
5e98086d
ZJS
6013 return false;
6014}
6015
d3070fbd 6016void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6017 assert(c);
6018
5b10116e 6019 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6020 free(c->log_extra_fields[l].iov_base);
6021 c->log_extra_fields = mfree(c->log_extra_fields);
6022 c->n_log_extra_fields = 0;
6023}
6024
6f765baf 6025void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
6026 _cleanup_close_ int fd = -1;
6027 const char *path;
6028 struct stat st;
6f765baf
LP
6029 int r;
6030
6031 assert(c);
6032
6033 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6034 exec_context_tty_reset(c, NULL);
6035
6036 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6037 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6038 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6039 if (!exec_context_may_touch_tty(c))
6040 return;
6f765baf 6041
0ba976e8
LP
6042 path = exec_context_tty_path(c);
6043 if (!path)
6044 return;
6f765baf 6045
0ba976e8
LP
6046 fd = open(path, O_PATH|O_CLOEXEC);
6047 if (fd < 0)
6048 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6049 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6050 path);
6051
6052 if (fstat(fd, &st) < 0)
6053 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6054
6055 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6056 * if things are a character device, since a proper check either means we'd have to open the TTY and
6057 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6058 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6059 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6060 if (!S_ISCHR(st.st_mode))
6061 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6062
6063 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6064 if (r < 0)
6065 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6066}
6067
4c2f5842
LP
6068int exec_context_get_clean_directories(
6069 ExecContext *c,
6070 char **prefix,
6071 ExecCleanMask mask,
6072 char ***ret) {
6073
6074 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6075 int r;
6076
6077 assert(c);
6078 assert(prefix);
6079 assert(ret);
6080
5b10116e 6081 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6082 if (!FLAGS_SET(mask, 1U << t))
6083 continue;
6084
6085 if (!prefix[t])
6086 continue;
6087
211a3d87 6088 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6089 char *j;
6090
211a3d87 6091 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6092 if (!j)
6093 return -ENOMEM;
6094
6095 r = strv_consume(&l, j);
6096 if (r < 0)
6097 return r;
7f622a19
YW
6098
6099 /* Also remove private directories unconditionally. */
6100 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6101 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6102 if (!j)
6103 return -ENOMEM;
6104
6105 r = strv_consume(&l, j);
6106 if (r < 0)
6107 return r;
6108 }
6109
6110 char **symlink;
6111 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6112 j = path_join(prefix[t], *symlink);
7f622a19
YW
6113 if (!j)
6114 return -ENOMEM;
6115
6116 r = strv_consume(&l, j);
6117 if (r < 0)
6118 return r;
6119 }
4c2f5842
LP
6120 }
6121 }
6122
6123 *ret = TAKE_PTR(l);
6124 return 0;
6125}
6126
6127int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6128 ExecCleanMask mask = 0;
6129
6130 assert(c);
6131 assert(ret);
6132
6133 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6134 if (c->directories[t].n_items > 0)
4c2f5842
LP
6135 mask |= 1U << t;
6136
6137 *ret = mask;
6138 return 0;
6139}
6140
b58b4116 6141void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6142 assert(s);
5cb5a6ff 6143
2ed26ed0
LP
6144 *s = (ExecStatus) {
6145 .pid = pid,
6146 };
6147
b58b4116
LP
6148 dual_timestamp_get(&s->start_timestamp);
6149}
6150
34cf6c43 6151void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6152 assert(s);
6153
d46b79bb 6154 if (s->pid != pid)
2ed26ed0
LP
6155 *s = (ExecStatus) {
6156 .pid = pid,
6157 };
b58b4116 6158
63983207 6159 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6160
034c6ed7
LP
6161 s->code = code;
6162 s->status = status;
169c1bda 6163
6f765baf
LP
6164 if (context && context->utmp_id)
6165 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6166}
6167
6a1d4d9f
LP
6168void exec_status_reset(ExecStatus *s) {
6169 assert(s);
6170
6171 *s = (ExecStatus) {};
6172}
6173
34cf6c43 6174void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6175 assert(s);
6176 assert(f);
6177
9fb86720
LP
6178 if (s->pid <= 0)
6179 return;
6180
4c940960
LP
6181 prefix = strempty(prefix);
6182
9fb86720 6183 fprintf(f,
ccd06097
ZJS
6184 "%sPID: "PID_FMT"\n",
6185 prefix, s->pid);
9fb86720 6186
af9d16e1 6187 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6188 fprintf(f,
6189 "%sStart Timestamp: %s\n",
04f5c018 6190 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6191
af9d16e1 6192 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6193 fprintf(f,
6194 "%sExit Timestamp: %s\n"
6195 "%sExit Code: %s\n"
6196 "%sExit Status: %i\n",
04f5c018 6197 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6198 prefix, sigchld_code_to_string(s->code),
6199 prefix, s->status);
5cb5a6ff 6200}
44d8db9e 6201
34cf6c43 6202static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6203 _cleanup_free_ char *cmd = NULL;
4c940960 6204 const char *prefix2;
44d8db9e
LP
6205
6206 assert(c);
6207 assert(f);
6208
4c940960 6209 prefix = strempty(prefix);
63c372cb 6210 prefix2 = strjoina(prefix, "\t");
44d8db9e 6211
8a62620e 6212 cmd = quote_command_line(c->argv);
44d8db9e
LP
6213 fprintf(f,
6214 "%sCommand Line: %s\n",
4bbccb02 6215 prefix, cmd ? cmd : strerror_safe(ENOMEM));
44d8db9e 6216
9fb86720 6217 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6218}
6219
6220void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6221 assert(f);
6222
4c940960 6223 prefix = strempty(prefix);
44d8db9e
LP
6224
6225 LIST_FOREACH(command, c, c)
6226 exec_command_dump(c, f, prefix);
6227}
94f04347 6228
a6a80b4f
LP
6229void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6230 ExecCommand *end;
6231
6232 assert(l);
6233 assert(e);
6234
6235 if (*l) {
35b8ca3a 6236 /* It's kind of important, that we keep the order here */
71fda00f
LP
6237 LIST_FIND_TAIL(command, *l, end);
6238 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6239 } else
6240 *l = e;
6241}
6242
26fd040d
LP
6243int exec_command_set(ExecCommand *c, const char *path, ...) {
6244 va_list ap;
6245 char **l, *p;
6246
6247 assert(c);
6248 assert(path);
6249
6250 va_start(ap, path);
6251 l = strv_new_ap(path, ap);
6252 va_end(ap);
6253
6254 if (!l)
6255 return -ENOMEM;
6256
250a918d
LP
6257 p = strdup(path);
6258 if (!p) {
26fd040d
LP
6259 strv_free(l);
6260 return -ENOMEM;
6261 }
6262
6897dfe8 6263 free_and_replace(c->path, p);
26fd040d 6264
130d3d22 6265 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6266}
6267
86b23b07 6268int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6269 _cleanup_strv_free_ char **l = NULL;
86b23b07 6270 va_list ap;
86b23b07
JS
6271 int r;
6272
6273 assert(c);
6274 assert(path);
6275
6276 va_start(ap, path);
6277 l = strv_new_ap(path, ap);
6278 va_end(ap);
6279
6280 if (!l)
6281 return -ENOMEM;
6282
e287086b 6283 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6284 if (r < 0)
86b23b07 6285 return r;
86b23b07
JS
6286
6287 return 0;
6288}
6289
e8a565cb
YW
6290static void *remove_tmpdir_thread(void *p) {
6291 _cleanup_free_ char *path = p;
86b23b07 6292
e8a565cb
YW
6293 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6294 return NULL;
6295}
6296
6297static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6298 int r;
6299
6300 if (!rt)
6301 return NULL;
6302
6303 if (rt->manager)
6304 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6305
6306 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6307
6308 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6309 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6310
6311 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6312 if (r < 0)
e8a565cb 6313 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6314 else
6315 rt->tmp_dir = NULL;
e8a565cb 6316 }
613b411c 6317
56a13a49 6318 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6319 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6320
6321 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6322 if (r < 0)
e8a565cb 6323 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6324 else
6325 rt->var_tmp_dir = NULL;
e8a565cb
YW
6326 }
6327
6328 rt->id = mfree(rt->id);
6329 rt->tmp_dir = mfree(rt->tmp_dir);
6330 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6331 safe_close_pair(rt->netns_storage_socket);
a70581ff 6332 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6333 return mfree(rt);
6334}
6335
6336static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6337 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6338}
6339
56a13a49
ZJS
6340static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6341 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6342 ExecRuntime *n;
613b411c 6343
8e8009dc 6344 assert(ret);
613b411c 6345
56a13a49
ZJS
6346 id_copy = strdup(id);
6347 if (!id_copy)
6348 return -ENOMEM;
6349
8e8009dc
LP
6350 n = new(ExecRuntime, 1);
6351 if (!n)
613b411c
LP
6352 return -ENOMEM;
6353
8e8009dc 6354 *n = (ExecRuntime) {
56a13a49 6355 .id = TAKE_PTR(id_copy),
8e8009dc 6356 .netns_storage_socket = { -1, -1 },
a70581ff 6357 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6358 };
6359
6360 *ret = n;
613b411c
LP
6361 return 0;
6362}
6363
e8a565cb
YW
6364static int exec_runtime_add(
6365 Manager *m,
6366 const char *id,
56a13a49
ZJS
6367 char **tmp_dir,
6368 char **var_tmp_dir,
6369 int netns_storage_socket[2],
a70581ff 6370 int ipcns_storage_socket[2],
e8a565cb
YW
6371 ExecRuntime **ret) {
6372
6373 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6374 int r;
6375
e8a565cb 6376 assert(m);
613b411c
LP
6377 assert(id);
6378
a70581ff 6379 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6380
56a13a49 6381 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6382 if (r < 0)
6383 return r;
6384
63083706 6385 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6386 if (r < 0)
6387 return r;
e8a565cb 6388
56a13a49
ZJS
6389 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6390 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6391 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6392
6393 if (netns_storage_socket) {
56a13a49
ZJS
6394 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6395 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6396 }
6397
a70581ff
XR
6398 if (ipcns_storage_socket) {
6399 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6400 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6401 }
6402
e8a565cb
YW
6403 rt->manager = m;
6404
6405 if (ret)
6406 *ret = rt;
e8a565cb 6407 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6408 TAKE_PTR(rt);
e8a565cb
YW
6409 return 0;
6410}
6411
74aaf59b
LP
6412static int exec_runtime_make(
6413 Manager *m,
6414 const ExecContext *c,
6415 const char *id,
6416 ExecRuntime **ret) {
6417
56a13a49 6418 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6419 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6420 int r;
6421
6422 assert(m);
6423 assert(c);
6424 assert(id);
6425
6426 /* It is not necessary to create ExecRuntime object. */
a70581ff 6427 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6428 *ret = NULL;
e8a565cb 6429 return 0;
74aaf59b 6430 }
e8a565cb 6431
efa2f3a1
TM
6432 if (c->private_tmp &&
6433 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6434 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6435 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6436 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6437 if (r < 0)
6438 return r;
6439 }
6440
a8d08f39 6441 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6442 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6443 return -errno;
6444 }
6445
a70581ff
XR
6446 if (c->private_ipc || c->ipc_namespace_path) {
6447 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6448 return -errno;
6449 }
6450
6451 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6452 if (r < 0)
6453 return r;
6454
613b411c
LP
6455 return 1;
6456}
6457
e8a565cb
YW
6458int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6459 ExecRuntime *rt;
6460 int r;
613b411c 6461
e8a565cb
YW
6462 assert(m);
6463 assert(id);
6464 assert(ret);
6465
6466 rt = hashmap_get(m->exec_runtime_by_id, id);
6467 if (rt)
387f6955 6468 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6469 goto ref;
6470
74aaf59b
LP
6471 if (!create) {
6472 *ret = NULL;
e8a565cb 6473 return 0;
74aaf59b 6474 }
e8a565cb
YW
6475
6476 /* If not found, then create a new object. */
6477 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6478 if (r < 0)
e8a565cb 6479 return r;
74aaf59b
LP
6480 if (r == 0) {
6481 /* When r == 0, it is not necessary to create ExecRuntime object. */
6482 *ret = NULL;
6483 return 0;
6484 }
613b411c 6485
e8a565cb
YW
6486ref:
6487 /* increment reference counter. */
6488 rt->n_ref++;
6489 *ret = rt;
6490 return 1;
6491}
613b411c 6492
e8a565cb
YW
6493ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6494 if (!rt)
613b411c
LP
6495 return NULL;
6496
e8a565cb 6497 assert(rt->n_ref > 0);
613b411c 6498
e8a565cb
YW
6499 rt->n_ref--;
6500 if (rt->n_ref > 0)
f2341e0a
LP
6501 return NULL;
6502
e8a565cb 6503 return exec_runtime_free(rt, destroy);
613b411c
LP
6504}
6505
e8a565cb
YW
6506int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6507 ExecRuntime *rt;
e8a565cb
YW
6508
6509 assert(m);
613b411c
LP
6510 assert(f);
6511 assert(fds);
6512
90e74a66 6513 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6514 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6515
e8a565cb
YW
6516 if (rt->tmp_dir)
6517 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6518
e8a565cb
YW
6519 if (rt->var_tmp_dir)
6520 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6521
e8a565cb
YW
6522 if (rt->netns_storage_socket[0] >= 0) {
6523 int copy;
613b411c 6524
e8a565cb
YW
6525 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6526 if (copy < 0)
6527 return copy;
613b411c 6528
e8a565cb
YW
6529 fprintf(f, " netns-socket-0=%i", copy);
6530 }
613b411c 6531
e8a565cb
YW
6532 if (rt->netns_storage_socket[1] >= 0) {
6533 int copy;
613b411c 6534
e8a565cb
YW
6535 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6536 if (copy < 0)
6537 return copy;
613b411c 6538
e8a565cb
YW
6539 fprintf(f, " netns-socket-1=%i", copy);
6540 }
6541
a70581ff
XR
6542 if (rt->ipcns_storage_socket[0] >= 0) {
6543 int copy;
6544
6545 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6546 if (copy < 0)
6547 return copy;
6548
6549 fprintf(f, " ipcns-socket-0=%i", copy);
6550 }
6551
6552 if (rt->ipcns_storage_socket[1] >= 0) {
6553 int copy;
6554
6555 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6556 if (copy < 0)
6557 return copy;
6558
6559 fprintf(f, " ipcns-socket-1=%i", copy);
6560 }
6561
e8a565cb 6562 fputc('\n', f);
613b411c
LP
6563 }
6564
6565 return 0;
6566}
6567
e8a565cb
YW
6568int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6569 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6570 ExecRuntime *rt;
613b411c
LP
6571 int r;
6572
e8a565cb
YW
6573 /* This is for the migration from old (v237 or earlier) deserialization text.
6574 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6575 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6576 * so or not from the serialized text, then we always creates a new object owned by this. */
6577
6578 assert(u);
613b411c
LP
6579 assert(key);
6580 assert(value);
6581
e8a565cb
YW
6582 /* Manager manages ExecRuntime objects by the unit id.
6583 * So, we omit the serialized text when the unit does not have id (yet?)... */
6584 if (isempty(u->id)) {
6585 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6586 return 0;
6587 }
613b411c 6588
cbc165d1
ZJS
6589 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6590 return log_oom();
e8a565cb
YW
6591
6592 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6593 if (!rt) {
cbc165d1 6594 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6595 return log_oom();
613b411c 6596
e8a565cb
YW
6597 rt = rt_create;
6598 }
6599
6600 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6601 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6602 return -ENOMEM;
613b411c
LP
6603
6604 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6605 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6606 return -ENOMEM;
613b411c
LP
6607
6608 } else if (streq(key, "netns-socket-0")) {
6609 int fd;
6610
e8a565cb 6611 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6612 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6613 return 0;
613b411c 6614 }
e8a565cb
YW
6615
6616 safe_close(rt->netns_storage_socket[0]);
6617 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6618
613b411c
LP
6619 } else if (streq(key, "netns-socket-1")) {
6620 int fd;
6621
e8a565cb 6622 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6623 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6624 return 0;
613b411c 6625 }
e8a565cb
YW
6626
6627 safe_close(rt->netns_storage_socket[1]);
6628 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6629
613b411c
LP
6630 } else
6631 return 0;
6632
e8a565cb
YW
6633 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6634 if (rt_create) {
6635 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6636 if (r < 0) {
3fe91079 6637 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6638 return 0;
6639 }
613b411c 6640
e8a565cb 6641 rt_create->manager = u->manager;
613b411c 6642
e8a565cb 6643 /* Avoid cleanup */
56a13a49 6644 TAKE_PTR(rt_create);
e8a565cb 6645 }
98b47d54 6646
e8a565cb
YW
6647 return 1;
6648}
613b411c 6649
56a13a49
ZJS
6650int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6651 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6652 char *id = NULL;
a70581ff 6653 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6654 const char *p, *v = value;
6655 size_t n;
613b411c 6656
e8a565cb
YW
6657 assert(m);
6658 assert(value);
6659 assert(fds);
98b47d54 6660
e8a565cb 6661 n = strcspn(v, " ");
2f82562b 6662 id = strndupa_safe(v, n);
e8a565cb
YW
6663 if (v[n] != ' ')
6664 goto finalize;
6665 p = v + n + 1;
6666
6667 v = startswith(p, "tmp-dir=");
6668 if (v) {
6669 n = strcspn(v, " ");
56a13a49
ZJS
6670 tmp_dir = strndup(v, n);
6671 if (!tmp_dir)
6672 return log_oom();
e8a565cb
YW
6673 if (v[n] != ' ')
6674 goto finalize;
6675 p = v + n + 1;
6676 }
6677
6678 v = startswith(p, "var-tmp-dir=");
6679 if (v) {
6680 n = strcspn(v, " ");
56a13a49
ZJS
6681 var_tmp_dir = strndup(v, n);
6682 if (!var_tmp_dir)
6683 return log_oom();
e8a565cb
YW
6684 if (v[n] != ' ')
6685 goto finalize;
6686 p = v + n + 1;
6687 }
6688
6689 v = startswith(p, "netns-socket-0=");
6690 if (v) {
6691 char *buf;
6692
6693 n = strcspn(v, " ");
2f82562b 6694 buf = strndupa_safe(v, n);
c413bb28 6695
a70581ff 6696 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6697 if (r < 0)
6698 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6699 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6700 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6701 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6702 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6703 if (v[n] != ' ')
6704 goto finalize;
6705 p = v + n + 1;
613b411c
LP
6706 }
6707
e8a565cb
YW
6708 v = startswith(p, "netns-socket-1=");
6709 if (v) {
6710 char *buf;
98b47d54 6711
e8a565cb 6712 n = strcspn(v, " ");
2f82562b 6713 buf = strndupa_safe(v, n);
a70581ff
XR
6714
6715 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6716 if (r < 0)
6717 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6718 if (!fdset_contains(fds, netns_fdpair[1]))
6719 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6720 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6721 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6722 if (v[n] != ' ')
6723 goto finalize;
6724 p = v + n + 1;
6725 }
6726
6727 v = startswith(p, "ipcns-socket-0=");
6728 if (v) {
6729 char *buf;
6730
6731 n = strcspn(v, " ");
2f82562b 6732 buf = strndupa_safe(v, n);
a70581ff
XR
6733
6734 r = safe_atoi(buf, &ipcns_fdpair[0]);
6735 if (r < 0)
6736 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6737 if (!fdset_contains(fds, ipcns_fdpair[0]))
6738 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6739 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6740 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6741 if (v[n] != ' ')
6742 goto finalize;
6743 p = v + n + 1;
6744 }
6745
6746 v = startswith(p, "ipcns-socket-1=");
6747 if (v) {
6748 char *buf;
6749
6750 n = strcspn(v, " ");
2f82562b 6751 buf = strndupa_safe(v, n);
a70581ff
XR
6752
6753 r = safe_atoi(buf, &ipcns_fdpair[1]);
6754 if (r < 0)
6755 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6756 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6757 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6758 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6759 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6760 }
98b47d54 6761
e8a565cb 6762finalize:
a70581ff 6763 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6764 if (r < 0)
56a13a49
ZJS
6765 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6766 return 0;
e8a565cb 6767}
613b411c 6768
e8a565cb
YW
6769void exec_runtime_vacuum(Manager *m) {
6770 ExecRuntime *rt;
e8a565cb
YW
6771
6772 assert(m);
6773
6774 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6775
90e74a66 6776 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6777 if (rt->n_ref > 0)
6778 continue;
6779
6780 (void) exec_runtime_free(rt, false);
6781 }
613b411c
LP
6782}
6783
b9c04eaf
YW
6784void exec_params_clear(ExecParameters *p) {
6785 if (!p)
6786 return;
6787
c3f8a065
LP
6788 p->environment = strv_free(p->environment);
6789 p->fd_names = strv_free(p->fd_names);
6790 p->fds = mfree(p->fds);
6791 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6792}
6793
bb0c0d6f
LP
6794ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6795 if (!sc)
6796 return NULL;
6797
6798 free(sc->id);
6799 free(sc->data);
6800 return mfree(sc);
6801}
6802
43144be4
LP
6803ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6804 if (!lc)
6805 return NULL;
6806
6807 free(lc->id);
6808 free(lc->path);
6809 return mfree(lc);
6810}
6811
211a3d87
LB
6812void exec_directory_done(ExecDirectory *d) {
6813 if (!d)
6814 return;
6815
6816 for (size_t i = 0; i < d->n_items; i++) {
6817 free(d->items[i].path);
6818 strv_free(d->items[i].symlinks);
6819 }
6820
6821 d->items = mfree(d->items);
6822 d->n_items = 0;
6823 d->mode = 0755;
6824}
6825
6826int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
6827 _cleanup_strv_free_ char **s = NULL;
6828 _cleanup_free_ char *p = NULL;
6829
6830 assert(d);
6831 assert(n);
6832 assert(path);
6833
6834 p = strdup(path);
6835 if (!p)
6836 return -ENOMEM;
6837
6838 if (symlinks) {
6839 s = strv_copy(symlinks);
6840 if (!s)
6841 return -ENOMEM;
6842 }
6843
6844 if (!GREEDY_REALLOC(*d, *n + 1))
6845 return -ENOMEM;
6846
6847 (*d)[(*n) ++] = (ExecDirectoryItem) {
6848 .path = TAKE_PTR(p),
6849 .symlinks = TAKE_PTR(s),
6850 };
6851
6852 return 0;
6853}
6854
bb0c0d6f 6855DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 6856DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 6857
80876c20
LP
6858static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6859 [EXEC_INPUT_NULL] = "null",
6860 [EXEC_INPUT_TTY] = "tty",
6861 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6862 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6863 [EXEC_INPUT_SOCKET] = "socket",
6864 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6865 [EXEC_INPUT_DATA] = "data",
2038c3f5 6866 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6867};
6868
8a0867d6
LP
6869DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6870
94f04347 6871static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6872 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6873 [EXEC_OUTPUT_NULL] = "null",
80876c20 6874 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6875 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6876 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6877 [EXEC_OUTPUT_JOURNAL] = "journal",
6878 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6879 [EXEC_OUTPUT_SOCKET] = "socket",
6880 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6881 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6882 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6883 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6884};
6885
6886DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6887
6888static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6889 [EXEC_UTMP_INIT] = "init",
6890 [EXEC_UTMP_LOGIN] = "login",
6891 [EXEC_UTMP_USER] = "user",
6892};
6893
6894DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6895
6896static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6897 [EXEC_PRESERVE_NO] = "no",
6898 [EXEC_PRESERVE_YES] = "yes",
6899 [EXEC_PRESERVE_RESTART] = "restart",
6900};
6901
6902DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6903
6b7b2ed9 6904/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6905static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6906 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6907 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6908 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6909 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6910 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6911};
6912
6913DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6914
211a3d87
LB
6915/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
6916static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6917 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
6918 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
6919 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
6920 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
6921 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
6922};
6923
6924DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
6925
6b7b2ed9
LP
6926/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6927 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6928 * directories, specifically .timer units with their timestamp touch file. */
6929static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6930 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6931 [EXEC_DIRECTORY_STATE] = "state",
6932 [EXEC_DIRECTORY_CACHE] = "cache",
6933 [EXEC_DIRECTORY_LOGS] = "logs",
6934 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6935};
6936
6937DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6938
6939/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6940 * the service payload in. */
fb2042dd
YW
6941static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6942 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6943 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6944 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6945 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6946 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6947};
6948
6949DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6950
b1edf445
LP
6951static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6952 [EXEC_KEYRING_INHERIT] = "inherit",
6953 [EXEC_KEYRING_PRIVATE] = "private",
6954 [EXEC_KEYRING_SHARED] = "shared",
6955};
6956
6957DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);