]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #22044 from keszybz/minor-man-page-adjustments
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
b1994387 44#include "bpf-lsm.h"
8dd4c05b 45#include "cap-list.h"
430f0182 46#include "capability-util.h"
fdb3deca 47#include "cgroup-setup.h"
f4351959 48#include "chase-symlinks.h"
bb0c0d6f 49#include "chown-recursive.h"
da681e1b 50#include "cpu-set-util.h"
43144be4 51#include "creds-util.h"
6a818c3c 52#include "data-fd-util.h"
f6a6225e 53#include "def.h"
686d13b9 54#include "env-file.h"
4d1a6904 55#include "env-util.h"
17df7223 56#include "errno-list.h"
8a62620e 57#include "escape.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
bb0c0d6f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
032b3afb 66#include "ioprio-util.h"
a1164ae3 67#include "label.h"
8dd4c05b
LP
68#include "log.h"
69#include "macro.h"
e8a565cb 70#include "manager.h"
2a341bb9 71#include "manager-dump.h"
0a970718 72#include "memory-util.h"
f5947a5e 73#include "missing_fs.h"
5bead76e 74#include "missing_ioprio.h"
35cd0ba5 75#include "mkdir-label.h"
21935150 76#include "mount-util.h"
bb0c0d6f 77#include "mountpoint-util.h"
8dd4c05b 78#include "namespace.h"
6bedfcbb 79#include "parse-util.h"
8dd4c05b 80#include "path-util.h"
0b452006 81#include "process-util.h"
d3dcf4e3 82#include "random-util.h"
78f22b97 83#include "rlimit-util.h"
8dd4c05b 84#include "rm-rf.h"
349cc4a5 85#if HAVE_SECCOMP
3ffd4af2
LP
86#include "seccomp-util.h"
87#endif
07d46372 88#include "securebits-util.h"
8dd4c05b 89#include "selinux-util.h"
24882e06 90#include "signal-util.h"
8dd4c05b 91#include "smack-util.h"
57b7a260 92#include "socket-util.h"
fd63e712 93#include "special.h"
949befd3 94#include "stat-util.h"
8b43440b 95#include "string-table.h"
07630cea 96#include "string-util.h"
8dd4c05b 97#include "strv.h"
7ccbd1ae 98#include "syslog-util.h"
8dd4c05b 99#include "terminal-util.h"
bb0c0d6f 100#include "tmpfile-util.h"
566b7d23 101#include "umask-util.h"
2d3b784d 102#include "unit-serialize.h"
b1d4f8e1 103#include "user-util.h"
8dd4c05b 104#include "utmp-wtmp.h"
5cb5a6ff 105
e056b01d 106#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 107#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 108
531dca78
LP
109#define SNDBUF_SIZE (8*1024*1024)
110
/* Relocates the passed file descriptors so that fds[i] ends up being fd number i+3. The entries of the
 * fds[] array are rewritten in place. Returns 0 on success, or a negative errno if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* We did not get the number we asked for, i.e. the slot is still taken. Remember
                         * the first index where that happened, so that the next pass begins there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                /* A pass without any misplaced fd means we are done */
                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
150
/* Adjusts the file descriptor flags of the passed fds: the first n_socket_fds entries get O_NONBLOCK
 * set or cleared according to 'nonblock', and FD_CLOEXEC is dropped from all n_socket_fds + n_storage_fds
 * entries. Returns 0 on success, or the negative error of the first helper call that fails. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                int r;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always turned off, as these fds are meant to be passed on to our
                 * children */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
183
1e22b5cd 184static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
185 assert(context);
186
1e22b5cd
LP
187 if (context->stdio_as_fds)
188 return NULL;
189
80876c20
LP
190 if (context->tty_path)
191 return context->tty_path;
192
193 return "/dev/console";
194}
195
1e22b5cd
LP
196static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
197 const char *path;
198
6ea832a2
LP
199 assert(context);
200
1e22b5cd 201 path = exec_context_tty_path(context);
6ea832a2 202
1e22b5cd
LP
203 if (context->tty_vhangup) {
204 if (p && p->stdin_fd >= 0)
205 (void) terminal_vhangup_fd(p->stdin_fd);
206 else if (path)
207 (void) terminal_vhangup(path);
208 }
6ea832a2 209
1e22b5cd
LP
210 if (context->tty_reset) {
211 if (p && p->stdin_fd >= 0)
212 (void) reset_terminal_fd(p->stdin_fd, true);
213 else if (path)
214 (void) reset_terminal(path);
215 }
216
51462135
DDM
217 if (p && p->stdin_fd >= 0)
218 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
219
1e22b5cd
LP
220 if (context->tty_vt_disallocate && path)
221 (void) vt_disallocate(path);
6ea832a2
LP
222}
223
6af760f3
LP
224static bool is_terminal_input(ExecInput i) {
225 return IN_SET(i,
226 EXEC_INPUT_TTY,
227 EXEC_INPUT_TTY_FORCE,
228 EXEC_INPUT_TTY_FAIL);
229}
230
3a1286b6 231static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
232 return IN_SET(o,
233 EXEC_OUTPUT_TTY,
6af760f3
LP
234 EXEC_OUTPUT_KMSG_AND_CONSOLE,
235 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
236}
237
aac8c0c3
LP
238static bool is_kmsg_output(ExecOutput o) {
239 return IN_SET(o,
240 EXEC_OUTPUT_KMSG,
241 EXEC_OUTPUT_KMSG_AND_CONSOLE);
242}
243
6af760f3
LP
244static bool exec_context_needs_term(const ExecContext *c) {
245 assert(c);
246
247 /* Return true if the execution context suggests we should set $TERM to something useful. */
248
249 if (is_terminal_input(c->std_input))
250 return true;
251
252 if (is_terminal_output(c->std_output))
253 return true;
254
255 if (is_terminal_output(c->std_error))
256 return true;
257
258 return !!c->tty_path;
3a1286b6
MS
259}
260
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the resulting fd to 'nfd'.
 * Returns -errno if the open() fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
272
91dd5f7c
LP
273static int connect_journal_socket(
274 int fd,
275 const char *log_namespace,
276 uid_t uid,
277 gid_t gid) {
278
f36a9d59
ZJS
279 union sockaddr_union sa;
280 socklen_t sa_len;
524daa8c
ZJS
281 uid_t olduid = UID_INVALID;
282 gid_t oldgid = GID_INVALID;
91dd5f7c 283 const char *j;
524daa8c
ZJS
284 int r;
285
91dd5f7c
LP
286 j = log_namespace ?
287 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
288 "/run/systemd/journal/stdout";
289 r = sockaddr_un_set_path(&sa.un, j);
290 if (r < 0)
291 return r;
f36a9d59 292 sa_len = r;
91dd5f7c 293
cad93f29 294 if (gid_is_valid(gid)) {
524daa8c
ZJS
295 oldgid = getgid();
296
92a17af9 297 if (setegid(gid) < 0)
524daa8c
ZJS
298 return -errno;
299 }
300
cad93f29 301 if (uid_is_valid(uid)) {
524daa8c
ZJS
302 olduid = getuid();
303
92a17af9 304 if (seteuid(uid) < 0) {
524daa8c
ZJS
305 r = -errno;
306 goto restore_gid;
307 }
308 }
309
7c248223 310 r = RET_NERRNO(connect(fd, &sa.sa, sa_len));
524daa8c
ZJS
311
312 /* If we fail to restore the uid or gid, things will likely
313 fail later on. This should only happen if an LSM interferes. */
314
cad93f29 315 if (uid_is_valid(uid))
524daa8c
ZJS
316 (void) seteuid(olduid);
317
318 restore_gid:
cad93f29 319 if (gid_is_valid(gid))
524daa8c
ZJS
320 (void) setegid(oldgid);
321
322 return r;
323}
324
fd1f9c89 325static int connect_logger_as(
34cf6c43 326 const Unit *unit,
fd1f9c89 327 const ExecContext *context,
af635cf3 328 const ExecParameters *params,
fd1f9c89
LP
329 ExecOutput output,
330 const char *ident,
fd1f9c89
LP
331 int nfd,
332 uid_t uid,
333 gid_t gid) {
334
2ac1ff68
EV
335 _cleanup_close_ int fd = -1;
336 int r;
071830ff
LP
337
338 assert(context);
af635cf3 339 assert(params);
80876c20
LP
340 assert(output < _EXEC_OUTPUT_MAX);
341 assert(ident);
342 assert(nfd >= 0);
071830ff 343
54fe0cdb
LP
344 fd = socket(AF_UNIX, SOCK_STREAM, 0);
345 if (fd < 0)
80876c20 346 return -errno;
071830ff 347
91dd5f7c 348 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
349 if (r < 0)
350 return r;
071830ff 351
2ac1ff68 352 if (shutdown(fd, SHUT_RD) < 0)
80876c20 353 return -errno;
071830ff 354
fd1f9c89 355 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 356
2ac1ff68 357 if (dprintf(fd,
62bca2c6 358 "%s\n"
80876c20
LP
359 "%s\n"
360 "%i\n"
54fe0cdb
LP
361 "%i\n"
362 "%i\n"
363 "%i\n"
4f4a1dbf 364 "%i\n",
c867611e 365 context->syslog_identifier ?: ident,
af635cf3 366 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
367 context->syslog_priority,
368 !!context->syslog_level_prefix,
f3dc6af2 369 false,
aac8c0c3 370 is_kmsg_output(output),
2ac1ff68
EV
371 is_terminal_output(output)) < 0)
372 return -errno;
80876c20 373
2ac1ff68 374 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 375}
2ac1ff68 376
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves the resulting fd to
 * 'nfd'. Errors of open_terminal() are passed through unmodified. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 389
2038c3f5 390static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
391 union sockaddr_union sa;
392 socklen_t sa_len;
15a3e96f 393 _cleanup_close_ int fd = -1;
86fca584 394 int r;
071830ff 395
80876c20 396 assert(path);
071830ff 397
2038c3f5
LP
398 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
399 flags |= O_CREAT;
400
401 fd = open(path, flags|O_NOCTTY, mode);
402 if (fd >= 0)
15a3e96f 403 return TAKE_FD(fd);
071830ff 404
2038c3f5
LP
405 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
406 return -errno;
2038c3f5
LP
407
408 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
409
86fca584
ZJS
410 r = sockaddr_un_set_path(&sa.un, path);
411 if (r < 0)
412 return r == -EINVAL ? -ENXIO : r;
413 sa_len = r;
414
2038c3f5
LP
415 fd = socket(AF_UNIX, SOCK_STREAM, 0);
416 if (fd < 0)
417 return -errno;
418
86fca584 419 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 420 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 421 * indication that this wasn't an AF_UNIX socket after all */
071830ff 422
2038c3f5
LP
423 if ((flags & O_ACCMODE) == O_RDONLY)
424 r = shutdown(fd, SHUT_WR);
425 else if ((flags & O_ACCMODE) == O_WRONLY)
426 r = shutdown(fd, SHUT_RD);
427 else
86fca584 428 r = 0;
15a3e96f 429 if (r < 0)
2038c3f5 430 return -errno;
2038c3f5 431
15a3e96f 432 return TAKE_FD(fd);
80876c20 433}
071830ff 434
08f3be7a
LP
435static int fixup_input(
436 const ExecContext *context,
437 int socket_fd,
438 bool apply_tty_stdin) {
439
440 ExecInput std_input;
441
442 assert(context);
443
444 std_input = context->std_input;
1e3ad081
LP
445
446 if (is_terminal_input(std_input) && !apply_tty_stdin)
447 return EXEC_INPUT_NULL;
071830ff 448
03fd9c49 449 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
450 return EXEC_INPUT_NULL;
451
08f3be7a
LP
452 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
453 return EXEC_INPUT_NULL;
454
03fd9c49 455 return std_input;
4f2d528d
LP
456}
457
7966a916 458static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 459
7966a916 460 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
461 return EXEC_OUTPUT_INHERIT;
462
7966a916 463 return output;
4f2d528d
LP
464}
465
a34ceba6
LP
466static int setup_input(
467 const ExecContext *context,
468 const ExecParameters *params,
52c239d7 469 int socket_fd,
2caa38e9 470 const int named_iofds[static 3]) {
a34ceba6 471
4f2d528d 472 ExecInput i;
51462135 473 int r;
4f2d528d
LP
474
475 assert(context);
a34ceba6 476 assert(params);
2caa38e9 477 assert(named_iofds);
a34ceba6
LP
478
479 if (params->stdin_fd >= 0) {
480 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
481 return -errno;
482
483 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
484 if (isatty(STDIN_FILENO)) {
485 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
486 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 487 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 488 }
a34ceba6
LP
489
490 return STDIN_FILENO;
491 }
4f2d528d 492
08f3be7a 493 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
494
495 switch (i) {
071830ff 496
80876c20
LP
497 case EXEC_INPUT_NULL:
498 return open_null_as(O_RDONLY, STDIN_FILENO);
499
500 case EXEC_INPUT_TTY:
501 case EXEC_INPUT_TTY_FORCE:
502 case EXEC_INPUT_TTY_FAIL: {
046a82c1 503 int fd;
071830ff 504
1e22b5cd 505 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
506 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
507 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
508 ACQUIRE_TERMINAL_WAIT,
3a43da28 509 USEC_INFINITY);
970edce6 510 if (fd < 0)
80876c20
LP
511 return fd;
512
51462135
DDM
513 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
514 if (r < 0)
515 return r;
516
046a82c1 517 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
518 }
519
4f2d528d 520 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
521 assert(socket_fd >= 0);
522
7c248223 523 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 524
52c239d7 525 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
526 assert(named_iofds[STDIN_FILENO] >= 0);
527
52c239d7 528 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 529 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 530
08f3be7a
LP
531 case EXEC_INPUT_DATA: {
532 int fd;
533
534 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
535 if (fd < 0)
536 return fd;
537
538 return move_fd(fd, STDIN_FILENO, false);
539 }
540
2038c3f5
LP
541 case EXEC_INPUT_FILE: {
542 bool rw;
543 int fd;
544
545 assert(context->stdio_file[STDIN_FILENO]);
546
547 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
548 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
549
550 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
551 if (fd < 0)
552 return fd;
553
554 return move_fd(fd, STDIN_FILENO, false);
555 }
556
80876c20 557 default:
04499a70 558 assert_not_reached();
80876c20
LP
559 }
560}
561
41fc585a
LP
562static bool can_inherit_stderr_from_stdout(
563 const ExecContext *context,
564 ExecOutput o,
565 ExecOutput e) {
566
567 assert(context);
568
569 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
570 * stderr fd */
571
572 if (e == EXEC_OUTPUT_INHERIT)
573 return true;
574 if (e != o)
575 return false;
576
577 if (e == EXEC_OUTPUT_NAMED_FD)
578 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
579
8d7dab1f 580 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
581 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
582
583 return true;
584}
585
a34ceba6 586static int setup_output(
34cf6c43 587 const Unit *unit,
a34ceba6
LP
588 const ExecContext *context,
589 const ExecParameters *params,
590 int fileno,
591 int socket_fd,
2caa38e9 592 const int named_iofds[static 3],
a34ceba6 593 const char *ident,
7bce046b
LP
594 uid_t uid,
595 gid_t gid,
596 dev_t *journal_stream_dev,
597 ino_t *journal_stream_ino) {
a34ceba6 598
4f2d528d
LP
599 ExecOutput o;
600 ExecInput i;
47c1d80d 601 int r;
4f2d528d 602
f2341e0a 603 assert(unit);
80876c20 604 assert(context);
a34ceba6 605 assert(params);
80876c20 606 assert(ident);
7bce046b
LP
607 assert(journal_stream_dev);
608 assert(journal_stream_ino);
80876c20 609
a34ceba6
LP
610 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
611
612 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
613 return -errno;
614
615 return STDOUT_FILENO;
616 }
617
618 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
619 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
620 return -errno;
621
622 return STDERR_FILENO;
623 }
624
08f3be7a 625 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 626 o = fixup_output(context->std_output, socket_fd);
4f2d528d 627
eb17e935
MS
628 if (fileno == STDERR_FILENO) {
629 ExecOutput e;
630 e = fixup_output(context->std_error, socket_fd);
80876c20 631
eb17e935
MS
632 /* This expects the input and output are already set up */
633
634 /* Don't change the stderr file descriptor if we inherit all
635 * the way and are not on a tty */
636 if (e == EXEC_OUTPUT_INHERIT &&
637 o == EXEC_OUTPUT_INHERIT &&
638 i == EXEC_INPUT_NULL &&
639 !is_terminal_input(context->std_input) &&
7966a916 640 getppid() != 1)
eb17e935
MS
641 return fileno;
642
643 /* Duplicate from stdout if possible */
41fc585a 644 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 645 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 646
eb17e935 647 o = e;
80876c20 648
eb17e935 649 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
650 /* If input got downgraded, inherit the original value */
651 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 652 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 653
08f3be7a
LP
654 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
655 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 656 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 657
acb591e4
LP
658 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
659 if (getppid() != 1)
eb17e935 660 return fileno;
94f04347 661
eb17e935
MS
662 /* We need to open /dev/null here anew, to get the right access mode. */
663 return open_null_as(O_WRONLY, fileno);
071830ff 664 }
94f04347 665
eb17e935 666 switch (o) {
80876c20
LP
667
668 case EXEC_OUTPUT_NULL:
eb17e935 669 return open_null_as(O_WRONLY, fileno);
80876c20
LP
670
671 case EXEC_OUTPUT_TTY:
4f2d528d 672 if (is_terminal_input(i))
7c248223 673 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
674
675 /* We don't reset the terminal if this is just about output */
1e22b5cd 676 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 677
9a6bca7a 678 case EXEC_OUTPUT_KMSG:
28dbc1e8 679 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
680 case EXEC_OUTPUT_JOURNAL:
681 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 682 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 683 if (r < 0) {
7966a916
ZJS
684 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
685 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 686 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
687 } else {
688 struct stat st;
689
690 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
691 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
692 * services to detect whether they are connected to the journal or not.
693 *
694 * If both stdout and stderr are connected to a stream then let's make sure to store the data
695 * about STDERR as that's usually the best way to do logging. */
7bce046b 696
ab2116b1
LP
697 if (fstat(fileno, &st) >= 0 &&
698 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
699 *journal_stream_dev = st.st_dev;
700 *journal_stream_ino = st.st_ino;
701 }
47c1d80d
MS
702 }
703 return r;
4f2d528d
LP
704
705 case EXEC_OUTPUT_SOCKET:
706 assert(socket_fd >= 0);
e75a9ed1 707
7c248223 708 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 709
52c239d7 710 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
711 assert(named_iofds[fileno] >= 0);
712
52c239d7 713 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 714 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 715
566b7d23 716 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
717 case EXEC_OUTPUT_FILE_APPEND:
718 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 719 bool rw;
566b7d23 720 int fd, flags;
2038c3f5
LP
721
722 assert(context->stdio_file[fileno]);
723
724 rw = context->std_input == EXEC_INPUT_FILE &&
725 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
726
727 if (rw)
7c248223 728 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 729
566b7d23
ZD
730 flags = O_WRONLY;
731 if (o == EXEC_OUTPUT_FILE_APPEND)
732 flags |= O_APPEND;
8d7dab1f
LW
733 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
734 flags |= O_TRUNC;
566b7d23
ZD
735
736 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
737 if (fd < 0)
738 return fd;
739
566b7d23 740 return move_fd(fd, fileno, 0);
2038c3f5
LP
741 }
742
94f04347 743 default:
04499a70 744 assert_not_reached();
94f04347 745 }
071830ff
LP
746}
747
02a51aba 748static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 749 int r;
02a51aba
LP
750
751 assert(fd >= 0);
02a51aba 752
1ff74fb6 753 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
754 if (isatty(fd) < 1) {
755 if (IN_SET(errno, EINVAL, ENOTTY))
756 return 0; /* not a tty */
1ff74fb6 757
02a51aba 758 return -errno;
4b3b5bc7 759 }
02a51aba 760
4b3b5bc7 761 /* This might fail. What matters are the results. */
f2df231f 762 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
763 if (r < 0)
764 return r;
02a51aba 765
4b3b5bc7 766 return 1;
02a51aba
LP
767}
768
aedec452 769static int setup_confirm_stdio(
51462135 770 const ExecContext *context,
aedec452
LP
771 const char *vc,
772 int *ret_saved_stdin,
773 int *ret_saved_stdout) {
774
3d18b167
LP
775 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
776 int r;
80876c20 777
aedec452
LP
778 assert(ret_saved_stdin);
779 assert(ret_saved_stdout);
80876c20 780
af6da548
LP
781 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
782 if (saved_stdin < 0)
783 return -errno;
80876c20 784
af6da548 785 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
786 if (saved_stdout < 0)
787 return -errno;
80876c20 788
8854d795 789 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
790 if (fd < 0)
791 return fd;
80876c20 792
af6da548
LP
793 r = chown_terminal(fd, getuid());
794 if (r < 0)
3d18b167 795 return r;
02a51aba 796
3d18b167
LP
797 r = reset_terminal_fd(fd, true);
798 if (r < 0)
799 return r;
80876c20 800
51462135
DDM
801 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
802 if (r < 0)
803 return r;
804
aedec452
LP
805 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
806 TAKE_FD(fd);
2b33ab09
LP
807 if (r < 0)
808 return r;
80876c20 809
aedec452
LP
810 *ret_saved_stdin = TAKE_FD(saved_stdin);
811 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 812 return 0;
80876c20
LP
813}
814
63d77c92 815static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
816 assert(err < 0);
817
818 if (err == -ETIMEDOUT)
63d77c92 819 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
820 else {
821 errno = -err;
63d77c92 822 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
823 }
824}
825
63d77c92 826static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 827 _cleanup_close_ int fd = -1;
80876c20 828
3b20f877 829 assert(vc);
80876c20 830
7d5ceb64 831 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 832 if (fd < 0)
3b20f877 833 return;
80876c20 834
63d77c92 835 write_confirm_error_fd(err, fd, u);
af6da548 836}
80876c20 837
/* Releases the terminal and puts the saved stdin/stdout fds back in place (where they are valid). The
 * saved fds are closed and invalidated in any case. Returns 0 on success, or the negative errno of the
 * last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
859
3b20f877
FB
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
865
51462135 866static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 867 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 868 _cleanup_free_ char *e = NULL;
3b20f877 869 char c;
af6da548 870
3b20f877 871 /* For any internal errors, assume a positive response. */
51462135 872 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 873 if (r < 0) {
63d77c92 874 write_confirm_error(r, vc, u);
3b20f877
FB
875 return CONFIRM_EXECUTE;
876 }
af6da548 877
b0eb2944
FB
878 /* confirm_spawn might have been disabled while we were sleeping. */
879 if (manager_is_confirm_spawn_disabled(u->manager)) {
880 r = 1;
881 goto restore_stdio;
882 }
af6da548 883
2bcd3c26
FB
884 e = ellipsize(cmdline, 60, 100);
885 if (!e) {
886 log_oom();
887 r = CONFIRM_EXECUTE;
888 goto restore_stdio;
889 }
af6da548 890
d172b175 891 for (;;) {
539622bd 892 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 893 if (r < 0) {
63d77c92 894 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
895 r = CONFIRM_EXECUTE;
896 goto restore_stdio;
897 }
af6da548 898
d172b175 899 switch (c) {
b0eb2944
FB
900 case 'c':
901 printf("Resuming normal execution.\n");
902 manager_disable_confirm_spawn();
903 r = 1;
904 break;
dd6f9ac0
FB
905 case 'D':
906 unit_dump(u, stdout, " ");
907 continue; /* ask again */
d172b175
FB
908 case 'f':
909 printf("Failing execution.\n");
910 r = CONFIRM_PRETEND_FAILURE;
911 break;
912 case 'h':
b0eb2944
FB
913 printf(" c - continue, proceed without asking anymore\n"
914 " D - dump, show the state of the unit\n"
dd6f9ac0 915 " f - fail, don't execute the command and pretend it failed\n"
d172b175 916 " h - help\n"
eedf223a 917 " i - info, show a short summary of the unit\n"
56fde33a 918 " j - jobs, show jobs that are in progress\n"
d172b175
FB
919 " s - skip, don't execute the command and pretend it succeeded\n"
920 " y - yes, execute the command\n");
dd6f9ac0 921 continue; /* ask again */
eedf223a
FB
922 case 'i':
923 printf(" Description: %s\n"
924 " Unit: %s\n"
925 " Command: %s\n",
926 u->id, u->description, cmdline);
927 continue; /* ask again */
56fde33a
FB
928 case 'j':
929 manager_dump_jobs(u->manager, stdout, " ");
930 continue; /* ask again */
539622bd
FB
931 case 'n':
932 /* 'n' was removed in favor of 'f'. */
933 printf("Didn't understand 'n', did you mean 'f'?\n");
934 continue; /* ask again */
d172b175
FB
935 case 's':
936 printf("Skipping execution.\n");
937 r = CONFIRM_PRETEND_SUCCESS;
938 break;
939 case 'y':
940 r = CONFIRM_EXECUTE;
941 break;
942 default:
04499a70 943 assert_not_reached();
d172b175 944 }
3b20f877 945 break;
3b20f877 946 }
af6da548 947
3b20f877 948restore_stdio:
af6da548 949 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 950 return r;
80876c20
LP
951}
952
4d885bd3
DH
953static int get_fixed_user(const ExecContext *c, const char **user,
954 uid_t *uid, gid_t *gid,
955 const char **home, const char **shell) {
81a2b7ce 956 int r;
4d885bd3 957 const char *name;
81a2b7ce 958
4d885bd3 959 assert(c);
81a2b7ce 960
23deef88
LP
961 if (!c->user)
962 return 0;
963
4d885bd3
DH
964 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
965 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 966
23deef88 967 name = c->user;
fafff8f1 968 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
969 if (r < 0)
970 return r;
81a2b7ce 971
4d885bd3
DH
972 *user = name;
973 return 0;
974}
975
976static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
977 int r;
978 const char *name;
979
980 assert(c);
981
982 if (!c->group)
983 return 0;
984
985 name = c->group;
fafff8f1 986 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
987 if (r < 0)
988 return r;
989
990 *group = name;
991 return 0;
992}
993
cdc5d5c5
DH
994static int get_supplementary_groups(const ExecContext *c, const char *user,
995 const char *group, gid_t gid,
996 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
997 char **i;
998 int r, k = 0;
999 int ngroups_max;
1000 bool keep_groups = false;
1001 gid_t *groups = NULL;
1002 _cleanup_free_ gid_t *l_gids = NULL;
1003
1004 assert(c);
1005
bbeea271
DH
1006 /*
1007 * If user is given, then lookup GID and supplementary groups list.
1008 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1009 * here and as early as possible so we keep the list of supplementary
1010 * groups of the caller.
bbeea271
DH
1011 */
1012 if (user && gid_is_valid(gid) && gid != 0) {
1013 /* First step, initialize groups from /etc/groups */
1014 if (initgroups(user, gid) < 0)
1015 return -errno;
1016
1017 keep_groups = true;
1018 }
1019
ac6e8be6 1020 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1021 return 0;
1022
366ddd25
DH
1023 /*
1024 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1025 * be positive, otherwise fail.
1026 */
1027 errno = 0;
1028 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1029 if (ngroups_max <= 0)
1030 return errno_or_else(EOPNOTSUPP);
366ddd25 1031
4d885bd3
DH
1032 l_gids = new(gid_t, ngroups_max);
1033 if (!l_gids)
1034 return -ENOMEM;
81a2b7ce 1035
4d885bd3
DH
1036 if (keep_groups) {
1037 /*
1038 * Lookup the list of groups that the user belongs to, we
1039 * avoid NSS lookups here too for gid=0.
1040 */
1041 k = ngroups_max;
1042 if (getgrouplist(user, gid, l_gids, &k) < 0)
1043 return -EINVAL;
1044 } else
1045 k = 0;
81a2b7ce 1046
4d885bd3
DH
1047 STRV_FOREACH(i, c->supplementary_groups) {
1048 const char *g;
81a2b7ce 1049
4d885bd3
DH
1050 if (k >= ngroups_max)
1051 return -E2BIG;
81a2b7ce 1052
4d885bd3 1053 g = *i;
fafff8f1 1054 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1055 if (r < 0)
1056 return r;
81a2b7ce 1057
4d885bd3
DH
1058 k++;
1059 }
81a2b7ce 1060
4d885bd3
DH
1061 /*
1062 * Sets ngids to zero to drop all supplementary groups, happens
1063 * when we are under root and SupplementaryGroups= is empty.
1064 */
1065 if (k == 0) {
1066 *ngids = 0;
1067 return 0;
1068 }
81a2b7ce 1069
4d885bd3
DH
1070 /* Otherwise get the final list of supplementary groups */
1071 groups = memdup(l_gids, sizeof(gid_t) * k);
1072 if (!groups)
1073 return -ENOMEM;
1074
1075 *supplementary_gids = groups;
1076 *ngids = k;
1077
1078 groups = NULL;
1079
1080 return 0;
1081}
1082
/* Applies the group credentials to the calling process: first the supplementary groups (only if
 * ngids is positive), then the real, effective and saved GID (only if gid is valid).
 *
 * Returns 0 on success, the negative return value of maybe_setgroups(), or -errno if setresgid()
 * fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* An empty list means SupplementaryGroups= has nothing for us to install */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Switch real, effective and saved GID in one go */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1101
dbdc4098
TK
/* Updates the securebits of the calling process: every bit set in 'mask' is cleared, then every
 * bit set in 'bits' is turned on.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written in that case),
 * 1 if the securebits were changed, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;
        if (after == before)
                return 0; /* already in the desired state */

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1115
81a2b7ce 1116static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1117 assert(context);
dbdc4098 1118 int r;
81a2b7ce 1119
4d885bd3
DH
1120 if (!uid_is_valid(uid))
1121 return 0;
1122
479050b3 1123 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1124 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1125 * required, so we also need keep-caps in this case.
1126 */
81a2b7ce 1127
dbdc4098 1128 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1129
1130 /* First step: If we need to keep capabilities but
1131 * drop privileges we need to make sure we keep our
cbb21cca 1132 * caps, while we drop privileges. */
693ced48 1133 if (uid != 0) {
dbdc4098
TK
1134 /* Add KEEP_CAPS to the securebits */
1135 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1136 if (r < 0)
1137 return r;
693ced48 1138 }
81a2b7ce
LP
1139 }
1140
479050b3 1141 /* Second step: actually set the uids */
81a2b7ce
LP
1142 if (setresuid(uid, uid, uid) < 0)
1143 return -errno;
1144
1145 /* At this point we should have all necessary capabilities but
1146 are otherwise a normal user. However, the caps might got
1147 corrupted due to the setresuid() so we need clean them up
1148 later. This is done outside of this call. */
1149
1150 return 0;
1151}
1152
349cc4a5 1153#if HAVE_PAM
5b6319dc
LP
1154
1155static int null_conv(
1156 int num_msg,
1157 const struct pam_message **msg,
1158 struct pam_response **resp,
1159 void *appdata_ptr) {
1160
1161 /* We don't support conversations */
1162
1163 return PAM_CONV_ERR;
1164}
1165
cefc33ae
LP
1166#endif
1167
5b6319dc
LP
/* Opens a PAM session for the service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") whose job is to tear the session down again once the calling process goes away.
 *
 * name      PAM service name, passed to pam_start()
 * user      user name, passed to pam_start()
 * uid, gid  credentials the helper process drops to before waiting
 * tty       TTY to report to PAM; if NULL, the TTY behind stdin is used if there is one
 * env       environment block; every entry is exported to PAM, and on success the block is
 *           replaced by the environment list PAM returns
 * fds       file descriptors (n_fds of them) that are closed in the helper process
 *
 * Returns the result of strv_free_and_replace() on success. On failure a negative errno-style
 * value is returned; PAM errors do not map to errno and are reported as -EPERM. When built without
 * PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_GET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child was created, hence undo the signal mask change from above instead of
                 * leaving SIGTERM blocked in the caller. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number,
                                 * it neither returns a negative value nor sets errno. */
                                int k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1391
5d6b1584
LP
/* Renames the calling process after the file name component of 'path', wrapped in parentheses.
 *
 * The result has to fit into 10 characters (the length of "/sbin/init") to look pretty in
 * /bin/ps, hence at most the trailing 8 characters of the file name are used. If the file name
 * is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *base;
        size_t len;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* Keep the tail: it is usually the more interesting part, since the beginning
                 * might just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1422
469830d1
LP
1423static bool context_has_address_families(const ExecContext *c) {
1424 assert(c);
1425
6b000af4 1426 return c->address_families_allow_list ||
469830d1
LP
1427 !set_isempty(c->address_families);
1428}
1429
1430static bool context_has_syscall_filters(const ExecContext *c) {
1431 assert(c);
1432
6b000af4 1433 return c->syscall_allow_list ||
8cfa775f 1434 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1435}
1436
9df2cdd8
TM
1437static bool context_has_syscall_logs(const ExecContext *c) {
1438 assert(c);
1439
1440 return c->syscall_log_allow_list ||
1441 !hashmap_isempty(c->syscall_log);
1442}
1443
469830d1
LP
1444static bool context_has_no_new_privileges(const ExecContext *c) {
1445 assert(c);
1446
1447 if (c->no_new_privileges)
1448 return true;
1449
1450 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1451 return false;
1452
1453 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1454 return c->lock_personality ||
469830d1 1455 c->memory_deny_write_execute ||
0538d2a8 1456 c->private_devices ||
fc64760d 1457 c->protect_clock ||
0538d2a8 1458 c->protect_hostname ||
469830d1
LP
1459 c->protect_kernel_tunables ||
1460 c->protect_kernel_modules ||
84703040 1461 c->protect_kernel_logs ||
0538d2a8
YW
1462 context_has_address_families(c) ||
1463 exec_context_restrict_namespaces_set(c) ||
1464 c->restrict_realtime ||
1465 c->restrict_suid_sgid ||
78e864e5 1466 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1467 context_has_syscall_filters(c) ||
1468 context_has_syscall_logs(c);
469830d1
LP
1469}
1470
bb0c0d6f
LP
1471static bool exec_context_has_credentials(const ExecContext *context) {
1472
1473 assert(context);
1474
1475 return !hashmap_isempty(context->set_credentials) ||
43144be4 1476 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1477}
1478
349cc4a5 1479#if HAVE_SECCOMP
17df7223 1480
83f12b27 1481static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1482
1483 if (is_seccomp_available())
1484 return false;
1485
f673b62d 1486 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1487 return true;
83f12b27
FS
1488}
1489
165a31c0 1490static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1491 uint32_t negative_action, default_action, action;
165a31c0 1492 int r;
8351ceae 1493
469830d1 1494 assert(u);
c0467cf3 1495 assert(c);
8351ceae 1496
469830d1 1497 if (!context_has_syscall_filters(c))
83f12b27
FS
1498 return 0;
1499
469830d1
LP
1500 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1501 return 0;
e9642be2 1502
005bfaf1 1503 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1504
6b000af4 1505 if (c->syscall_allow_list) {
469830d1
LP
1506 default_action = negative_action;
1507 action = SCMP_ACT_ALLOW;
7c66bae2 1508 } else {
469830d1
LP
1509 default_action = SCMP_ACT_ALLOW;
1510 action = negative_action;
57183d11 1511 }
8351ceae 1512
165a31c0 1513 if (needs_ambient_hack) {
6b000af4 1514 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1515 if (r < 0)
1516 return r;
1517 }
1518
b54f36c6 1519 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1520}
1521
9df2cdd8
TM
1522static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1523#ifdef SCMP_ACT_LOG
1524 uint32_t default_action, action;
1525#endif
1526
1527 assert(u);
1528 assert(c);
1529
1530 if (!context_has_syscall_logs(c))
1531 return 0;
1532
1533#ifdef SCMP_ACT_LOG
1534 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1535 return 0;
1536
1537 if (c->syscall_log_allow_list) {
1538 /* Log nothing but the ones listed */
1539 default_action = SCMP_ACT_ALLOW;
1540 action = SCMP_ACT_LOG;
1541 } else {
1542 /* Log everything but the ones listed */
1543 default_action = SCMP_ACT_LOG;
1544 action = SCMP_ACT_ALLOW;
1545 }
1546
1547 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1548#else
1549 /* old libseccomp */
1550 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1551 return 0;
1552#endif
1553}
1554
469830d1
LP
1555static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1556 assert(u);
4298d0b5
LP
1557 assert(c);
1558
469830d1 1559 if (set_isempty(c->syscall_archs))
83f12b27
FS
1560 return 0;
1561
469830d1
LP
1562 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1563 return 0;
4298d0b5 1564
469830d1
LP
1565 return seccomp_restrict_archs(c->syscall_archs);
1566}
4298d0b5 1567
469830d1
LP
1568static int apply_address_families(const Unit* u, const ExecContext *c) {
1569 assert(u);
1570 assert(c);
4298d0b5 1571
469830d1
LP
1572 if (!context_has_address_families(c))
1573 return 0;
4298d0b5 1574
469830d1
LP
1575 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1576 return 0;
4298d0b5 1577
6b000af4 1578 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1579}
4298d0b5 1580
83f12b27 1581static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1582 assert(u);
f3e43635
TM
1583 assert(c);
1584
469830d1 1585 if (!c->memory_deny_write_execute)
83f12b27
FS
1586 return 0;
1587
469830d1
LP
1588 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1589 return 0;
f3e43635 1590
469830d1 1591 return seccomp_memory_deny_write_execute();
f3e43635
TM
1592}
1593
83f12b27 1594static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1595 assert(u);
f4170c67
LP
1596 assert(c);
1597
469830d1 1598 if (!c->restrict_realtime)
83f12b27
FS
1599 return 0;
1600
469830d1
LP
1601 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1602 return 0;
f4170c67 1603
469830d1 1604 return seccomp_restrict_realtime();
f4170c67
LP
1605}
1606
f69567cb
LP
1607static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1608 assert(u);
1609 assert(c);
1610
1611 if (!c->restrict_suid_sgid)
1612 return 0;
1613
1614 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1615 return 0;
1616
1617 return seccomp_restrict_suid_sgid();
1618}
1619
59e856c7 1620static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1621 assert(u);
59eeb84b
LP
1622 assert(c);
1623
1624 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1625 * let's protect even those systems where this is left on in the kernel. */
1626
469830d1 1627 if (!c->protect_kernel_tunables)
59eeb84b
LP
1628 return 0;
1629
469830d1
LP
1630 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1631 return 0;
59eeb84b 1632
469830d1 1633 return seccomp_protect_sysctl();
59eeb84b
LP
1634}
1635
59e856c7 1636static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1637 assert(u);
502d704e
DH
1638 assert(c);
1639
25a8d8a0 1640 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1641
469830d1
LP
1642 if (!c->protect_kernel_modules)
1643 return 0;
1644
502d704e
DH
1645 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1646 return 0;
1647
b54f36c6 1648 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1649}
1650
84703040
KK
1651static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1652 assert(u);
1653 assert(c);
1654
1655 if (!c->protect_kernel_logs)
1656 return 0;
1657
1658 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1659 return 0;
1660
1661 return seccomp_protect_syslog();
1662}
1663
daf8f72b 1664static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1665 assert(u);
1666 assert(c);
1667
1668 if (!c->protect_clock)
1669 return 0;
1670
1671 if (skip_seccomp_unavailable(u, "ProtectClock="))
1672 return 0;
1673
1674 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1675}
1676
59e856c7 1677static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1678 assert(u);
ba128bb8
LP
1679 assert(c);
1680
8f81a5f6 1681 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1682
469830d1
LP
1683 if (!c->private_devices)
1684 return 0;
1685
ba128bb8
LP
1686 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1687 return 0;
1688
b54f36c6 1689 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1690}
1691
34cf6c43 1692static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1693 assert(u);
add00535
LP
1694 assert(c);
1695
1696 if (!exec_context_restrict_namespaces_set(c))
1697 return 0;
1698
1699 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1700 return 0;
1701
1702 return seccomp_restrict_namespaces(c->restrict_namespaces);
1703}
1704
78e864e5 1705static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1706 unsigned long personality;
1707 int r;
78e864e5
TM
1708
1709 assert(u);
1710 assert(c);
1711
1712 if (!c->lock_personality)
1713 return 0;
1714
1715 if (skip_seccomp_unavailable(u, "LockPersonality="))
1716 return 0;
1717
e8132d63
LP
1718 personality = c->personality;
1719
1720 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1721 if (personality == PERSONALITY_INVALID) {
1722
1723 r = opinionated_personality(&personality);
1724 if (r < 0)
1725 return r;
1726 }
78e864e5
TM
1727
1728 return seccomp_lock_personality(personality);
1729}
1730
c0467cf3 1731#endif
8351ceae 1732
7a8288f6
DM
1733#if HAVE_LIBBPF
1734static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
299d9417
JK
1735 assert(u);
1736 assert(u->manager);
1737
7a8288f6
DM
1738 if (lsm_bpf_supported())
1739 return false;
1740
299d9417
JK
1741 /* lsm_bpf_setup succeeded */
1742 if (u->manager->restrict_fs)
1743 return false;
1744
7a8288f6
DM
1745 log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
1746 return true;
1747}
1748
1749static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1750 assert(u);
1751 assert(c);
1752
1753 if (!exec_context_restrict_filesystems_set(c))
1754 return 0;
1755
1756 if (skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
1757 return 0;
1758
1759 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1760}
1761#endif
1762
daf8f72b 1763static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1764 assert(u);
1765 assert(c);
1766
1767 if (!c->protect_hostname)
1768 return 0;
1769
1770 if (ns_type_supported(NAMESPACE_UTS)) {
1771 if (unshare(CLONE_NEWUTS) < 0) {
1772 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1773 *ret_exit_status = EXIT_NAMESPACE;
1774 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1775 }
1776
1777 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1778 }
1779 } else
1780 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1781
1782#if HAVE_SECCOMP
8f3e342f
ZJS
1783 int r;
1784
daf8f72b
LP
1785 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1786 return 0;
1787
1788 r = seccomp_protect_hostname();
1789 if (r < 0) {
1790 *ret_exit_status = EXIT_SECCOMP;
1791 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1792 }
1793#endif
1794
1795 return 0;
1796}
1797
3042bbeb 1798static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1799 assert(idle_pipe);
1800
54eb2300
LP
1801 idle_pipe[1] = safe_close(idle_pipe[1]);
1802 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1803
1804 if (idle_pipe[0] >= 0) {
1805 int r;
1806
1807 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1808
1809 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1810 ssize_t n;
1811
31a7eb86 1812 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1813 n = write(idle_pipe[3], "x", 1);
1814 if (n > 0)
cd972d69 1815 /* Wait for systemd to react to the signal above. */
54756dce 1816 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1817 }
1818
54eb2300 1819 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1820
1821 }
1822
54eb2300 1823 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1824}
1825
fb2042dd
YW
1826static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1827
7cae38c4 1828static int build_environment(
34cf6c43 1829 const Unit *u,
9fa95f85 1830 const ExecContext *c,
1e22b5cd 1831 const ExecParameters *p,
da6053d0 1832 size_t n_fds,
7cae38c4
LP
1833 const char *home,
1834 const char *username,
1835 const char *shell,
7bce046b
LP
1836 dev_t journal_stream_dev,
1837 ino_t journal_stream_ino,
7cae38c4
LP
1838 char ***ret) {
1839
1840 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1841 size_t n_env = 0;
7cae38c4
LP
1842 char *x;
1843
4b58153d 1844 assert(u);
7cae38c4 1845 assert(c);
7c1cb6f1 1846 assert(p);
7cae38c4
LP
1847 assert(ret);
1848
dc4e2940 1849#define N_ENV_VARS 17
8d5bb13d 1850 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1851 if (!our_env)
1852 return -ENOMEM;
1853
1854 if (n_fds > 0) {
8dd4c05b
LP
1855 _cleanup_free_ char *joined = NULL;
1856
df0ff127 1857 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1858 return -ENOMEM;
1859 our_env[n_env++] = x;
1860
da6053d0 1861 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1862 return -ENOMEM;
1863 our_env[n_env++] = x;
8dd4c05b 1864
1e22b5cd 1865 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1866 if (!joined)
1867 return -ENOMEM;
1868
605405c6 1869 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1870 if (!x)
1871 return -ENOMEM;
1872 our_env[n_env++] = x;
7cae38c4
LP
1873 }
1874
b08af3b1 1875 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1876 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1877 return -ENOMEM;
1878 our_env[n_env++] = x;
1879
1e22b5cd 1880 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1881 return -ENOMEM;
1882 our_env[n_env++] = x;
1883 }
1884
fd63e712
LP
1885 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1886 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1887 * check the database directly. */
ac647978 1888 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1889 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1890 if (!x)
1891 return -ENOMEM;
1892 our_env[n_env++] = x;
1893 }
1894
7cae38c4 1895 if (home) {
b910cc72 1896 x = strjoin("HOME=", home);
7cae38c4
LP
1897 if (!x)
1898 return -ENOMEM;
7bbead1d 1899
4ff361cc 1900 path_simplify(x + 5);
7cae38c4
LP
1901 our_env[n_env++] = x;
1902 }
1903
1904 if (username) {
b910cc72 1905 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1906 if (!x)
1907 return -ENOMEM;
1908 our_env[n_env++] = x;
1909
b910cc72 1910 x = strjoin("USER=", username);
7cae38c4
LP
1911 if (!x)
1912 return -ENOMEM;
1913 our_env[n_env++] = x;
1914 }
1915
1916 if (shell) {
b910cc72 1917 x = strjoin("SHELL=", shell);
7cae38c4
LP
1918 if (!x)
1919 return -ENOMEM;
7bbead1d 1920
4ff361cc 1921 path_simplify(x + 6);
7cae38c4
LP
1922 our_env[n_env++] = x;
1923 }
1924
4b58153d
LP
1925 if (!sd_id128_is_null(u->invocation_id)) {
1926 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1927 return -ENOMEM;
1928
1929 our_env[n_env++] = x;
1930 }
1931
6af760f3
LP
1932 if (exec_context_needs_term(c)) {
1933 const char *tty_path, *term = NULL;
1934
1935 tty_path = exec_context_tty_path(c);
1936
e8cf09b2
LP
1937 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1938 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1939 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1940
e8cf09b2 1941 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1942 term = getenv("TERM");
e8cf09b2 1943
6af760f3
LP
1944 if (!term)
1945 term = default_term_for_tty(tty_path);
7cae38c4 1946
b910cc72 1947 x = strjoin("TERM=", term);
7cae38c4
LP
1948 if (!x)
1949 return -ENOMEM;
1950 our_env[n_env++] = x;
1951 }
1952
7bce046b
LP
1953 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1954 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1955 return -ENOMEM;
1956
1957 our_env[n_env++] = x;
1958 }
1959
91dd5f7c
LP
1960 if (c->log_namespace) {
1961 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1962 if (!x)
1963 return -ENOMEM;
1964
1965 our_env[n_env++] = x;
1966 }
1967
5b10116e 1968 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1969 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1970 const char *n;
1971
1972 if (!p->prefix[t])
1973 continue;
1974
211a3d87 1975 if (c->directories[t].n_items == 0)
fb2042dd
YW
1976 continue;
1977
1978 n = exec_directory_env_name_to_string(t);
1979 if (!n)
1980 continue;
1981
211a3d87
LB
1982 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1983 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1984
211a3d87
LB
1985 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1986 if (!prefixed)
1987 return -ENOMEM;
1988
1989 if (!strextend_with_separator(&joined, ":", prefixed))
1990 return -ENOMEM;
1991 }
fb2042dd
YW
1992
1993 x = strjoin(n, "=", joined);
1994 if (!x)
1995 return -ENOMEM;
1996
1997 our_env[n_env++] = x;
1998 }
1999
bb0c0d6f
LP
2000 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2001 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
2002 if (!x)
2003 return -ENOMEM;
2004
2005 our_env[n_env++] = x;
2006 }
2007
dc4e2940
YW
2008 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2009 return -ENOMEM;
2010
2011 our_env[n_env++] = x;
2012
7cae38c4 2013 our_env[n_env++] = NULL;
8d5bb13d
LP
2014 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2015#undef N_ENV_VARS
7cae38c4 2016
ae2a15bc 2017 *ret = TAKE_PTR(our_env);
7cae38c4
LP
2018
2019 return 0;
2020}
2021
b4c14404
FB
2022static int build_pass_environment(const ExecContext *c, char ***ret) {
2023 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2024 size_t n_env = 0;
b4c14404
FB
2025 char **i;
2026
2027 STRV_FOREACH(i, c->pass_environment) {
2028 _cleanup_free_ char *x = NULL;
2029 char *v;
2030
2031 v = getenv(*i);
2032 if (!v)
2033 continue;
605405c6 2034 x = strjoin(*i, "=", v);
b4c14404
FB
2035 if (!x)
2036 return -ENOMEM;
00819cc1 2037
319a4f4b 2038 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2039 return -ENOMEM;
00819cc1 2040
1cc6c93a 2041 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2042 pass_env[n_env] = NULL;
b4c14404
FB
2043 }
2044
ae2a15bc 2045 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2046
2047 return 0;
2048}
2049
5e8deb94 2050bool exec_needs_mount_namespace(
8b44a3d2
LP
2051 const ExecContext *context,
2052 const ExecParameters *params,
4657abb5 2053 const ExecRuntime *runtime) {
8b44a3d2
LP
2054
2055 assert(context);
8b44a3d2 2056
915e6d16
LP
2057 if (context->root_image)
2058 return true;
2059
2a624c36
AP
2060 if (!strv_isempty(context->read_write_paths) ||
2061 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2062 !strv_isempty(context->inaccessible_paths) ||
2063 !strv_isempty(context->exec_paths) ||
2064 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2065 return true;
2066
42b1d8e0 2067 if (context->n_bind_mounts > 0)
d2d6c096
LP
2068 return true;
2069
2abd4e38
YW
2070 if (context->n_temporary_filesystems > 0)
2071 return true;
2072
b3d13314
LB
2073 if (context->n_mount_images > 0)
2074 return true;
2075
93f59701
LB
2076 if (context->n_extension_images > 0)
2077 return true;
2078
37ed15d7 2079 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2080 return true;
2081
2082 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2083 return true;
2084
8b44a3d2 2085 if (context->private_devices ||
228af36f 2086 context->private_mounts ||
8b44a3d2 2087 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2088 context->protect_home != PROTECT_HOME_NO ||
2089 context->protect_kernel_tunables ||
c575770b 2090 context->protect_kernel_modules ||
94a7b275 2091 context->protect_kernel_logs ||
4e399953
LP
2092 context->protect_control_groups ||
2093 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2094 context->proc_subset != PROC_SUBSET_ALL ||
2095 context->private_ipc ||
2096 context->ipc_namespace_path)
8b44a3d2
LP
2097 return true;
2098
37c56f89 2099 if (context->root_directory) {
5e98086d 2100 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2101 return true;
2102
5b10116e 2103 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2104 if (params && !params->prefix[t])
37c56f89
YW
2105 continue;
2106
211a3d87 2107 if (context->directories[t].n_items > 0)
37c56f89
YW
2108 return true;
2109 }
2110 }
5d997827 2111
42b1d8e0 2112 if (context->dynamic_user &&
211a3d87
LB
2113 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2114 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2115 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2116 return true;
2117
91dd5f7c
LP
2118 if (context->log_namespace)
2119 return true;
2120
8b44a3d2
LP
2121 return false;
2122}
2123
5749f855 2124static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2125 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2126 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2127 _cleanup_close_ int unshare_ready_fd = -1;
2128 _cleanup_(sigkill_waitp) pid_t pid = 0;
2129 uint64_t c = 1;
d251207d
LP
2130 ssize_t n;
2131 int r;
2132
5749f855
AZ
2133 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2134 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2135 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2136 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2137 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2138 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2139 * continues execution normally.
2140 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2141 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2142
5749f855
AZ
2143 /* Can only set up multiple mappings with CAP_SETUID. */
2144 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2145 r = asprintf(&uid_map,
5749f855 2146 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2147 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2148 ouid, ouid, uid, uid);
2149 else
2150 r = asprintf(&uid_map,
2151 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2152 ouid, ouid);
d251207d 2153
5749f855
AZ
2154 if (r < 0)
2155 return -ENOMEM;
2156
2157 /* Can only set up multiple mappings with CAP_SETGID. */
2158 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2159 r = asprintf(&gid_map,
5749f855 2160 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2161 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2162 ogid, ogid, gid, gid);
2163 else
2164 r = asprintf(&gid_map,
2165 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2166 ogid, ogid);
2167
2168 if (r < 0)
2169 return -ENOMEM;
d251207d
LP
2170
2171 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2172 * namespace. */
2173 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2174 if (unshare_ready_fd < 0)
2175 return -errno;
2176
2177 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2178 * failed. */
2179 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2180 return -errno;
2181
4c253ed1
LP
2182 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2183 if (r < 0)
2184 return r;
2185 if (r == 0) {
d251207d
LP
2186 _cleanup_close_ int fd = -1;
2187 const char *a;
2188 pid_t ppid;
2189
2190 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2191 * here, after the parent opened its own user namespace. */
2192
2193 ppid = getppid();
2194 errno_pipe[0] = safe_close(errno_pipe[0]);
2195
2196 /* Wait until the parent unshared the user namespace */
2197 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2198 r = -errno;
2199 goto child_fail;
2200 }
2201
2202 /* Disable the setgroups() system call in the child user namespace, for good. */
2203 a = procfs_file_alloca(ppid, "setgroups");
2204 fd = open(a, O_WRONLY|O_CLOEXEC);
2205 if (fd < 0) {
2206 if (errno != ENOENT) {
2207 r = -errno;
2208 goto child_fail;
2209 }
2210
2211 /* If the file is missing the kernel is too old, let's continue anyway. */
2212 } else {
2213 if (write(fd, "deny\n", 5) < 0) {
2214 r = -errno;
2215 goto child_fail;
2216 }
2217
2218 fd = safe_close(fd);
2219 }
2220
2221 /* First write the GID map */
2222 a = procfs_file_alloca(ppid, "gid_map");
2223 fd = open(a, O_WRONLY|O_CLOEXEC);
2224 if (fd < 0) {
2225 r = -errno;
2226 goto child_fail;
2227 }
2228 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2229 r = -errno;
2230 goto child_fail;
2231 }
2232 fd = safe_close(fd);
2233
2234 /* The write the UID map */
2235 a = procfs_file_alloca(ppid, "uid_map");
2236 fd = open(a, O_WRONLY|O_CLOEXEC);
2237 if (fd < 0) {
2238 r = -errno;
2239 goto child_fail;
2240 }
2241 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2242 r = -errno;
2243 goto child_fail;
2244 }
2245
2246 _exit(EXIT_SUCCESS);
2247
2248 child_fail:
2249 (void) write(errno_pipe[1], &r, sizeof(r));
2250 _exit(EXIT_FAILURE);
2251 }
2252
2253 errno_pipe[1] = safe_close(errno_pipe[1]);
2254
2255 if (unshare(CLONE_NEWUSER) < 0)
2256 return -errno;
2257
2258 /* Let the child know that the namespace is ready now */
2259 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2260 return -errno;
2261
2262 /* Try to read an error code from the child */
2263 n = read(errno_pipe[0], &r, sizeof(r));
2264 if (n < 0)
2265 return -errno;
2266 if (n == sizeof(r)) { /* an error code was sent to us */
2267 if (r < 0)
2268 return r;
2269 return -EIO;
2270 }
2271 if (n != 0) /* on success we should have read 0 bytes */
2272 return -EIO;
2273
8f03de53 2274 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2275 if (r < 0)
2276 return r;
2e87a1fd 2277 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2278 return -EIO;
2279
2280 return 0;
2281}
2282
494d0247
YW
2283static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2284 if (!context->dynamic_user)
2285 return false;
2286
2287 if (type == EXEC_DIRECTORY_CONFIGURATION)
2288 return false;
2289
2290 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2291 return false;
2292
2293 return true;
2294}
2295
211a3d87
LB
2296static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2297 _cleanup_free_ char *src_abs = NULL;
2298 char **dst;
2299 int r;
2300
2301 assert(source);
2302
2303 src_abs = path_join(root, source);
2304 if (!src_abs)
2305 return -ENOMEM;
2306
2307 STRV_FOREACH(dst, symlinks) {
2308 _cleanup_free_ char *dst_abs = NULL;
2309
2310 dst_abs = path_join(root, *dst);
2311 if (!dst_abs)
2312 return -ENOMEM;
2313
2314 r = mkdir_parents_label(dst_abs, 0755);
2315 if (r < 0)
2316 return r;
2317
2318 r = symlink_idempotent(src_abs, dst_abs, true);
2319 if (r < 0)
2320 return r;
2321 }
2322
2323 return 0;
2324}
2325
3536f49e 2326static int setup_exec_directory(
07689d5d
LP
2327 const ExecContext *context,
2328 const ExecParameters *params,
2329 uid_t uid,
3536f49e 2330 gid_t gid,
3536f49e 2331 ExecDirectoryType type,
211a3d87 2332 bool needs_mount_namespace,
3536f49e 2333 int *exit_status) {
07689d5d 2334
72fd1768 2335 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2336 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2337 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2338 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2339 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2340 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2341 };
07689d5d
LP
2342 int r;
2343
2344 assert(context);
2345 assert(params);
72fd1768 2346 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2347 assert(exit_status);
07689d5d 2348
3536f49e
YW
2349 if (!params->prefix[type])
2350 return 0;
2351
8679efde 2352 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2353 if (!uid_is_valid(uid))
2354 uid = 0;
2355 if (!gid_is_valid(gid))
2356 gid = 0;
2357 }
2358
211a3d87 2359 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2360 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2361
211a3d87 2362 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2363 if (!p) {
2364 r = -ENOMEM;
2365 goto fail;
2366 }
07689d5d 2367
23a7448e
YW
2368 r = mkdir_parents_label(p, 0755);
2369 if (r < 0)
3536f49e 2370 goto fail;
23a7448e 2371
494d0247 2372 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2373 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2374 * case we want to avoid leaving a directory around fully accessible that is owned by
2375 * a dynamic user whose UID is later on reused. To lock this down we use the same
2376 * trick used by container managers to prohibit host users to get access to files of
2377 * the same UID in containers: we place everything inside a directory that has an
2378 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2379 * for unprivileged host code. We then use fs namespacing to make this directory
2380 * permeable for the service itself.
6c47cd7d 2381 *
3f5b1508
LP
2382 * Specifically: for a service which wants a special directory "foo/" we first create
2383 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2384 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2385 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2386 * unprivileged host users can't look into it. Inside of the namespace of the unit
2387 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2388 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2389 * for the service and making sure it only gets access to the dirs it needs but no
2390 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2391 *
3f5b1508
LP
2392 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2393 * to be owned by the service itself.
2394 *
2395 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2396 * for sharing files or sockets with other services. */
6c47cd7d 2397
4ede9802
LP
2398 pp = path_join(params->prefix[type], "private");
2399 if (!pp) {
6c47cd7d
LP
2400 r = -ENOMEM;
2401 goto fail;
2402 }
2403
2404 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2405 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2406 if (r < 0)
2407 goto fail;
2408
211a3d87 2409 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2410 r = -ENOMEM;
2411 goto fail;
2412 }
2413
2414 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2415 r = mkdir_parents_label(pp, 0755);
2416 if (r < 0)
2417 goto fail;
2418
949befd3
LP
2419 if (is_dir(p, false) > 0 &&
2420 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2421
2422 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2423 * it over. Most likely the service has been upgraded from one that didn't use
2424 * DynamicUser=1, to one that does. */
2425
cf52c45d
LP
2426 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2427 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2428 exec_directory_type_to_string(type), p, pp);
2429
949befd3
LP
2430 if (rename(p, pp) < 0) {
2431 r = -errno;
2432 goto fail;
2433 }
2434 } else {
2435 /* Otherwise, create the actual directory for the service */
2436
2437 r = mkdir_label(pp, context->directories[type].mode);
2438 if (r < 0 && r != -EEXIST)
2439 goto fail;
2440 }
6c47cd7d 2441
df61e79a
LB
2442 /* And link it up from the original place. Note that if a mount namespace is going to be
2443 * used, then this symlink remains on the host, and a new one for the child namespace will
2444 * be created later. */
6c9c51e5 2445 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2446 if (r < 0)
2447 goto fail;
2448
6c47cd7d 2449 } else {
5c6d40d1
LP
2450 _cleanup_free_ char *target = NULL;
2451
2452 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2453 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2454 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2455
2456 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2457 * by DynamicUser=1 (see above)?
2458 *
2459 * We do this for all directory types except for ConfigurationDirectory=,
2460 * since they all support the private/ symlink logic at least in some
2461 * configurations, see above. */
5c6d40d1 2462
578dc69f
YW
2463 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2464 if (r < 0)
2465 goto fail;
2466
211a3d87 2467 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2468 if (!q) {
2469 r = -ENOMEM;
2470 goto fail;
2471 }
2472
578dc69f
YW
2473 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2474 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2475 if (r < 0)
2476 goto fail;
2477
2478 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2479
2480 /* Hmm, apparently DynamicUser= was once turned on for this service,
2481 * but is no longer. Let's move the directory back up. */
2482
cf52c45d
LP
2483 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2484 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2485 exec_directory_type_to_string(type), q, p);
2486
5c6d40d1
LP
2487 if (unlink(p) < 0) {
2488 r = -errno;
2489 goto fail;
2490 }
2491
2492 if (rename(q, p) < 0) {
2493 r = -errno;
2494 goto fail;
2495 }
2496 }
2497 }
2498
6c47cd7d 2499 r = mkdir_label(p, context->directories[type].mode);
d484580c 2500 if (r < 0) {
d484580c
LP
2501 if (r != -EEXIST)
2502 goto fail;
2503
206e9864
LP
2504 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2505 struct stat st;
2506
2507 /* Don't change the owner/access mode of the configuration directory,
2508 * as in the common case it is not written to by a service, and shall
2509 * not be writable. */
2510
2511 if (stat(p, &st) < 0) {
2512 r = -errno;
2513 goto fail;
2514 }
2515
2516 /* Still complain if the access mode doesn't match */
2517 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2518 log_warning("%s \'%s\' already exists but the mode is different. "
2519 "(File system: %o %sMode: %o)",
211a3d87 2520 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2521 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2522
6cff72eb 2523 continue;
206e9864 2524 }
6cff72eb 2525 }
a1164ae3 2526 }
07689d5d 2527
206e9864 2528 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2529 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2530 * current UID/GID ownership.) */
2531 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2532 if (r < 0)
2533 goto fail;
c71b2eb7 2534
607b358e
LP
2535 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2536 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2537 * assignments to exist. */
607b358e 2538 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2539 if (r < 0)
3536f49e 2540 goto fail;
07689d5d
LP
2541 }
2542
211a3d87
LB
2543 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2544 * they are set up later, to allow configuring empty var/run/etc. */
2545 if (!needs_mount_namespace)
2546 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2547 r = create_many_symlinks(params->prefix[type],
2548 context->directories[type].items[i].path,
2549 context->directories[type].items[i].symlinks);
2550 if (r < 0)
2551 goto fail;
2552 }
2553
07689d5d 2554 return 0;
3536f49e
YW
2555
2556fail:
2557 *exit_status = exit_status_table[type];
3536f49e 2558 return r;
07689d5d
LP
2559}
2560
bb0c0d6f
LP
2561static int write_credential(
2562 int dfd,
2563 const char *id,
2564 const void *data,
2565 size_t size,
2566 uid_t uid,
2567 bool ownership_ok) {
2568
2569 _cleanup_(unlink_and_freep) char *tmp = NULL;
2570 _cleanup_close_ int fd = -1;
2571 int r;
2572
2573 r = tempfn_random_child("", "cred", &tmp);
2574 if (r < 0)
2575 return r;
2576
2577 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2578 if (fd < 0) {
2579 tmp = mfree(tmp);
2580 return -errno;
2581 }
2582
43144be4 2583 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2584 if (r < 0)
2585 return r;
2586
2587 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2588 return -errno;
2589
2590 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2591 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2592 if (r < 0) {
2593 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2594 return r;
2595
2596 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2597 * to express: that the user gets read access and nothing
2598 * else. But if the backing fs can't support that (e.g. ramfs)
2599 * then we can use file ownership instead. But that's only safe if
2600 * we can then re-mount the whole thing read-only, so that the
2601 * user can no longer chmod() the file to gain write access. */
2602 return r;
2603
f5fbe71d 2604 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2605 return -errno;
2606 }
2607 }
2608
2609 if (renameat(dfd, tmp, dfd, id) < 0)
2610 return -errno;
2611
2612 tmp = mfree(tmp);
2613 return 0;
2614}
2615
bb0c0d6f
LP
2616static int acquire_credentials(
2617 const ExecContext *context,
2618 const ExecParameters *params,
d3dcf4e3 2619 const char *unit,
bb0c0d6f
LP
2620 const char *p,
2621 uid_t uid,
2622 bool ownership_ok) {
2623
43144be4 2624 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2625 _cleanup_close_ int dfd = -1;
43144be4 2626 ExecLoadCredential *lc;
bb0c0d6f 2627 ExecSetCredential *sc;
bb0c0d6f
LP
2628 int r;
2629
2630 assert(context);
2631 assert(p);
2632
2633 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2634 if (dfd < 0)
2635 return -errno;
2636
43144be4
LP
2637 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2638 HASHMAP_FOREACH(lc, context->load_credentials) {
2639 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
bb0c0d6f 2640 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2641 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2642 bool missing_ok = true;
bb0c0d6f
LP
2643 const char *source;
2644 size_t size, add;
2645
43144be4 2646 if (path_is_absolute(lc->path)) {
bb0c0d6f 2647 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
43144be4 2648 source = lc->path;
bb0c0d6f 2649 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2650
2651 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2652 * via the source socket address in case we read off an AF_UNIX socket. */
43144be4 2653 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
d3dcf4e3
LP
2654 return -ENOMEM;
2655
fc682be2
LP
2656 missing_ok = false;
2657
bb0c0d6f
LP
2658 } else if (params->received_credentials) {
2659 /* If this is a relative path, take it relative to the credentials we received
2660 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2661 * on a credential store, i.e. this is guaranteed to be regular files. */
43144be4 2662 j = path_join(params->received_credentials, lc->path);
bb0c0d6f
LP
2663 if (!j)
2664 return -ENOMEM;
2665
2666 source = j;
2667 } else
2668 source = NULL;
2669
2670 if (source)
43144be4
LP
2671 r = read_full_file_full(
2672 AT_FDCWD, source,
2673 UINT64_MAX,
2674 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2675 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2676 bindname,
2677 &data, &size);
bb0c0d6f
LP
2678 else
2679 r = -ENOENT;
43144be4 2680 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
fc682be2
LP
2681 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2682 * will get clear errors if we don't pass such a missing credential on as they
2683 * themselves will get ENOENT when trying to read them, which should not be much
2684 * worse than when we handle the error here and make it fatal.
2685 *
43144be4
LP
2686 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2687 * we are fine, too. */
2688 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
bb0c0d6f 2689 continue;
fc682be2 2690 }
bb0c0d6f 2691 if (r < 0)
43144be4
LP
2692 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2693
2694 if (lc->encrypted) {
2695 _cleanup_free_ void *plaintext = NULL;
2696 size_t plaintext_size = 0;
2697
2698 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2699 if (r < 0)
2700 return r;
bb0c0d6f 2701
43144be4
LP
2702 free_and_replace(data, plaintext);
2703 size = plaintext_size;
2704 }
2705
2706 add = strlen(lc->id) + size;
bb0c0d6f
LP
2707 if (add > left)
2708 return -E2BIG;
2709
43144be4 2710 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
bb0c0d6f
LP
2711 if (r < 0)
2712 return r;
2713
2714 left -= add;
2715 }
2716
43144be4
LP
2717 /* First we use the literally specified credentials. Note that they might be overridden again below,
2718 * and thus act as a "default" if the same credential is specified multiple times */
2719 HASHMAP_FOREACH(sc, context->set_credentials) {
2720 _cleanup_(erase_and_freep) void *plaintext = NULL;
2721 const char *data;
2722 size_t size, add;
2723
2724 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2725 continue;
2726 if (errno != ENOENT)
2727 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2728
2729 if (sc->encrypted) {
2730 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2731 if (r < 0)
2732 return r;
2733
2734 data = plaintext;
2735 } else {
2736 data = sc->data;
2737 size = sc->size;
2738 }
2739
2740 add = strlen(sc->id) + size;
2741 if (add > left)
2742 return -E2BIG;
2743
2744 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2745 if (r < 0)
2746 return r;
2747
2748
2749 left -= add;
2750 }
2751
bb0c0d6f
LP
2752 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2753 return -errno;
2754
2755 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2756 * accessible */
2757
2758 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2759 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2760 if (r < 0) {
2761 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2762 return r;
2763
2764 if (!ownership_ok)
2765 return r;
2766
f5fbe71d 2767 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2768 return -errno;
2769 }
2770 }
2771
2772 return 0;
2773}
2774
2775static int setup_credentials_internal(
2776 const ExecContext *context,
2777 const ExecParameters *params,
d3dcf4e3 2778 const char *unit,
bb0c0d6f
LP
2779 const char *final, /* This is where the credential store shall eventually end up at */
2780 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2781 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2782 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2783 uid_t uid) {
2784
2785 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2786 * if we mounted something; false if we definitely can't mount anything */
2787 bool final_mounted;
2788 const char *where;
2789
2790 assert(context);
2791 assert(final);
2792 assert(workspace);
2793
2794 if (reuse_workspace) {
2795 r = path_is_mount_point(workspace, NULL, 0);
2796 if (r < 0)
2797 return r;
2798 if (r > 0)
2799 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2800 else
2801 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2802 } else
2803 workspace_mounted = -1; /* ditto */
2804
2805 r = path_is_mount_point(final, NULL, 0);
2806 if (r < 0)
2807 return r;
2808 if (r > 0) {
2809 /* If the final place already has something mounted, we use that. If the workspace also has
2810 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2811 * different). */
2812 final_mounted = true;
2813
2814 if (workspace_mounted < 0) {
2815 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2816 * the final version to the workspace, and make it writable, so that we can make
2817 * changes */
2818
21935150
LP
2819 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2820 if (r < 0)
2821 return r;
bb0c0d6f 2822
21935150
LP
2823 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2824 if (r < 0)
2825 return r;
bb0c0d6f
LP
2826
2827 workspace_mounted = true;
2828 }
2829 } else
2830 final_mounted = false;
2831
2832 if (workspace_mounted < 0) {
2833 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2834 for (int try = 0;; try++) {
2835
2836 if (try == 0) {
2837 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
2838 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2839 if (r >= 0) {
bb0c0d6f
LP
2840 workspace_mounted = true;
2841 break;
2842 }
2843
2844 } else if (try == 1) {
2845 _cleanup_free_ char *opts = NULL;
2846
43144be4 2847 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
2848 return -ENOMEM;
2849
2850 /* Fall back to "tmpfs" otherwise */
21935150
LP
2851 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2852 if (r >= 0) {
bb0c0d6f
LP
2853 workspace_mounted = true;
2854 break;
2855 }
2856
2857 } else {
2858 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
2859 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2860 if (r < 0) {
2861 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2862 return r;
bb0c0d6f
LP
2863
2864 if (must_mount) /* If we it's not OK to use the plain directory
2865 * fallback, propagate all errors too */
21935150 2866 return r;
bb0c0d6f
LP
2867
2868 /* If we lack privileges to bind mount stuff, then let's gracefully
2869 * proceed for compat with container envs, and just use the final dir
2870 * as is. */
2871
2872 workspace_mounted = false;
2873 break;
2874 }
2875
2876 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
2877 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2878 if (r < 0)
2879 return r;
bb0c0d6f
LP
2880
2881 workspace_mounted = true;
2882 break;
2883 }
2884 }
2885 }
2886
2887 assert(!must_mount || workspace_mounted > 0);
2888 where = workspace_mounted ? workspace : final;
2889
e3a0a862
CG
2890 (void) label_fix_container(where, final, 0);
2891
d3dcf4e3 2892 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
2893 if (r < 0)
2894 return r;
2895
2896 if (workspace_mounted) {
2897 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
2898 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2899 if (r < 0)
2900 return r;
bb0c0d6f
LP
2901
2902 /* And mount it to the final place, read-only */
21935150
LP
2903 if (final_mounted)
2904 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2905 else
2906 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2907 if (r < 0)
2908 return r;
bb0c0d6f
LP
2909 } else {
2910 _cleanup_free_ char *parent = NULL;
2911
2912 /* If we do not have our own mount put used the plain directory fallback, then we need to
2913 * open access to the top-level credential directory and the per-service directory now */
2914
2915 parent = dirname_malloc(final);
2916 if (!parent)
2917 return -ENOMEM;
2918 if (chmod(parent, 0755) < 0)
2919 return -errno;
2920 }
2921
2922 return 0;
2923}
2924
2925static int setup_credentials(
2926 const ExecContext *context,
2927 const ExecParameters *params,
2928 const char *unit,
2929 uid_t uid) {
2930
2931 _cleanup_free_ char *p = NULL, *q = NULL;
2932 const char *i;
2933 int r;
2934
2935 assert(context);
2936 assert(params);
2937
2938 if (!exec_context_has_credentials(context))
2939 return 0;
2940
2941 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2942 return -EINVAL;
2943
2944 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2945 * and the subdir we mount over with a read-only file system readable by the service's user */
2946 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2947 if (!q)
2948 return -ENOMEM;
2949
2950 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2951 if (r < 0 && r != -EEXIST)
2952 return r;
2953
2954 p = path_join(q, unit);
2955 if (!p)
2956 return -ENOMEM;
2957
2958 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2959 if (r < 0 && r != -EEXIST)
2960 return r;
2961
2962 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2963 if (r < 0) {
2964 _cleanup_free_ char *t = NULL, *u = NULL;
2965
2966 /* If this is not a privilege or support issue then propagate the error */
2967 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2968 return r;
2969
2970 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2971 * it into place, so that users can't access half-initialized credential stores. */
2972 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2973 if (!t)
2974 return -ENOMEM;
2975
2976 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2977 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2978 * after it is fully set up */
2979 u = path_join(t, unit);
2980 if (!u)
2981 return -ENOMEM;
2982
2983 FOREACH_STRING(i, t, u) {
2984 r = mkdir_label(i, 0700);
2985 if (r < 0 && r != -EEXIST)
2986 return r;
2987 }
2988
2989 r = setup_credentials_internal(
2990 context,
2991 params,
d3dcf4e3 2992 unit,
bb0c0d6f
LP
2993 p, /* final mount point */
2994 u, /* temporary workspace to overmount */
2995 true, /* reuse the workspace if it is already a mount */
2996 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2997 uid);
2998
2999 (void) rmdir(u); /* remove the workspace again if we can. */
3000
3001 if (r < 0)
3002 return r;
3003
3004 } else if (r == 0) {
3005
3006 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3007 * we can use the same directory for all cases, after turning off propagation. Question
3008 * though is: where do we turn off propagation exactly, and where do we place the workspace
3009 * directory? We need some place that is guaranteed to be a mount point in the host, and
3010 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3011 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3012 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3013 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3014 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3015 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3016 * propagation on the former, and then overmount the latter.
3017 *
3018 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3019 * for this purpose, but there are few other candidates that work equally well for us, and
3020 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3021 * that no one else sees this should be OK to do. */
bb0c0d6f 3022
21935150
LP
3023 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3024 if (r < 0)
bb0c0d6f
LP
3025 goto child_fail;
3026
3027 r = setup_credentials_internal(
3028 context,
3029 params,
d3dcf4e3 3030 unit,
bb0c0d6f
LP
3031 p, /* final mount point */
3032 "/dev/shm", /* temporary workspace to overmount */
3033 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3034 true, /* insist that something is mounted, do not allow fallback to plain directory */
3035 uid);
3036 if (r < 0)
3037 goto child_fail;
3038
3039 _exit(EXIT_SUCCESS);
3040
3041 child_fail:
3042 _exit(EXIT_FAILURE);
3043 }
3044
3045 return 0;
3046}
3047
#if ENABLE_SMACK
/* Applies a SMACK label to the calling process (pid 0 is passed to mac_smack_apply_pid()).
 *
 * If the context configures an explicit process label, that one is used. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from
 * executable_fd is used, falling back to SMACK_DEFAULT_PROCESS_LABEL if none was read.
 *
 * Returns 0 on success, negative errno-style error otherwise. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing xattr support is not an error, we just use the
                 * default label then. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
cefc33ae 3079
6c47cd7d
LP
3080static int compile_bind_mounts(
3081 const ExecContext *context,
3082 const ExecParameters *params,
3083 BindMount **ret_bind_mounts,
da6053d0 3084 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3085 char ***ret_empty_directories) {
3086
3087 _cleanup_strv_free_ char **empty_directories = NULL;
3088 BindMount *bind_mounts;
5b10116e 3089 size_t n, h = 0;
6c47cd7d
LP
3090 int r;
3091
3092 assert(context);
3093 assert(params);
3094 assert(ret_bind_mounts);
3095 assert(ret_n_bind_mounts);
3096 assert(ret_empty_directories);
3097
3098 n = context->n_bind_mounts;
5b10116e 3099 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3100 if (!params->prefix[t])
3101 continue;
3102
211a3d87 3103 n += context->directories[t].n_items;
6c47cd7d
LP
3104 }
3105
3106 if (n <= 0) {
3107 *ret_bind_mounts = NULL;
3108 *ret_n_bind_mounts = 0;
3109 *ret_empty_directories = NULL;
3110 return 0;
3111 }
3112
3113 bind_mounts = new(BindMount, n);
3114 if (!bind_mounts)
3115 return -ENOMEM;
3116
5b10116e 3117 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3118 BindMount *item = context->bind_mounts + i;
3119 char *s, *d;
3120
3121 s = strdup(item->source);
3122 if (!s) {
3123 r = -ENOMEM;
3124 goto finish;
3125 }
3126
3127 d = strdup(item->destination);
3128 if (!d) {
3129 free(s);
3130 r = -ENOMEM;
3131 goto finish;
3132 }
3133
3134 bind_mounts[h++] = (BindMount) {
3135 .source = s,
3136 .destination = d,
3137 .read_only = item->read_only,
3138 .recursive = item->recursive,
3139 .ignore_enoent = item->ignore_enoent,
3140 };
3141 }
3142
5b10116e 3143 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3144 if (!params->prefix[t])
3145 continue;
3146
211a3d87 3147 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3148 continue;
3149
494d0247 3150 if (exec_directory_is_private(context, t) &&
74e12520 3151 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3152 char *private_root;
3153
3154 /* So this is for a dynamic user, and we need to make sure the process can access its own
3155 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3156 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3157
657ee2d8 3158 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3159 if (!private_root) {
3160 r = -ENOMEM;
3161 goto finish;
3162 }
3163
3164 r = strv_consume(&empty_directories, private_root);
a635a7ae 3165 if (r < 0)
6c47cd7d 3166 goto finish;
6c47cd7d
LP
3167 }
3168
211a3d87 3169 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3170 char *s, *d;
3171
494d0247 3172 if (exec_directory_is_private(context, t))
211a3d87 3173 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3174 else
211a3d87 3175 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3176 if (!s) {
3177 r = -ENOMEM;
3178 goto finish;
3179 }
3180
494d0247 3181 if (exec_directory_is_private(context, t) &&
74e12520 3182 exec_context_with_rootfs(context))
5609f688
YW
3183 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3184 * directory is not created on the root directory. So, let's bind-mount the directory
3185 * on the 'non-private' place. */
211a3d87 3186 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3187 else
3188 d = strdup(s);
6c47cd7d
LP
3189 if (!d) {
3190 free(s);
3191 r = -ENOMEM;
3192 goto finish;
3193 }
3194
3195 bind_mounts[h++] = (BindMount) {
3196 .source = s,
3197 .destination = d,
3198 .read_only = false,
9ce4e4b0 3199 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3200 .recursive = true,
3201 .ignore_enoent = false,
3202 };
3203 }
3204 }
3205
3206 assert(h == n);
3207
3208 *ret_bind_mounts = bind_mounts;
3209 *ret_n_bind_mounts = n;
ae2a15bc 3210 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3211
3212 return (int) n;
3213
3214finish:
3215 bind_mount_free_many(bind_mounts, h);
3216 return r;
3217}
3218
df61e79a
LB
3219/* ret_symlinks will contain a list of pairs src:dest that describes
3220 * the symlinks to create later on. For example, the symlinks needed
3221 * to safely give private directories to DynamicUser=1 users. */
3222static int compile_symlinks(
3223 const ExecContext *context,
3224 const ExecParameters *params,
3225 char ***ret_symlinks) {
3226
3227 _cleanup_strv_free_ char **symlinks = NULL;
3228 int r;
3229
3230 assert(context);
3231 assert(params);
3232 assert(ret_symlinks);
3233
3234 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3235 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3236 _cleanup_free_ char *private_path = NULL, *path = NULL;
3237 char **symlink;
df61e79a 3238
211a3d87
LB
3239 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3240 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3241
211a3d87
LB
3242 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3243 dst_abs = path_join(params->prefix[dt], *symlink);
3244 if (!src_abs || !dst_abs)
3245 return -ENOMEM;
df61e79a 3246
211a3d87
LB
3247 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3248 if (r < 0)
3249 return r;
3250 }
3251
3252 if (!exec_directory_is_private(context, dt))
3253 continue;
3254
3255 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3256 if (!private_path)
3257 return -ENOMEM;
3258
211a3d87 3259 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3260 if (!path)
3261 return -ENOMEM;
3262
3263 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3264 if (r < 0)
3265 return r;
3266 }
3267 }
3268
3269 *ret_symlinks = TAKE_PTR(symlinks);
3270
3271 return 0;
3272}
3273
4e677599
LP
3274static bool insist_on_sandboxing(
3275 const ExecContext *context,
3276 const char *root_dir,
3277 const char *root_image,
3278 const BindMount *bind_mounts,
3279 size_t n_bind_mounts) {
3280
4e677599
LP
3281 assert(context);
3282 assert(n_bind_mounts == 0 || bind_mounts);
3283
3284 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3285 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3286 * rearrange stuff in a way we cannot ignore gracefully. */
3287
3288 if (context->n_temporary_filesystems > 0)
3289 return true;
3290
3291 if (root_dir || root_image)
3292 return true;
3293
b3d13314
LB
3294 if (context->n_mount_images > 0)
3295 return true;
3296
4e677599
LP
3297 if (context->dynamic_user)
3298 return true;
3299
3300 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3301 * essential. */
5b10116e 3302 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3303 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3304 return true;
3305
91dd5f7c
LP
3306 if (context->log_namespace)
3307 return true;
3308
4e677599
LP
3309 return false;
3310}
3311
6818c54c 3312static int apply_mount_namespace(
34cf6c43 3313 const Unit *u,
9f71ba8d 3314 ExecCommandFlags command_flags,
6818c54c
LP
3315 const ExecContext *context,
3316 const ExecParameters *params,
7cc5ef5f
ZJS
3317 const ExecRuntime *runtime,
3318 char **error_path) {
6818c54c 3319
df61e79a 3320 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3321 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3322 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3323 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3324 NamespaceInfo ns_info;
165a31c0 3325 bool needs_sandboxing;
6c47cd7d 3326 BindMount *bind_mounts = NULL;
da6053d0 3327 size_t n_bind_mounts = 0;
6818c54c 3328 int r;
93c6bb51 3329
2b3c1b9e
DH
3330 assert(context);
3331
915e6d16
LP
3332 if (params->flags & EXEC_APPLY_CHROOT) {
3333 root_image = context->root_image;
3334
3335 if (!root_image)
3336 root_dir = context->root_directory;
3337 }
93c6bb51 3338
6c47cd7d
LP
3339 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3340 if (r < 0)
3341 return r;
3342
211a3d87 3343 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3344 r = compile_symlinks(context, params, &symlinks);
3345 if (r < 0)
3346 return r;
3347
9f71ba8d 3348 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3349 if (needs_sandboxing) {
3350 /* The runtime struct only contains the parent of the private /tmp,
3351 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3352 * that is sticky, and that's the one we want to use here.
3353 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3354
3355 if (context->private_tmp && runtime) {
56a13a49
ZJS
3356 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3357 tmp_dir = runtime->tmp_dir;
3358 else if (runtime->tmp_dir)
3359 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3360
3361 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3362 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3363 else if (runtime->var_tmp_dir)
56a13a49 3364 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3365 }
3366
b5a33299
YW
3367 ns_info = (NamespaceInfo) {
3368 .ignore_protect_paths = false,
3369 .private_dev = context->private_devices,
3370 .protect_control_groups = context->protect_control_groups,
3371 .protect_kernel_tunables = context->protect_kernel_tunables,
3372 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3373 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3374 .protect_hostname = context->protect_hostname,
5e98086d 3375 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3376 .private_mounts = context->private_mounts,
52b3d652
LP
3377 .protect_home = context->protect_home,
3378 .protect_system = context->protect_system,
4e399953
LP
3379 .protect_proc = context->protect_proc,
3380 .proc_subset = context->proc_subset,
80271a44 3381 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3382 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3383 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3384 };
ecf63c91 3385 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3386 /*
3387 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3388 * sandbox info, otherwise enforce it, don't ignore protected paths and
3389 * fail if we are enable to apply the sandbox inside the mount namespace.
3390 */
3391 ns_info = (NamespaceInfo) {
3392 .ignore_protect_paths = true,
3393 };
3394 else
3395 ns_info = (NamespaceInfo) {};
b5a33299 3396
37ed15d7
FB
3397 if (context->mount_flags == MS_SHARED)
3398 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3399
a631cbfa
LP
3400 if (exec_context_has_credentials(context) &&
3401 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3402 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3403 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3404 if (!creds_path) {
3405 r = -ENOMEM;
3406 goto finalize;
3407 }
bbb4e7f3
LP
3408 }
3409
5e8deb94
LB
3410 if (MANAGER_IS_SYSTEM(u->manager)) {
3411 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3412 if (!propagate_dir) {
3413 r = -ENOMEM;
3414 goto finalize;
3415 }
3416
5e8deb94 3417 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3418 if (!incoming_dir) {
3419 r = -ENOMEM;
3420 goto finalize;
3421 }
5e8deb94
LB
3422 }
3423
18d73705 3424 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3425 &ns_info, context->read_write_paths,
165a31c0
LP
3426 needs_sandboxing ? context->read_only_paths : NULL,
3427 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3428 needs_sandboxing ? context->exec_paths : NULL,
3429 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3430 empty_directories,
df61e79a 3431 symlinks,
6c47cd7d
LP
3432 bind_mounts,
3433 n_bind_mounts,
2abd4e38
YW
3434 context->temporary_filesystems,
3435 context->n_temporary_filesystems,
b3d13314
LB
3436 context->mount_images,
3437 context->n_mount_images,
56a13a49
ZJS
3438 tmp_dir,
3439 var_tmp_dir,
bbb4e7f3 3440 creds_path,
91dd5f7c 3441 context->log_namespace,
915e6d16 3442 context->mount_flags,
d4d55b0d
LB
3443 context->root_hash, context->root_hash_size, context->root_hash_path,
3444 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3445 context->root_verity,
93f59701
LB
3446 context->extension_images,
3447 context->n_extension_images,
5e8deb94
LB
3448 propagate_dir,
3449 incoming_dir,
3bdc25a4 3450 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3451 error_path);
93c6bb51 3452
1beab8b0 3453 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3454 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3455 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3456 * completely different execution environment. */
aca835ed 3457 if (r == -ENOANO) {
4e677599
LP
3458 if (insist_on_sandboxing(
3459 context,
3460 root_dir, root_image,
3461 bind_mounts,
3462 n_bind_mounts)) {
3463 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3464 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3465 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3466
3467 r = -EOPNOTSUPP;
3468 } else {
aca835ed 3469 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3470 r = 0;
aca835ed 3471 }
93c6bb51
DH
3472 }
3473
8062e643 3474finalize:
4e677599 3475 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3476 return r;
3477}
3478
915e6d16
LP
3479static int apply_working_directory(
3480 const ExecContext *context,
3481 const ExecParameters *params,
3482 const char *home,
376fecf6 3483 int *exit_status) {
915e6d16 3484
6732edab 3485 const char *d, *wd;
2b3c1b9e
DH
3486
3487 assert(context);
376fecf6 3488 assert(exit_status);
2b3c1b9e 3489
6732edab
LP
3490 if (context->working_directory_home) {
3491
376fecf6
LP
3492 if (!home) {
3493 *exit_status = EXIT_CHDIR;
6732edab 3494 return -ENXIO;
376fecf6 3495 }
6732edab 3496
2b3c1b9e 3497 wd = home;
6732edab 3498
14eb3285
LP
3499 } else
3500 wd = empty_to_root(context->working_directory);
e7f1e7c6 3501
fa97f630 3502 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3503 d = wd;
fa97f630 3504 else
3b0e5bb5 3505 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3506
376fecf6
LP
3507 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3508 *exit_status = EXIT_CHDIR;
2b3c1b9e 3509 return -errno;
376fecf6 3510 }
e7f1e7c6
DH
3511
3512 return 0;
3513}
3514
fa97f630
JB
3515static int apply_root_directory(
3516 const ExecContext *context,
3517 const ExecParameters *params,
3518 const bool needs_mount_ns,
3519 int *exit_status) {
3520
3521 assert(context);
3522 assert(exit_status);
3523
5b10116e 3524 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3525 if (!needs_mount_ns && context->root_directory)
3526 if (chroot(context->root_directory) < 0) {
3527 *exit_status = EXIT_CHROOT;
3528 return -errno;
3529 }
fa97f630
JB
3530
3531 return 0;
3532}
3533
b1edf445 3534static int setup_keyring(
34cf6c43 3535 const Unit *u,
b1edf445
LP
3536 const ExecContext *context,
3537 const ExecParameters *p,
3538 uid_t uid, gid_t gid) {
3539
74dd6b51 3540 key_serial_t keyring;
e64c2d0b
DJL
3541 int r = 0;
3542 uid_t saved_uid;
3543 gid_t saved_gid;
74dd6b51
LP
3544
3545 assert(u);
b1edf445 3546 assert(context);
74dd6b51
LP
3547 assert(p);
3548
3549 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3550 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3551 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3552 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3553 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3554 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3555
b1edf445
LP
3556 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3557 return 0;
3558
e64c2d0b
DJL
3559 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3560 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3561 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3562 * & group is just as nasty as acquiring a reference to the user keyring. */
3563
3564 saved_uid = getuid();
3565 saved_gid = getgid();
3566
3567 if (gid_is_valid(gid) && gid != saved_gid) {
3568 if (setregid(gid, -1) < 0)
3569 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3570 }
3571
3572 if (uid_is_valid(uid) && uid != saved_uid) {
3573 if (setreuid(uid, -1) < 0) {
3574 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3575 goto out;
3576 }
3577 }
3578
74dd6b51
LP
3579 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3580 if (keyring == -1) {
3581 if (errno == ENOSYS)
8002fb97 3582 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3583 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3584 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3585 else if (errno == EDQUOT)
8002fb97 3586 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3587 else
e64c2d0b 3588 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3589
e64c2d0b 3590 goto out;
74dd6b51
LP
3591 }
3592
e64c2d0b
DJL
3593 /* When requested link the user keyring into the session keyring. */
3594 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3595
3596 if (keyctl(KEYCTL_LINK,
3597 KEY_SPEC_USER_KEYRING,
3598 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3599 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3600 goto out;
3601 }
3602 }
3603
3604 /* Restore uid/gid back */
3605 if (uid_is_valid(uid) && uid != saved_uid) {
3606 if (setreuid(saved_uid, -1) < 0) {
3607 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3608 goto out;
3609 }
3610 }
3611
3612 if (gid_is_valid(gid) && gid != saved_gid) {
3613 if (setregid(saved_gid, -1) < 0)
3614 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3615 }
3616
3617 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3618 if (!sd_id128_is_null(u->invocation_id)) {
3619 key_serial_t key;
3620
3621 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3622 if (key == -1)
8002fb97 3623 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3624 else {
3625 if (keyctl(KEYCTL_SETPERM, key,
3626 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3627 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3628 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3629 }
3630 }
3631
e64c2d0b 3632out:
37b22b3b 3633 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3634 /* no extra logging, as only the first already reported error matters */
3635 if (getuid() != saved_uid)
3636 (void) setreuid(saved_uid, -1);
b1edf445 3637
e64c2d0b
DJL
3638 if (getgid() != saved_gid)
3639 (void) setregid(saved_gid, -1);
b1edf445 3640
e64c2d0b 3641 return r;
74dd6b51
LP
3642}
3643
3042bbeb 3644static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
29206d46
LP
3645 assert(array);
3646 assert(n);
2caa38e9 3647 assert(pair);
29206d46
LP
3648
3649 if (pair[0] >= 0)
3650 array[(*n)++] = pair[0];
3651 if (pair[1] >= 0)
3652 array[(*n)++] = pair[1];
3653}
3654
a34ceba6
LP
3655static int close_remaining_fds(
3656 const ExecParameters *params,
34cf6c43
YW
3657 const ExecRuntime *runtime,
3658 const DynamicCreds *dcreds,
00d9ef85 3659 int user_lookup_fd,
a34ceba6 3660 int socket_fd,
5b8d1f6b 3661 const int *fds, size_t n_fds) {
a34ceba6 3662
da6053d0 3663 size_t n_dont_close = 0;
00d9ef85 3664 int dont_close[n_fds + 12];
a34ceba6
LP
3665
3666 assert(params);
3667
3668 if (params->stdin_fd >= 0)
3669 dont_close[n_dont_close++] = params->stdin_fd;
3670 if (params->stdout_fd >= 0)
3671 dont_close[n_dont_close++] = params->stdout_fd;
3672 if (params->stderr_fd >= 0)
3673 dont_close[n_dont_close++] = params->stderr_fd;
3674
3675 if (socket_fd >= 0)
3676 dont_close[n_dont_close++] = socket_fd;
3677 if (n_fds > 0) {
3678 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3679 n_dont_close += n_fds;
3680 }
3681
a70581ff 3682 if (runtime) {
29206d46 3683 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3684 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3685 }
29206d46
LP
3686
3687 if (dcreds) {
3688 if (dcreds->user)
3689 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3690 if (dcreds->group)
3691 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3692 }
3693
00d9ef85
LP
3694 if (user_lookup_fd >= 0)
3695 dont_close[n_dont_close++] = user_lookup_fd;
3696
a34ceba6
LP
3697 return close_all_fds(dont_close, n_dont_close);
3698}
3699
00d9ef85
LP
3700static int send_user_lookup(
3701 Unit *unit,
3702 int user_lookup_fd,
3703 uid_t uid,
3704 gid_t gid) {
3705
3706 assert(unit);
3707
3708 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3709 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3710 * specified. */
3711
3712 if (user_lookup_fd < 0)
3713 return 0;
3714
3715 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3716 return 0;
3717
3718 if (writev(user_lookup_fd,
3719 (struct iovec[]) {
e6a7ec4b
LP
3720 IOVEC_INIT(&uid, sizeof(uid)),
3721 IOVEC_INIT(&gid, sizeof(gid)),
3722 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3723 return -errno;
3724
3725 return 0;
3726}
3727
6732edab
LP
3728static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3729 int r;
3730
3731 assert(c);
3732 assert(home);
3733 assert(buf);
3734
3735 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3736
3737 if (*home)
3738 return 0;
3739
3740 if (!c->working_directory_home)
3741 return 0;
3742
6732edab
LP
3743 r = get_home_dir(buf);
3744 if (r < 0)
3745 return r;
3746
3747 *home = *buf;
3748 return 1;
3749}
3750
da50b85a
LP
3751static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3752 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3753 int r;
3754
3755 assert(c);
3756 assert(p);
3757 assert(ret);
3758
3759 assert(c->dynamic_user);
3760
3761 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3762 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3763 * directories. */
3764
5b10116e 3765 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3766 if (t == EXEC_DIRECTORY_CONFIGURATION)
3767 continue;
3768
3769 if (!p->prefix[t])
3770 continue;
3771
211a3d87 3772 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3773 char *e;
3774
494d0247 3775 if (exec_directory_is_private(c, t))
211a3d87 3776 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3777 else
211a3d87 3778 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3779 if (!e)
3780 return -ENOMEM;
3781
3782 r = strv_consume(&list, e);
3783 if (r < 0)
3784 return r;
3785 }
3786 }
3787
ae2a15bc 3788 *ret = TAKE_PTR(list);
da50b85a
LP
3789
3790 return 0;
3791}
3792
78f93209
LP
3793static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3794 bool using_subcgroup;
3795 char *p;
3796
3797 assert(params);
3798 assert(ret);
3799
3800 if (!params->cgroup_path)
3801 return -EINVAL;
3802
3803 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3804 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3805 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3806 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3807 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3808 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3809 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3810 * flag, which is only passed for the former statements, not for the latter. */
3811
3812 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3813 if (using_subcgroup)
657ee2d8 3814 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3815 else
3816 p = strdup(params->cgroup_path);
3817 if (!p)
3818 return -ENOMEM;
3819
3820 *ret = p;
3821 return using_subcgroup;
3822}
3823
e2b2fb7f
MS
3824static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3825 _cleanup_(cpu_set_reset) CPUSet s = {};
3826 int r;
3827
3828 assert(c);
3829 assert(ret);
3830
3831 if (!c->numa_policy.nodes.set) {
3832 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3833 return 0;
3834 }
3835
3836 r = numa_to_cpu_set(&c->numa_policy, &s);
3837 if (r < 0)
3838 return r;
3839
3840 cpu_set_reset(ret);
3841
3842 return cpu_set_add_all(ret, &s);
3843}
3844
3845bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3846 assert(c);
3847
3848 return c->cpu_affinity_from_numa;
3849}
3850
1da37e58
ZJS
3851static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3852 int r;
3853
3854 assert(fds);
3855 assert(n_fds);
3856 assert(*n_fds < fds_size);
3857 assert(ret_fd);
3858
3859 if (fd < 0) {
3860 *ret_fd = -1;
3861 return 0;
3862 }
3863
3864 if (fd < 3 + (int) *n_fds) {
3865 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3866 * the fds we pass to the process (or which are closed only during execve). */
3867
3868 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3869 if (r < 0)
3870 return -errno;
3871
3872 CLOSE_AND_REPLACE(fd, r);
3873 }
3874
3875 *ret_fd = fds[*n_fds] = fd;
3876 (*n_fds) ++;
3877 return 1;
3878}
3879
ff0af2a1 3880static int exec_child(
f2341e0a 3881 Unit *unit,
34cf6c43 3882 const ExecCommand *command,
ff0af2a1
LP
3883 const ExecContext *context,
3884 const ExecParameters *params,
3885 ExecRuntime *runtime,
29206d46 3886 DynamicCreds *dcreds,
ff0af2a1 3887 int socket_fd,
2caa38e9 3888 const int named_iofds[static 3],
4c47affc 3889 int *fds,
da6053d0 3890 size_t n_socket_fds,
25b583d7 3891 size_t n_storage_fds,
ff0af2a1 3892 char **files_env,
00d9ef85 3893 int user_lookup_fd,
12145637 3894 int *exit_status) {
d35fbf6b 3895
8c35c10d 3896 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3897 int r, ngids = 0, exec_fd;
4d885bd3
DH
3898 _cleanup_free_ gid_t *supplementary_gids = NULL;
3899 const char *username = NULL, *groupname = NULL;
5686391b 3900 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3901 const char *home = NULL, *shell = NULL;
7ca69792 3902 char **final_argv = NULL;
7bce046b
LP
3903 dev_t journal_stream_dev = 0;
3904 ino_t journal_stream_ino = 0;
5749f855 3905 bool userns_set_up = false;
165a31c0
LP
3906 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3907 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3908 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3909 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3910#if HAVE_SELINUX
7f59dd35 3911 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3912 bool use_selinux = false;
ecfbc84f 3913#endif
f9fa32f0 3914#if ENABLE_SMACK
43b1f709 3915 bool use_smack = false;
ecfbc84f 3916#endif
349cc4a5 3917#if HAVE_APPARMOR
43b1f709 3918 bool use_apparmor = false;
ecfbc84f 3919#endif
5749f855
AZ
3920 uid_t saved_uid = getuid();
3921 gid_t saved_gid = getgid();
fed1e721
LP
3922 uid_t uid = UID_INVALID;
3923 gid_t gid = GID_INVALID;
1da37e58
ZJS
3924 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3925 n_keep_fds; /* total number of fds not to close */
165a31c0 3926 int secure_bits;
afb11bf1
DG
3927 _cleanup_free_ gid_t *gids_after_pam = NULL;
3928 int ngids_after_pam = 0;
034c6ed7 3929
f2341e0a 3930 assert(unit);
5cb5a6ff
LP
3931 assert(command);
3932 assert(context);
d35fbf6b 3933 assert(params);
ff0af2a1 3934 assert(exit_status);
d35fbf6b
DM
3935
3936 rename_process_from_path(command->path);
3937
9c274488
LP
3938 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3939 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3940 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3941 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3942 SIGNALS_IGNORE);
d35fbf6b
DM
3943
3944 if (context->ignore_sigpipe)
9c274488 3945 (void) ignore_signals(SIGPIPE);
d35fbf6b 3946
ff0af2a1
LP
3947 r = reset_signal_mask();
3948 if (r < 0) {
3949 *exit_status = EXIT_SIGNAL_MASK;
12145637 3950 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3951 }
034c6ed7 3952
d35fbf6b
DM
3953 if (params->idle_pipe)
3954 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3955
2c027c62
LP
3956 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3957 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3958 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3959 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3960
d35fbf6b 3961 log_forget_fds();
2c027c62 3962 log_set_open_when_needed(true);
4f2d528d 3963
40a80078
LP
3964 /* In case anything used libc syslog(), close this here, too */
3965 closelog();
3966
b1994387 3967 int keep_fds[n_fds + 3];
1da37e58
ZJS
3968 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3969 n_keep_fds = n_fds;
3970
3971 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3972 if (r < 0) {
3973 *exit_status = EXIT_FDS;
3974 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3975 }
3976
b1994387
ILG
3977#if HAVE_LIBBPF
3978 if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
3979 int bpf_map_fd = -1;
3980
3981 bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
3982 if (bpf_map_fd < 0) {
3983 *exit_status = EXIT_FDS;
3984 return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
3985 }
3986
3987 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
3988 if (r < 0) {
3989 *exit_status = EXIT_FDS;
3990 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3991 }
3992 }
3993#endif
3994
1da37e58 3995 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3996 if (r < 0) {
3997 *exit_status = EXIT_FDS;
12145637 3998 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3999 }
4000
0af07108
ZJS
4001 if (!context->same_pgrp &&
4002 setsid() < 0) {
4003 *exit_status = EXIT_SETSID;
4004 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4005 }
9e2f7c11 4006
1e22b5cd 4007 exec_context_tty_reset(context, params);
d35fbf6b 4008
c891efaf 4009 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4010 _cleanup_free_ char *cmdline = NULL;
4011
4ef15008 4012 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4013 if (!cmdline) {
0460aa5c 4014 *exit_status = EXIT_MEMORY;
12145637 4015 return log_oom();
3b20f877 4016 }
d35fbf6b 4017
4ef15008 4018 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4019 if (r != CONFIRM_EXECUTE) {
4020 if (r == CONFIRM_PRETEND_SUCCESS) {
4021 *exit_status = EXIT_SUCCESS;
4022 return 0;
4023 }
ff0af2a1 4024 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4025 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4026 "Execution cancelled by the user");
d35fbf6b
DM
4027 }
4028 }
1a63a750 4029
d521916d
LP
4030 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4031 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4032 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4033 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4034 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4035 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4036 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4037 *exit_status = EXIT_MEMORY;
4038 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4039 }
4040
29206d46 4041 if (context->dynamic_user && dcreds) {
da50b85a 4042 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4043
d521916d 4044 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4045 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4046 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4047 *exit_status = EXIT_USER;
12145637 4048 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4049 }
4050
da50b85a
LP
4051 r = compile_suggested_paths(context, params, &suggested_paths);
4052 if (r < 0) {
4053 *exit_status = EXIT_MEMORY;
4054 return log_oom();
4055 }
4056
4057 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4058 if (r < 0) {
4059 *exit_status = EXIT_USER;
d85ff944
YW
4060 if (r == -EILSEQ)
4061 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4062 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4063 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4064 }
524daa8c 4065
70dd455c 4066 if (!uid_is_valid(uid)) {
29206d46 4067 *exit_status = EXIT_USER;
d85ff944 4068 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4069 }
4070
4071 if (!gid_is_valid(gid)) {
4072 *exit_status = EXIT_USER;
d85ff944 4073 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4074 }
5bc7452b 4075
29206d46
LP
4076 if (dcreds->user)
4077 username = dcreds->user->name;
4078
4079 } else {
4d885bd3
DH
4080 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4081 if (r < 0) {
4082 *exit_status = EXIT_USER;
12145637 4083 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4084 }
5bc7452b 4085
4d885bd3
DH
4086 r = get_fixed_group(context, &groupname, &gid);
4087 if (r < 0) {
4088 *exit_status = EXIT_GROUP;
12145637 4089 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4090 }
cdc5d5c5 4091 }
29206d46 4092
cdc5d5c5
DH
4093 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4094 r = get_supplementary_groups(context, username, groupname, gid,
4095 &supplementary_gids, &ngids);
4096 if (r < 0) {
4097 *exit_status = EXIT_GROUP;
12145637 4098 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4099 }
5bc7452b 4100
00d9ef85
LP
4101 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4102 if (r < 0) {
4103 *exit_status = EXIT_USER;
12145637 4104 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4105 }
4106
4107 user_lookup_fd = safe_close(user_lookup_fd);
4108
6732edab
LP
4109 r = acquire_home(context, uid, &home, &home_buffer);
4110 if (r < 0) {
4111 *exit_status = EXIT_CHDIR;
12145637 4112 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4113 }
4114
d35fbf6b
DM
4115 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4116 * must make sure to drop O_NONBLOCK */
4117 if (socket_fd >= 0)
a34ceba6 4118 (void) fd_nonblock(socket_fd, false);
acbb0225 4119
4c70a4a7
MS
4120 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4121 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4122 if (params->cgroup_path) {
4123 _cleanup_free_ char *p = NULL;
4124
4125 r = exec_parameters_get_cgroup_path(params, &p);
4126 if (r < 0) {
4127 *exit_status = EXIT_CGROUP;
4128 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4129 }
4130
4131 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4132 if (r < 0) {
4133 *exit_status = EXIT_CGROUP;
4134 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4135 }
4136 }
4137
a8d08f39 4138 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4139 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4140 if (r < 0) {
4141 *exit_status = EXIT_NETWORK;
4142 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4143 }
4144 }
4145
a70581ff
XR
4146 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4147 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4148 if (r < 0) {
4149 *exit_status = EXIT_NAMESPACE;
4150 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4151 }
4152 }
4153
52c239d7 4154 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4155 if (r < 0) {
4156 *exit_status = EXIT_STDIN;
12145637 4157 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4158 }
034c6ed7 4159
52c239d7 4160 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4161 if (r < 0) {
4162 *exit_status = EXIT_STDOUT;
12145637 4163 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4164 }
4165
52c239d7 4166 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4167 if (r < 0) {
4168 *exit_status = EXIT_STDERR;
12145637 4169 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4170 }
4171
d35fbf6b 4172 if (context->oom_score_adjust_set) {
9f8168eb
LP
4173 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4174 * prohibit write access to this file, and we shouldn't trip up over that. */
4175 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4176 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4177 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4178 else if (r < 0) {
ff0af2a1 4179 *exit_status = EXIT_OOM_ADJUST;
12145637 4180 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4181 }
d35fbf6b
DM
4182 }
4183
ad21e542
ZJS
4184 if (context->coredump_filter_set) {
4185 r = set_coredump_filter(context->coredump_filter);
4186 if (ERRNO_IS_PRIVILEGE(r))
4187 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4188 else if (r < 0)
4189 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4190 }
4191
39090201
DJL
4192 if (context->nice_set) {
4193 r = setpriority_closest(context->nice);
4194 if (r < 0)
4195 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4196 }
613b411c 4197
d35fbf6b
DM
4198 if (context->cpu_sched_set) {
4199 struct sched_param param = {
4200 .sched_priority = context->cpu_sched_priority,
4201 };
4202
ff0af2a1
LP
4203 r = sched_setscheduler(0,
4204 context->cpu_sched_policy |
4205 (context->cpu_sched_reset_on_fork ?
4206 SCHED_RESET_ON_FORK : 0),
4207 &param);
4208 if (r < 0) {
4209 *exit_status = EXIT_SETSCHEDULER;
12145637 4210 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4211 }
d35fbf6b 4212 }
fc9b2a84 4213
e2b2fb7f
MS
4214 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4215 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4216 const CPUSet *cpu_set;
4217
4218 if (context->cpu_affinity_from_numa) {
4219 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4220 if (r < 0) {
4221 *exit_status = EXIT_CPUAFFINITY;
4222 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4223 }
4224
4225 cpu_set = &converted_cpu_set;
4226 } else
4227 cpu_set = &context->cpu_set;
4228
4229 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4230 *exit_status = EXIT_CPUAFFINITY;
12145637 4231 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4232 }
e2b2fb7f 4233 }
034c6ed7 4234
b070c7c0
MS
4235 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4236 r = apply_numa_policy(&context->numa_policy);
4237 if (r == -EOPNOTSUPP)
33fe9e3f 4238 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4239 else if (r < 0) {
4240 *exit_status = EXIT_NUMA_POLICY;
4241 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4242 }
4243 }
4244
d35fbf6b
DM
4245 if (context->ioprio_set)
4246 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4247 *exit_status = EXIT_IOPRIO;
12145637 4248 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4249 }
da726a4d 4250
d35fbf6b
DM
4251 if (context->timer_slack_nsec != NSEC_INFINITY)
4252 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4253 *exit_status = EXIT_TIMERSLACK;
12145637 4254 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4255 }
9eba9da4 4256
21022b9d
LP
4257 if (context->personality != PERSONALITY_INVALID) {
4258 r = safe_personality(context->personality);
4259 if (r < 0) {
ff0af2a1 4260 *exit_status = EXIT_PERSONALITY;
12145637 4261 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4262 }
21022b9d 4263 }
94f04347 4264
33331d11
VB
4265 if (context->utmp_id) {
4266 const char *line = context->tty_path ?
4267 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4268 NULL;
df0ff127 4269 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4270 line,
023a4f67
LP
4271 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4272 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4273 USER_PROCESS,
6a93917d 4274 username);
33331d11 4275 }
d35fbf6b 4276
08f67696 4277 if (uid_is_valid(uid)) {
ff0af2a1
LP
4278 r = chown_terminal(STDIN_FILENO, uid);
4279 if (r < 0) {
4280 *exit_status = EXIT_STDIN;
12145637 4281 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4282 }
d35fbf6b 4283 }
8e274523 4284
4e1dfa45 4285 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4286 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4287 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4288 * touch a single hierarchy too. */
584b8688 4289 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4290 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4291 if (r < 0) {
4292 *exit_status = EXIT_CGROUP;
12145637 4293 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4294 }
d35fbf6b 4295 }
034c6ed7 4296
211a3d87
LB
4297 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4298
5b10116e 4299 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4300 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4301 if (r < 0)
4302 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4303 }
94f04347 4304
bb0c0d6f
LP
4305 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4306 r = setup_credentials(context, params, unit->id, uid);
4307 if (r < 0) {
4308 *exit_status = EXIT_CREDENTIALS;
4309 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4310 }
4311 }
4312
7bce046b 4313 r = build_environment(
fd63e712 4314 unit,
7bce046b
LP
4315 context,
4316 params,
4317 n_fds,
4318 home,
4319 username,
4320 shell,
4321 journal_stream_dev,
4322 journal_stream_ino,
4323 &our_env);
2065ca69
JW
4324 if (r < 0) {
4325 *exit_status = EXIT_MEMORY;
12145637 4326 return log_oom();
2065ca69
JW
4327 }
4328
4329 r = build_pass_environment(context, &pass_env);
4330 if (r < 0) {
4331 *exit_status = EXIT_MEMORY;
12145637 4332 return log_oom();
2065ca69
JW
4333 }
4334
8c35c10d 4335 /* The PATH variable is set to the default path in params->environment.
4336 * However, this is overridden if user specified fields have PATH set.
4337 * The intention is to also override PATH if the user does
4338 * not specify PATH and the user has specified ExecSearchPath
4339 */
4340
4341 if (!strv_isempty(context->exec_search_path)) {
4342 _cleanup_free_ char *joined = NULL;
4343
4344 joined = strv_join(context->exec_search_path, ":");
4345 if (!joined) {
4346 *exit_status = EXIT_MEMORY;
4347 return log_oom();
4348 }
4349
4350 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4351 if (r < 0) {
4352 *exit_status = EXIT_MEMORY;
4353 return log_oom();
4354 }
4355 }
4356
4ab3d29f 4357 accum_env = strv_env_merge(params->environment,
2065ca69 4358 our_env,
8c35c10d 4359 joined_exec_search_path,
2065ca69
JW
4360 pass_env,
4361 context->environment,
44e5d006 4362 files_env);
2065ca69
JW
4363 if (!accum_env) {
4364 *exit_status = EXIT_MEMORY;
12145637 4365 return log_oom();
2065ca69 4366 }
1280503b 4367 accum_env = strv_env_clean(accum_env);
2065ca69 4368
096424d1 4369 (void) umask(context->umask);
b213e1c1 4370
b1edf445 4371 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4372 if (r < 0) {
4373 *exit_status = EXIT_KEYRING;
12145637 4374 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4375 }
4376
165a31c0 4377 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4378 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4379
165a31c0
LP
4380 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4381 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4382
165a31c0
LP
4383 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4384 if (needs_ambient_hack)
4385 needs_setuid = false;
4386 else
4387 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4388
4389 if (needs_sandboxing) {
7f18ef0a
FK
4390 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4391 * present. The actual MAC context application will happen later, as late as possible, to avoid
4392 * impacting our own code paths. */
4393
349cc4a5 4394#if HAVE_SELINUX
43b1f709 4395 use_selinux = mac_selinux_use();
7f18ef0a 4396#endif
f9fa32f0 4397#if ENABLE_SMACK
43b1f709 4398 use_smack = mac_smack_use();
7f18ef0a 4399#endif
349cc4a5 4400#if HAVE_APPARMOR
43b1f709 4401 use_apparmor = mac_apparmor_use();
7f18ef0a 4402#endif
165a31c0 4403 }
7f18ef0a 4404
ce932d2d
LP
4405 if (needs_sandboxing) {
4406 int which_failed;
4407
4408 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4409 * is set here. (See below.) */
4410
4411 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4412 if (r < 0) {
4413 *exit_status = EXIT_LIMITS;
4414 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4415 }
4416 }
4417
0af07108 4418 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4419 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4420 * wins here. (See above.) */
4421
1da37e58 4422 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4423 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4424 if (r < 0) {
4425 *exit_status = EXIT_PAM;
4426 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4427 }
ac45f971 4428
0af07108
ZJS
4429 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4430 if (ngids_after_pam < 0) {
4431 *exit_status = EXIT_MEMORY;
4432 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4433 }
b213e1c1 4434 }
5749f855 4435
0af07108 4436 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4437 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4438 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4439 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4440
4441 userns_set_up = true;
4442 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4443 if (r < 0) {
4444 *exit_status = EXIT_USER;
4445 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4446 }
4447 }
4448
a8d08f39
LP
4449 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4450
6e2d7c4f 4451 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4452 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4453 if (r == -EPERM)
4454 log_unit_warning_errno(unit, r,
4455 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4456 else if (r < 0) {
6e2d7c4f
MS
4457 *exit_status = EXIT_NETWORK;
4458 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4459 }
a8d08f39
LP
4460 } else if (context->network_namespace_path) {
4461 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4462 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4463 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4464 } else
4465 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4466 }
169c1bda 4467
a70581ff
XR
4468 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4469
4470 if (ns_type_supported(NAMESPACE_IPC)) {
4471 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4472 if (r == -EPERM)
4473 log_unit_warning_errno(unit, r,
4474 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4475 else if (r < 0) {
4476 *exit_status = EXIT_NAMESPACE;
4477 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4478 }
4479 } else if (context->ipc_namespace_path) {
4480 *exit_status = EXIT_NAMESPACE;
4481 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4482 "IPCNamespacePath= is not supported, refusing.");
4483 } else
4484 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4485 }
4486
ee818b89 4487 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4488 _cleanup_free_ char *error_path = NULL;
4489
9f71ba8d 4490 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4491 if (r < 0) {
4492 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4493 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4494 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4495 }
d35fbf6b 4496 }
81a2b7ce 4497
daf8f72b
LP
4498 if (needs_sandboxing) {
4499 r = apply_protect_hostname(unit, context, exit_status);
4500 if (r < 0)
4501 return r;
aecd5ac6
TM
4502 }
4503
5749f855
AZ
4504 /* Drop groups as early as possible.
4505 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4506 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4507 if (needs_setuid) {
afb11bf1
DG
4508 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4509 int ngids_to_enforce = 0;
4510
4511 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4512 ngids,
4513 gids_after_pam,
4514 ngids_after_pam,
4515 &gids_to_enforce);
4516 if (ngids_to_enforce < 0) {
4517 *exit_status = EXIT_MEMORY;
4518 return log_unit_error_errno(unit,
4519 ngids_to_enforce,
4520 "Failed to merge group lists. Group membership might be incorrect: %m");
4521 }
4522
4523 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4524 if (r < 0) {
4525 *exit_status = EXIT_GROUP;
12145637 4526 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4527 }
165a31c0 4528 }
096424d1 4529
5749f855
AZ
4530 /* If the user namespace was not set up above, try to do it now.
4531 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4532 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4533 * case of mount namespaces being less privileged when the mount point list is copied from a
4534 * different user namespace). */
9008e1ac 4535
5749f855
AZ
4536 if (needs_sandboxing && context->private_users && !userns_set_up) {
4537 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4538 if (r < 0) {
4539 *exit_status = EXIT_USER;
4540 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4541 }
4542 }
4543
9f71ba8d
ZJS
4544 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4545 * shall execute. */
4546
4547 _cleanup_free_ char *executable = NULL;
b83d5050 4548 _cleanup_close_ int executable_fd = -1;
8c35c10d 4549 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4550 if (r < 0) {
4551 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4552 log_unit_struct_errno(unit, LOG_INFO, r,
4553 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4554 LOG_UNIT_INVOCATION_ID(unit),
4555 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4556 command->path),
4557 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4558 return 0;
4559 }
4560
4561 *exit_status = EXIT_EXEC;
c2503e35
RH
4562
4563 return log_unit_struct_errno(unit, LOG_INFO, r,
4564 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4565 LOG_UNIT_INVOCATION_ID(unit),
4566 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4567 command->path),
4568 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4569 }
4570
b83d5050
ZJS
4571 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4572 if (r < 0) {
4573 *exit_status = EXIT_FDS;
4574 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4575 }
4576
9f71ba8d 4577#if HAVE_SELINUX
49590d67
MS
4578 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4579 int fd = -1;
4580
4581 if (socket_fd >= 0)
4582 fd = socket_fd;
4583 else if (params->n_socket_fds == 1)
4584 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4585 * use context from that fd to compute the label. */
4586 fd = params->fds[0];
4587
4588 if (fd >= 0) {
4589 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4590 if (r < 0) {
4591 if (!context->selinux_context_ignore) {
4592 *exit_status = EXIT_SELINUX_CONTEXT;
4593 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4594 }
4595 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4596 }
9f71ba8d
ZJS
4597 }
4598 }
4599#endif
4600
165a31c0 4601 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4602 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4603 * however if we have it as we want to keep it open until the final execve(). */
4604
1da37e58 4605 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4606 if (r >= 0)
4607 r = shift_fds(fds, n_fds);
4608 if (r >= 0)
25b583d7 4609 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4610 if (r < 0) {
4611 *exit_status = EXIT_FDS;
12145637 4612 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4613 }
e66cf1a3 4614
5686391b
LP
4615 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4616 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4617 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4618 * came this far. */
4619
165a31c0 4620 secure_bits = context->secure_bits;
e66cf1a3 4621
165a31c0
LP
4622 if (needs_sandboxing) {
4623 uint64_t bset;
e66cf1a3 4624
ce932d2d
LP
4625 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4626 * requested. (Note this is placed after the general resource limit initialization, see
4627 * above, in order to take precedence.) */
f4170c67
LP
4628 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4629 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4630 *exit_status = EXIT_LIMITS;
12145637 4631 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4632 }
4633 }
4634
37ac2744
JB
4635#if ENABLE_SMACK
4636 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4637 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4638 if (use_smack) {
b83d5050 4639 r = setup_smack(context, executable_fd);
29ff6247 4640 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4641 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4642 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4643 }
4644 }
4645#endif
4646
165a31c0
LP
4647 bset = context->capability_bounding_set;
4648 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4649 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4650 * instead of us doing that */
4651 if (needs_ambient_hack)
4652 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4653 (UINT64_C(1) << CAP_SETUID) |
4654 (UINT64_C(1) << CAP_SETGID);
4655
4656 if (!cap_test_all(bset)) {
4657 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4658 if (r < 0) {
4659 *exit_status = EXIT_CAPABILITIES;
12145637 4660 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4661 }
4c2630eb 4662 }
3b8bddde 4663
16fcb191
TK
4664 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4665 * keep-caps set.
4666 * To be able to raise the ambient capabilities after setresuid() they have to be
4667 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4668 * After setresuid() the ambient capabilities can be raised as they are present in
4669 * the permitted and inheritable set. However it is possible that someone wants to
4670 * set ambient capabilities without changing the user, so we also set the ambient
4671 * capabilities here.
4672 * The requested ambient capabilities are raised in the inheritable set if the
4673 * second argument is true. */
943800f4 4674 if (!needs_ambient_hack) {
755d4b67
IP
4675 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4676 if (r < 0) {
4677 *exit_status = EXIT_CAPABILITIES;
12145637 4678 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4679 }
755d4b67 4680 }
165a31c0 4681 }
755d4b67 4682
fa97f630
JB
4683 /* chroot to root directory first, before we lose the ability to chroot */
4684 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4685 if (r < 0)
4686 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4687
165a31c0 4688 if (needs_setuid) {
08f67696 4689 if (uid_is_valid(uid)) {
ff0af2a1
LP
4690 r = enforce_user(context, uid);
4691 if (r < 0) {
4692 *exit_status = EXIT_USER;
12145637 4693 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4694 }
165a31c0
LP
4695
4696 if (!needs_ambient_hack &&
4697 context->capability_ambient_set != 0) {
755d4b67 4698
16fcb191 4699 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4700 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4701 if (r < 0) {
4702 *exit_status = EXIT_CAPABILITIES;
12145637 4703 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4704 }
755d4b67 4705 }
5b6319dc 4706 }
165a31c0 4707 }
d35fbf6b 4708
56ef8db9
JB
4709 /* Apply working directory here, because the working directory might be on NFS and only the user running
4710 * this service might have the correct privilege to change to the working directory */
fa97f630 4711 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4712 if (r < 0)
4713 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4714
165a31c0 4715 if (needs_sandboxing) {
37ac2744 4716 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4717 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4718 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4719 * are restricted. */
4720
349cc4a5 4721#if HAVE_SELINUX
43b1f709 4722 if (use_selinux) {
5cd9cd35
LP
4723 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4724
4725 if (exec_context) {
4726 r = setexeccon(exec_context);
006d1864
TM
4727 if (r < 0) {
4728 if (!context->selinux_context_ignore) {
4729 *exit_status = EXIT_SELINUX_CONTEXT;
4730 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4731 }
4732 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4733 }
4734 }
4735 }
4736#endif
4737
349cc4a5 4738#if HAVE_APPARMOR
43b1f709 4739 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4740 r = aa_change_onexec(context->apparmor_profile);
4741 if (r < 0 && !context->apparmor_profile_ignore) {
4742 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4743 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4744 }
4745 }
4746#endif
4747
165a31c0 4748 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4749 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4750 * CAP_SETPCAP. */
4751 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4752 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4753 * effective set here.
4754 * The effective set is overwritten during execve with the following values:
4755 * - ambient set (for non-root processes)
4756 * - (inheritable | bounding) set for root processes)
4757 *
4758 * Hence there is no security impact to raise it in the effective set before execve
4759 */
4760 r = capability_gain_cap_setpcap(NULL);
4761 if (r < 0) {
4762 *exit_status = EXIT_CAPABILITIES;
4763 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4764 }
755d4b67 4765 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4766 *exit_status = EXIT_SECUREBITS;
12145637 4767 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4768 }
dbdc4098 4769 }
5b6319dc 4770
59eeb84b 4771 if (context_has_no_new_privileges(context))
d35fbf6b 4772 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4773 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4774 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4775 }
4776
349cc4a5 4777#if HAVE_SECCOMP
469830d1
LP
4778 r = apply_address_families(unit, context);
4779 if (r < 0) {
4780 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4781 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4782 }
04aa0cb9 4783
469830d1
LP
4784 r = apply_memory_deny_write_execute(unit, context);
4785 if (r < 0) {
4786 *exit_status = EXIT_SECCOMP;
12145637 4787 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4788 }
f4170c67 4789
469830d1
LP
4790 r = apply_restrict_realtime(unit, context);
4791 if (r < 0) {
4792 *exit_status = EXIT_SECCOMP;
12145637 4793 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4794 }
4795
f69567cb
LP
4796 r = apply_restrict_suid_sgid(unit, context);
4797 if (r < 0) {
4798 *exit_status = EXIT_SECCOMP;
4799 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4800 }
4801
add00535
LP
4802 r = apply_restrict_namespaces(unit, context);
4803 if (r < 0) {
4804 *exit_status = EXIT_SECCOMP;
12145637 4805 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4806 }
4807
469830d1
LP
4808 r = apply_protect_sysctl(unit, context);
4809 if (r < 0) {
4810 *exit_status = EXIT_SECCOMP;
12145637 4811 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4812 }
4813
469830d1
LP
4814 r = apply_protect_kernel_modules(unit, context);
4815 if (r < 0) {
4816 *exit_status = EXIT_SECCOMP;
12145637 4817 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4818 }
4819
84703040
KK
4820 r = apply_protect_kernel_logs(unit, context);
4821 if (r < 0) {
4822 *exit_status = EXIT_SECCOMP;
4823 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4824 }
4825
fc64760d
KK
4826 r = apply_protect_clock(unit, context);
4827 if (r < 0) {
4828 *exit_status = EXIT_SECCOMP;
4829 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4830 }
4831
469830d1
LP
4832 r = apply_private_devices(unit, context);
4833 if (r < 0) {
4834 *exit_status = EXIT_SECCOMP;
12145637 4835 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4836 }
4837
4838 r = apply_syscall_archs(unit, context);
4839 if (r < 0) {
4840 *exit_status = EXIT_SECCOMP;
12145637 4841 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4842 }
4843
78e864e5
TM
4844 r = apply_lock_personality(unit, context);
4845 if (r < 0) {
4846 *exit_status = EXIT_SECCOMP;
12145637 4847 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4848 }
4849
9df2cdd8
TM
4850 r = apply_syscall_log(unit, context);
4851 if (r < 0) {
4852 *exit_status = EXIT_SECCOMP;
4853 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4854 }
4855
5cd9cd35
LP
4856 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4857 * by the filter as little as possible. */
165a31c0 4858 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4859 if (r < 0) {
4860 *exit_status = EXIT_SECCOMP;
12145637 4861 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4862 }
4863#endif
b1994387
ILG
4864
4865#if HAVE_LIBBPF
4866 r = apply_restrict_filesystems(unit, context);
4867 if (r < 0) {
4868 *exit_status = EXIT_BPF;
4869 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
4870 }
4871#endif
4872
d35fbf6b 4873 }
034c6ed7 4874
00819cc1
LP
4875 if (!strv_isempty(context->unset_environment)) {
4876 char **ee = NULL;
4877
4878 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4879 if (!ee) {
4880 *exit_status = EXIT_MEMORY;
12145637 4881 return log_oom();
00819cc1
LP
4882 }
4883
130d3d22 4884 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4885 }
4886
7ca69792
AZ
4887 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4888 replaced_argv = replace_env_argv(command->argv, accum_env);
4889 if (!replaced_argv) {
4890 *exit_status = EXIT_MEMORY;
4891 return log_oom();
4892 }
4893 final_argv = replaced_argv;
4894 } else
4895 final_argv = command->argv;
034c6ed7 4896
f1d34068 4897 if (DEBUG_LOGGING) {
c2b2df60 4898 _cleanup_free_ char *line = NULL;
81a2b7ce 4899
4ef15008 4900 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
4901 if (!line) {
4902 *exit_status = EXIT_MEMORY;
4903 return log_oom();
4904 }
4905
4906 log_unit_struct(unit, LOG_DEBUG,
4907 "EXECUTABLE=%s", executable,
4908 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 4909 }
dd305ec9 4910
5686391b
LP
4911 if (exec_fd >= 0) {
4912 uint8_t hot = 1;
4913
4914 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4915 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4916
4917 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4918 *exit_status = EXIT_EXEC;
4919 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4920 }
4921 }
4922
a6d9111c 4923 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4924
4925 if (exec_fd >= 0) {
4926 uint8_t hot = 0;
4927
4928 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4929 * that POLLHUP on it no longer means execve() succeeded. */
4930
4931 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4932 *exit_status = EXIT_EXEC;
4933 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4934 }
4935 }
12145637 4936
ff0af2a1 4937 *exit_status = EXIT_EXEC;
9f71ba8d 4938 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4939}
81a2b7ce 4940
34cf6c43 4941static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4942static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4943
f2341e0a
LP
4944int exec_spawn(Unit *unit,
4945 ExecCommand *command,
d35fbf6b
DM
4946 const ExecContext *context,
4947 const ExecParameters *params,
4948 ExecRuntime *runtime,
29206d46 4949 DynamicCreds *dcreds,
d35fbf6b 4950 pid_t *ret) {
8351ceae 4951
ee39ca20 4952 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4953 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4954 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4955 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4956 _cleanup_free_ char *line = NULL;
d35fbf6b 4957 pid_t pid;
8351ceae 4958
f2341e0a 4959 assert(unit);
d35fbf6b
DM
4960 assert(command);
4961 assert(context);
4962 assert(ret);
4963 assert(params);
25b583d7 4964 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4965
d35fbf6b
DM
4966 if (context->std_input == EXEC_INPUT_SOCKET ||
4967 context->std_output == EXEC_OUTPUT_SOCKET ||
4968 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4969
d85ff944
YW
4970 if (params->n_socket_fds > 1)
4971 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4972
d85ff944
YW
4973 if (params->n_socket_fds == 0)
4974 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4975
d35fbf6b
DM
4976 socket_fd = params->fds[0];
4977 } else {
4978 socket_fd = -1;
4979 fds = params->fds;
9b141911 4980 n_socket_fds = params->n_socket_fds;
25b583d7 4981 n_storage_fds = params->n_storage_fds;
d35fbf6b 4982 }
94f04347 4983
34cf6c43 4984 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4985 if (r < 0)
4986 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4987
f2341e0a 4988 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4989 if (r < 0)
f2341e0a 4990 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4991
4ef15008 4992 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
4993 if (!line)
4994 return log_oom();
fab56fc5 4995
9f71ba8d
ZJS
4996 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4997 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4998 mac_selinux_maybe_reload();
4999
c2503e35
RH
5000 log_unit_struct(unit, LOG_DEBUG,
5001 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5002 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5003 the mount namespace in the child, but we want to log
5004 from the parent, so we need to use the (possibly
5005 inaccurate) path here. */
5006 LOG_UNIT_INVOCATION_ID(unit));
12145637 5007
78f93209
LP
5008 if (params->cgroup_path) {
5009 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5010 if (r < 0)
5011 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5012 if (r > 0) { /* We are using a child cgroup */
5013 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5014 if (r < 0)
5015 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
5016
5017 /* Normally we would not propagate the oomd xattrs to children but since we created this
5018 * sub-cgroup internally we should do it. */
5019 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
5020 }
5021 }
5022
d35fbf6b
DM
5023 pid = fork();
5024 if (pid < 0)
74129a12 5025 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5026
5027 if (pid == 0) {
12145637 5028 int exit_status = EXIT_SUCCESS;
ff0af2a1 5029
f2341e0a
LP
5030 r = exec_child(unit,
5031 command,
ff0af2a1
LP
5032 context,
5033 params,
5034 runtime,
29206d46 5035 dcreds,
ff0af2a1 5036 socket_fd,
52c239d7 5037 named_iofds,
4c47affc 5038 fds,
9b141911 5039 n_socket_fds,
25b583d7 5040 n_storage_fds,
ff0af2a1 5041 files_env,
00d9ef85 5042 unit->manager->user_lookup_fds[1],
12145637
LP
5043 &exit_status);
5044
e1714f02
ZJS
5045 if (r < 0) {
5046 const char *status =
5047 exit_status_to_string(exit_status,
e04ed6db 5048 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5049
c2503e35
RH
5050 log_unit_struct_errno(unit, LOG_ERR, r,
5051 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5052 LOG_UNIT_INVOCATION_ID(unit),
5053 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5054 status, command->path),
5055 "EXECUTABLE=%s", command->path);
e1714f02 5056 }
4c2630eb 5057
ff0af2a1 5058 _exit(exit_status);
034c6ed7
LP
5059 }
5060
f2341e0a 5061 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5062
78f93209
LP
5063 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5064 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5065 * process will be killed too). */
5066 if (subcgroup_path)
5067 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5068
b58b4116 5069 exec_status_start(&command->exec_status, pid);
9fb86720 5070
034c6ed7 5071 *ret = pid;
5cb5a6ff
LP
5072 return 0;
5073}
5074
034c6ed7
LP
5075void exec_context_init(ExecContext *c) {
5076 assert(c);
5077
4c12626c 5078 c->umask = 0022;
0692548c 5079 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5080 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5081 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5082 c->syslog_level_prefix = true;
353e12c2 5083 c->ignore_sigpipe = true;
3a43da28 5084 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5085 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5086 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5087 c->directories[t].mode = 0755;
12213aed 5088 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5089 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
5090 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5091 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5092 c->log_level_max = -1;
005bfaf1
TM
5093#if HAVE_SECCOMP
5094 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5095#endif
51462135
DDM
5096 c->tty_rows = UINT_MAX;
5097 c->tty_cols = UINT_MAX;
b070c7c0 5098 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5099}
5100
613b411c 5101void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5102 assert(c);
5103
6796073e
LP
5104 c->environment = strv_free(c->environment);
5105 c->environment_files = strv_free(c->environment_files);
b4c14404 5106 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5107 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5108
31ce987c 5109 rlimit_free_all(c->rlimit);
034c6ed7 5110
5b10116e 5111 for (size_t l = 0; l < 3; l++) {
52c239d7 5112 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5113 c->stdio_file[l] = mfree(c->stdio_file[l]);
5114 }
52c239d7 5115
a1e58e8e
LP
5116 c->working_directory = mfree(c->working_directory);
5117 c->root_directory = mfree(c->root_directory);
915e6d16 5118 c->root_image = mfree(c->root_image);
18d73705 5119 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5120 c->root_hash = mfree(c->root_hash);
5121 c->root_hash_size = 0;
5122 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5123 c->root_hash_sig = mfree(c->root_hash_sig);
5124 c->root_hash_sig_size = 0;
5125 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5126 c->root_verity = mfree(c->root_verity);
93f59701 5127 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
5128 c->tty_path = mfree(c->tty_path);
5129 c->syslog_identifier = mfree(c->syslog_identifier);
5130 c->user = mfree(c->user);
5131 c->group = mfree(c->group);
034c6ed7 5132
6796073e 5133 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5134
a1e58e8e 5135 c->pam_name = mfree(c->pam_name);
5b6319dc 5136
2a624c36
AP
5137 c->read_only_paths = strv_free(c->read_only_paths);
5138 c->read_write_paths = strv_free(c->read_write_paths);
5139 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5140 c->exec_paths = strv_free(c->exec_paths);
5141 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5142 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5143
d2d6c096 5144 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5145 c->bind_mounts = NULL;
5146 c->n_bind_mounts = 0;
2abd4e38
YW
5147 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5148 c->temporary_filesystems = NULL;
5149 c->n_temporary_filesystems = 0;
b3d13314 5150 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5151
0985c7c4 5152 cpu_set_reset(&c->cpu_set);
b070c7c0 5153 numa_policy_reset(&c->numa_policy);
86a3475b 5154
a1e58e8e
LP
5155 c->utmp_id = mfree(c->utmp_id);
5156 c->selinux_context = mfree(c->selinux_context);
5157 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5158 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5159
b1994387
ILG
5160 c->restrict_filesystems = set_free(c->restrict_filesystems);
5161
8cfa775f 5162 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5163 c->syscall_archs = set_free(c->syscall_archs);
5164 c->address_families = set_free(c->address_families);
e66cf1a3 5165
5b10116e 5166 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5167 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5168
5169 c->log_level_max = -1;
5170
5171 exec_context_free_log_extra_fields(c);
08f3be7a 5172
5ac1530e
ZJS
5173 c->log_ratelimit_interval_usec = 0;
5174 c->log_ratelimit_burst = 0;
90fc172e 5175
08f3be7a
LP
5176 c->stdin_data = mfree(c->stdin_data);
5177 c->stdin_data_size = 0;
a8d08f39
LP
5178
5179 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5180 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5181
5182 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5183
43144be4 5184 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5185 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5186}
5187
34cf6c43 5188int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5189 assert(c);
5190
5191 if (!runtime_prefix)
5192 return 0;
5193
211a3d87 5194 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5195 _cleanup_free_ char *p = NULL;
e66cf1a3 5196
494d0247 5197 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5198 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5199 else
211a3d87 5200 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5201 if (!p)
5202 return -ENOMEM;
5203
7bc4bf4a
LP
5204 /* We execute this synchronously, since we need to be sure this is gone when we start the
5205 * service next. */
c6878637 5206 (void) rm_rf(p, REMOVE_ROOT);
211a3d87
LB
5207
5208 char **symlink;
5209 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5210 _cleanup_free_ char *symlink_abs = NULL;
5211
5212 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5213 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5214 else
5215 symlink_abs = path_join(runtime_prefix, *symlink);
5216 if (!symlink_abs)
5217 return -ENOMEM;
5218
5219 (void) unlink(symlink_abs);
5220 }
5221
e66cf1a3
LP
5222 }
5223
5224 return 0;
5cb5a6ff
LP
5225}
5226
bb0c0d6f
LP
5227int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5228 _cleanup_free_ char *p = NULL;
5229
5230 assert(c);
5231
5232 if (!runtime_prefix || !unit)
5233 return 0;
5234
5235 p = path_join(runtime_prefix, "credentials", unit);
5236 if (!p)
5237 return -ENOMEM;
5238
5239 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5240 * unmount it, and afterwards remove the mount point */
5241 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5242 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5243
5244 return 0;
5245}
5246
34cf6c43 5247static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5248 assert(c);
5249
a1e58e8e 5250 c->path = mfree(c->path);
6796073e 5251 c->argv = strv_free(c->argv);
43d0fcbd
LP
5252}
5253
da6053d0 5254void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5255 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5256 exec_command_done(c+i);
5257}
5258
f1acf85a 5259ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5260 ExecCommand *i;
5261
5262 while ((i = c)) {
71fda00f 5263 LIST_REMOVE(command, c, i);
43d0fcbd 5264 exec_command_done(i);
5cb5a6ff
LP
5265 free(i);
5266 }
f1acf85a
ZJS
5267
5268 return NULL;
5cb5a6ff
LP
5269}
5270
da6053d0 5271void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5272 for (size_t i = 0; i < n; i++)
f1acf85a 5273 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5274}
5275
6a1d4d9f 5276void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5277 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5278 exec_status_reset(&c[i].exec_status);
5279}
5280
5281void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5282 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5283 ExecCommand *z;
5284
5285 LIST_FOREACH(command, z, c[i])
5286 exec_status_reset(&z->exec_status);
5287 }
5288}
5289
039f0e70 5290typedef struct InvalidEnvInfo {
34cf6c43 5291 const Unit *unit;
039f0e70
LP
5292 const char *path;
5293} InvalidEnvInfo;
5294
5295static void invalid_env(const char *p, void *userdata) {
5296 InvalidEnvInfo *info = userdata;
5297
f2341e0a 5298 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5299}
5300
52c239d7
LB
5301const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5302 assert(c);
5303
5304 switch (fd_index) {
5073ff6b 5305
52c239d7
LB
5306 case STDIN_FILENO:
5307 if (c->std_input != EXEC_INPUT_NAMED_FD)
5308 return NULL;
5073ff6b 5309
52c239d7 5310 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5311
52c239d7
LB
5312 case STDOUT_FILENO:
5313 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5314 return NULL;
5073ff6b 5315
52c239d7 5316 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5317
52c239d7
LB
5318 case STDERR_FILENO:
5319 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5320 return NULL;
5073ff6b 5321
52c239d7 5322 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5323
52c239d7
LB
5324 default:
5325 return NULL;
5326 }
5327}
5328
2caa38e9
LP
5329static int exec_context_named_iofds(
5330 const ExecContext *c,
5331 const ExecParameters *p,
5332 int named_iofds[static 3]) {
5333
5b10116e 5334 size_t targets;
56fbd561 5335 const char* stdio_fdname[3];
da6053d0 5336 size_t n_fds;
52c239d7
LB
5337
5338 assert(c);
5339 assert(p);
2caa38e9 5340 assert(named_iofds);
52c239d7
LB
5341
5342 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5343 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5344 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5345
5b10116e 5346 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5347 stdio_fdname[i] = exec_context_fdname(c, i);
5348
4c47affc
FB
5349 n_fds = p->n_storage_fds + p->n_socket_fds;
5350
5b10116e 5351 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5352 if (named_iofds[STDIN_FILENO] < 0 &&
5353 c->std_input == EXEC_INPUT_NAMED_FD &&
5354 stdio_fdname[STDIN_FILENO] &&
5355 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5356
52c239d7
LB
5357 named_iofds[STDIN_FILENO] = p->fds[i];
5358 targets--;
56fbd561
ZJS
5359
5360 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5361 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5362 stdio_fdname[STDOUT_FILENO] &&
5363 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5364
52c239d7
LB
5365 named_iofds[STDOUT_FILENO] = p->fds[i];
5366 targets--;
56fbd561
ZJS
5367
5368 } else if (named_iofds[STDERR_FILENO] < 0 &&
5369 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5370 stdio_fdname[STDERR_FILENO] &&
5371 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5372
52c239d7
LB
5373 named_iofds[STDERR_FILENO] = p->fds[i];
5374 targets--;
5375 }
5376
56fbd561 5377 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5378}
5379
34cf6c43 5380static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5381 char **i, **r = NULL;
5382
5383 assert(c);
5384 assert(l);
5385
5386 STRV_FOREACH(i, c->environment_files) {
5387 char *fn;
52511fae 5388 int k;
8c7be95e
LP
5389 bool ignore = false;
5390 char **p;
7fd1b19b 5391 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5392
5393 fn = *i;
5394
5395 if (fn[0] == '-') {
5396 ignore = true;
313cefa1 5397 fn++;
8c7be95e
LP
5398 }
5399
5400 if (!path_is_absolute(fn)) {
8c7be95e
LP
5401 if (ignore)
5402 continue;
5403
5404 strv_free(r);
5405 return -EINVAL;
5406 }
5407
2bef10ab 5408 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5409 k = safe_glob(fn, 0, &pglob);
5410 if (k < 0) {
2bef10ab
PL
5411 if (ignore)
5412 continue;
8c7be95e 5413
2bef10ab 5414 strv_free(r);
d8c92e8b 5415 return k;
2bef10ab 5416 }
8c7be95e 5417
d8c92e8b
ZJS
5418 /* When we don't match anything, -ENOENT should be returned */
5419 assert(pglob.gl_pathc > 0);
5420
5b10116e 5421 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5422 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5423 if (k < 0) {
5424 if (ignore)
5425 continue;
8c7be95e 5426
2bef10ab 5427 strv_free(r);
2bef10ab 5428 return k;
e9c1ea9d 5429 }
ebc05a09 5430 /* Log invalid environment variables with filename */
039f0e70
LP
5431 if (p) {
5432 InvalidEnvInfo info = {
f2341e0a 5433 .unit = unit,
039f0e70
LP
5434 .path = pglob.gl_pathv[n]
5435 };
5436
5437 p = strv_env_clean_with_callback(p, invalid_env, &info);
5438 }
8c7be95e 5439
234519ae 5440 if (!r)
2bef10ab
PL
5441 r = p;
5442 else {
5443 char **m;
8c7be95e 5444
4ab3d29f 5445 m = strv_env_merge(r, p);
2bef10ab
PL
5446 strv_free(r);
5447 strv_free(p);
c84a9488 5448 if (!m)
2bef10ab 5449 return -ENOMEM;
2bef10ab
PL
5450
5451 r = m;
5452 }
8c7be95e
LP
5453 }
5454 }
5455
5456 *l = r;
5457
5458 return 0;
5459}
5460
6ac8fdc9 5461static bool tty_may_match_dev_console(const char *tty) {
7b912648 5462 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5463
1e22b5cd
LP
5464 if (!tty)
5465 return true;
5466
a119ec7c 5467 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5468
5469 /* trivial identity? */
5470 if (streq(tty, "console"))
5471 return true;
5472
7b912648
LP
5473 if (resolve_dev_console(&resolved) < 0)
5474 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5475
5476 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5477 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5478}
5479
6c0ae739
LP
5480static bool exec_context_may_touch_tty(const ExecContext *ec) {
5481 assert(ec);
1e22b5cd 5482
6c0ae739 5483 return ec->tty_reset ||
1e22b5cd
LP
5484 ec->tty_vhangup ||
5485 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5486 is_terminal_input(ec->std_input) ||
5487 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5488 is_terminal_output(ec->std_error);
5489}
5490
5491bool exec_context_may_touch_console(const ExecContext *ec) {
5492
5493 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5494 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5495}
5496
/* Writes every string of the NULL-terminated vector l to f, each one preceded by a single space.
 * A NULL vector writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **s = l; s && *s; s++) {
                fputc(' ', f);
                fputs(*s, f);
        }
}
5505
/* Writes one line of the form "<prefix><name>: item item ..." to f. Nothing is written if the string
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5517
34cf6c43 5518void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5291f26d 5519 char **e, **d;
add00535 5520 int r;
9eba9da4 5521
5cb5a6ff
LP
5522 assert(c);
5523 assert(f);
5524
4ad49000 5525 prefix = strempty(prefix);
5cb5a6ff
LP
5526
5527 fprintf(f,
94f04347
LP
5528 "%sUMask: %04o\n"
5529 "%sWorkingDirectory: %s\n"
451a074f 5530 "%sRootDirectory: %s\n"
15ae422b 5531 "%sNonBlocking: %s\n"
64747e2d 5532 "%sPrivateTmp: %s\n"
7f112f50 5533 "%sPrivateDevices: %s\n"
59eeb84b 5534 "%sProtectKernelTunables: %s\n"
e66a2f65 5535 "%sProtectKernelModules: %s\n"
84703040 5536 "%sProtectKernelLogs: %s\n"
fc64760d 5537 "%sProtectClock: %s\n"
59eeb84b 5538 "%sProtectControlGroups: %s\n"
d251207d
LP
5539 "%sPrivateNetwork: %s\n"
5540 "%sPrivateUsers: %s\n"
1b8689f9
LP
5541 "%sProtectHome: %s\n"
5542 "%sProtectSystem: %s\n"
5d997827 5543 "%sMountAPIVFS: %s\n"
f3e43635 5544 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5545 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5546 "%sRestrictRealtime: %s\n"
f69567cb 5547 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5548 "%sKeyringMode: %s\n"
4e399953
LP
5549 "%sProtectHostname: %s\n"
5550 "%sProtectProc: %s\n"
5551 "%sProcSubset: %s\n",
5cb5a6ff 5552 prefix, c->umask,
14eb3285
LP
5553 prefix, empty_to_root(c->working_directory),
5554 prefix, empty_to_root(c->root_directory),
15ae422b 5555 prefix, yes_no(c->non_blocking),
64747e2d 5556 prefix, yes_no(c->private_tmp),
7f112f50 5557 prefix, yes_no(c->private_devices),
59eeb84b 5558 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5559 prefix, yes_no(c->protect_kernel_modules),
84703040 5560 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5561 prefix, yes_no(c->protect_clock),
59eeb84b 5562 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5563 prefix, yes_no(c->private_network),
5564 prefix, yes_no(c->private_users),
1b8689f9
LP
5565 prefix, protect_home_to_string(c->protect_home),
5566 prefix, protect_system_to_string(c->protect_system),
5e98086d 5567 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5568 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5569 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5570 prefix, yes_no(c->restrict_realtime),
f69567cb 5571 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5572 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5573 prefix, yes_no(c->protect_hostname),
5574 prefix, protect_proc_to_string(c->protect_proc),
5575 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5576
915e6d16
LP
5577 if (c->root_image)
5578 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5579
18d73705
LB
5580 if (c->root_image_options) {
5581 MountOptions *o;
5582
5583 fprintf(f, "%sRootImageOptions:", prefix);
5584 LIST_FOREACH(mount_options, o, c->root_image_options)
5585 if (!isempty(o->options))
9ece6444
LB
5586 fprintf(f, " %s:%s",
5587 partition_designator_to_string(o->partition_designator),
5588 o->options);
18d73705
LB
5589 fprintf(f, "\n");
5590 }
5591
0389f4fa
LB
5592 if (c->root_hash) {
5593 _cleanup_free_ char *encoded = NULL;
5594 encoded = hexmem(c->root_hash, c->root_hash_size);
5595 if (encoded)
5596 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5597 }
5598
5599 if (c->root_hash_path)
5600 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5601
d4d55b0d
LB
5602 if (c->root_hash_sig) {
5603 _cleanup_free_ char *encoded = NULL;
5604 ssize_t len;
5605 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5606 if (len)
5607 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5608 }
5609
5610 if (c->root_hash_sig_path)
5611 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5612
0389f4fa
LB
5613 if (c->root_verity)
5614 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5615
8c7be95e
LP
5616 STRV_FOREACH(e, c->environment)
5617 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5618
5619 STRV_FOREACH(e, c->environment_files)
5620 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5621
b4c14404
FB
5622 STRV_FOREACH(e, c->pass_environment)
5623 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5624
00819cc1
LP
5625 STRV_FOREACH(e, c->unset_environment)
5626 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5627
53f47dfc
YW
5628 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5629
5b10116e 5630 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5631 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5632
211a3d87
LB
5633 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5634 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5635
5636 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5637 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5638 }
3536f49e 5639 }
c2bbd90b 5640
5291f26d 5641 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5642
fb33a393 5643 if (c->nice_set)
5291f26d 5644 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5645
dd6c17b1 5646 if (c->oom_score_adjust_set)
5291f26d 5647 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5648
ad21e542 5649 if (c->coredump_filter_set)
5291f26d 5650 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5651
5b10116e 5652 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5653 if (c->rlimit[i]) {
4c3a2b84 5654 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5655 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5656 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5657 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5658 }
94f04347 5659
f8b69d1d 5660 if (c->ioprio_set) {
1756a011 5661 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5662
5bead76e 5663 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5664 if (r >= 0)
5665 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5666
5bead76e 5667 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5668 }
94f04347 5669
f8b69d1d 5670 if (c->cpu_sched_set) {
1756a011 5671 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5672
837df140
YW
5673 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5674 if (r >= 0)
5675 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5676
94f04347 5677 fprintf(f,
38b48754
LP
5678 "%sCPUSchedulingPriority: %i\n"
5679 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5680 prefix, c->cpu_sched_priority,
5681 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5682 }
94f04347 5683
0985c7c4 5684 if (c->cpu_set.set) {
e7fca352
MS
5685 _cleanup_free_ char *affinity = NULL;
5686
5687 affinity = cpu_set_to_range_string(&c->cpu_set);
5688 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5689 }
5690
b070c7c0
MS
5691 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5692 _cleanup_free_ char *nodes = NULL;
5693
5694 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5695 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5696 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5697 }
5698
3a43da28 5699 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5700 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5701
5702 fprintf(f,
80876c20
LP
5703 "%sStandardInput: %s\n"
5704 "%sStandardOutput: %s\n"
5705 "%sStandardError: %s\n",
5706 prefix, exec_input_to_string(c->std_input),
5707 prefix, exec_output_to_string(c->std_output),
5708 prefix, exec_output_to_string(c->std_error));
5709
befc4a80
LP
5710 if (c->std_input == EXEC_INPUT_NAMED_FD)
5711 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5712 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5713 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5714 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5715 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5716
5717 if (c->std_input == EXEC_INPUT_FILE)
5718 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5719 if (c->std_output == EXEC_OUTPUT_FILE)
5720 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5721 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5722 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5723 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5724 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5725 if (c->std_error == EXEC_OUTPUT_FILE)
5726 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5727 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5728 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5729 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5730 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5731
80876c20
LP
5732 if (c->tty_path)
5733 fprintf(f,
6ea832a2
LP
5734 "%sTTYPath: %s\n"
5735 "%sTTYReset: %s\n"
5736 "%sTTYVHangup: %s\n"
51462135
DDM
5737 "%sTTYVTDisallocate: %s\n"
5738 "%sTTYRows: %u\n"
5739 "%sTTYColumns: %u\n",
6ea832a2
LP
5740 prefix, c->tty_path,
5741 prefix, yes_no(c->tty_reset),
5742 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5743 prefix, yes_no(c->tty_vt_disallocate),
5744 prefix, c->tty_rows,
5745 prefix, c->tty_cols);
94f04347 5746
9f6444eb 5747 if (IN_SET(c->std_output,
9f6444eb
LP
5748 EXEC_OUTPUT_KMSG,
5749 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5750 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5751 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5752 IN_SET(c->std_error,
9f6444eb
LP
5753 EXEC_OUTPUT_KMSG,
5754 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5755 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5756 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5757
5ce70e5b 5758 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5759
837df140
YW
5760 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5761 if (r >= 0)
5762 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5763
837df140
YW
5764 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5765 if (r >= 0)
5766 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5767 }
94f04347 5768
d3070fbd
LP
5769 if (c->log_level_max >= 0) {
5770 _cleanup_free_ char *t = NULL;
5771
5772 (void) log_level_to_string_alloc(c->log_level_max, &t);
5773
5774 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5775 }
5776
5291f26d 5777 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5778 fprintf(f,
5779 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5780 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5781
5ac1530e
ZJS
5782 if (c->log_ratelimit_burst > 0)
5783 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5784
5b10116e
ZJS
5785 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5786 fprintf(f, "%sLogExtraFields: ", prefix);
5787 fwrite(c->log_extra_fields[j].iov_base,
5788 1, c->log_extra_fields[j].iov_len,
5789 f);
5790 fputc('\n', f);
d3070fbd
LP
5791 }
5792
91dd5f7c
LP
5793 if (c->log_namespace)
5794 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5795
07d46372
YW
5796 if (c->secure_bits) {
5797 _cleanup_free_ char *str = NULL;
5798
5799 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5800 if (r >= 0)
5801 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5802 }
94f04347 5803
a103496c 5804 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5805 _cleanup_free_ char *str = NULL;
94f04347 5806
dd1f5bd0
YW
5807 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5808 if (r >= 0)
5809 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5810 }
5811
5812 if (c->capability_ambient_set != 0) {
dd1f5bd0 5813 _cleanup_free_ char *str = NULL;
755d4b67 5814
dd1f5bd0
YW
5815 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5816 if (r >= 0)
5817 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5818 }
5819
5820 if (c->user)
f2d3769a 5821 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5822 if (c->group)
f2d3769a 5823 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5824
29206d46
LP
5825 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5826
ddc155b2 5827 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5828
5b6319dc 5829 if (c->pam_name)
f2d3769a 5830 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5831
ddc155b2
TM
5832 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5833 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5834 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5835 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5836 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 5837 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 5838
5b10116e
ZJS
5839 for (size_t i = 0; i < c->n_bind_mounts; i++)
5840 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5841 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5842 c->bind_mounts[i].ignore_enoent ? "-": "",
5843 c->bind_mounts[i].source,
5844 c->bind_mounts[i].destination,
5845 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5846
5b10116e
ZJS
5847 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5848 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5849
5b10116e
ZJS
5850 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5851 t->path,
5852 isempty(t->options) ? "" : ":",
5853 strempty(t->options));
5854 }
2abd4e38 5855
169c1bda
LP
5856 if (c->utmp_id)
5857 fprintf(f,
5858 "%sUtmpIdentifier: %s\n",
5859 prefix, c->utmp_id);
7b52a628
MS
5860
5861 if (c->selinux_context)
5862 fprintf(f,
5f8640fb
LP
5863 "%sSELinuxContext: %s%s\n",
5864 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5865
80c21aea
WC
5866 if (c->apparmor_profile)
5867 fprintf(f,
5868 "%sAppArmorProfile: %s%s\n",
5869 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5870
5871 if (c->smack_process_label)
5872 fprintf(f,
5873 "%sSmackProcessLabel: %s%s\n",
5874 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5875
050f7277 5876 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5877 fprintf(f,
5878 "%sPersonality: %s\n",
5879 prefix, strna(personality_to_string(c->personality)));
5880
78e864e5
TM
5881 fprintf(f,
5882 "%sLockPersonality: %s\n",
5883 prefix, yes_no(c->lock_personality));
5884
17df7223 5885 if (c->syscall_filter) {
349cc4a5 5886#if HAVE_SECCOMP
8cfa775f 5887 void *id, *val;
17df7223 5888 bool first = true;
351a19b1 5889#endif
17df7223
LP
5890
5891 fprintf(f,
57183d11 5892 "%sSystemCallFilter: ",
17df7223
LP
5893 prefix);
5894
6b000af4 5895 if (!c->syscall_allow_list)
17df7223
LP
5896 fputc('~', f);
5897
349cc4a5 5898#if HAVE_SECCOMP
90e74a66 5899 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5900 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5901 const char *errno_name = NULL;
5902 int num = PTR_TO_INT(val);
17df7223
LP
5903
5904 if (first)
5905 first = false;
5906 else
5907 fputc(' ', f);
5908
57183d11 5909 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5910 fputs(strna(name), f);
8cfa775f
YW
5911
5912 if (num >= 0) {
005bfaf1 5913 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5914 if (errno_name)
5915 fprintf(f, ":%s", errno_name);
5916 else
5917 fprintf(f, ":%d", num);
5918 }
17df7223 5919 }
351a19b1 5920#endif
17df7223
LP
5921
5922 fputc('\n', f);
5923 }
5924
57183d11 5925 if (c->syscall_archs) {
349cc4a5 5926#if HAVE_SECCOMP
57183d11
LP
5927 void *id;
5928#endif
5929
5930 fprintf(f,
5931 "%sSystemCallArchitectures:",
5932 prefix);
5933
349cc4a5 5934#if HAVE_SECCOMP
90e74a66 5935 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5936 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5937#endif
5938 fputc('\n', f);
5939 }
5940
add00535
LP
5941 if (exec_context_restrict_namespaces_set(c)) {
5942 _cleanup_free_ char *s = NULL;
5943
86c2a9f1 5944 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5945 if (r >= 0)
5946 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5947 prefix, strna(s));
add00535
LP
5948 }
5949
b1994387
ILG
5950#if HAVE_LIBBPF
5951 if (exec_context_restrict_filesystems_set(c))
5952 SET_FOREACH(e, c->restrict_filesystems)
5953 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
5954#endif
5955
a8d08f39
LP
5956 if (c->network_namespace_path)
5957 fprintf(f,
5958 "%sNetworkNamespacePath: %s\n",
5959 prefix, c->network_namespace_path);
5960
3df90f24 5961 if (c->syscall_errno > 0) {
005bfaf1 5962#if HAVE_SECCOMP
3df90f24 5963 const char *errno_name;
005bfaf1 5964#endif
3df90f24
YW
5965
5966 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5967
005bfaf1
TM
5968#if HAVE_SECCOMP
5969 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5970 if (errno_name)
005bfaf1 5971 fputs(errno_name, f);
3df90f24 5972 else
005bfaf1
TM
5973 fprintf(f, "%d", c->syscall_errno);
5974#endif
5975 fputc('\n', f);
3df90f24 5976 }
b3d13314 5977
5b10116e 5978 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5979 MountOptions *o;
5980
79e20ceb 5981 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5982 c->mount_images[i].ignore_enoent ? "-": "",
5983 c->mount_images[i].source,
79e20ceb 5984 c->mount_images[i].destination);
427353f6 5985 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5986 fprintf(f, ":%s:%s",
427353f6 5987 partition_designator_to_string(o->partition_designator),
79e20ceb 5988 strempty(o->options));
427353f6
LB
5989 fprintf(f, "\n");
5990 }
93f59701
LB
5991
5992 for (size_t i = 0; i < c->n_extension_images; i++) {
5993 MountOptions *o;
5994
5995 fprintf(f, "%sExtensionImages: %s%s", prefix,
5996 c->extension_images[i].ignore_enoent ? "-": "",
5997 c->extension_images[i].source);
5998 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5999 fprintf(f, ":%s:%s",
6000 partition_designator_to_string(o->partition_designator),
6001 strempty(o->options));
6002 fprintf(f, "\n");
6003 }
5cb5a6ff
LP
6004}
6005
34cf6c43 6006bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6007 assert(c);
6008
61233823 6009 /* Returns true if the process forked off would run under
a931ad47
LP
6010 * an unchanged UID or as root. */
6011
6012 if (!c->user)
6013 return true;
6014
6015 if (streq(c->user, "root") || streq(c->user, "0"))
6016 return true;
6017
6018 return false;
6019}
6020
34cf6c43 6021int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6022 int p;
6023
6024 assert(c);
6025
6026 if (c->ioprio_set)
6027 return c->ioprio;
6028
6029 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6030 if (p < 0)
0692548c 6031 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6032
8b330d7d 6033 return ioprio_normalize(p);
7f452159
LP
6034}
6035
5e98086d
ZJS
6036bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6037 assert(c);
6038
61198784 6039 /* Explicit setting wins */
5e98086d
ZJS
6040 if (c->mount_apivfs_set)
6041 return c->mount_apivfs;
6042
61198784 6043 /* Default to "yes" if root directory or image are specified */
74e12520 6044 if (exec_context_with_rootfs(c))
61198784
ZJS
6045 return true;
6046
5e98086d
ZJS
6047 return false;
6048}
6049
d3070fbd 6050void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6051 assert(c);
6052
5b10116e 6053 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6054 free(c->log_extra_fields[l].iov_base);
6055 c->log_extra_fields = mfree(c->log_extra_fields);
6056 c->n_log_extra_fields = 0;
6057}
6058
6f765baf 6059void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
6060 _cleanup_close_ int fd = -1;
6061 const char *path;
6062 struct stat st;
6f765baf
LP
6063 int r;
6064
6065 assert(c);
6066
6067 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6068 exec_context_tty_reset(c, NULL);
6069
6070 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6071 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6072 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6073 if (!exec_context_may_touch_tty(c))
6074 return;
6f765baf 6075
0ba976e8
LP
6076 path = exec_context_tty_path(c);
6077 if (!path)
6078 return;
6f765baf 6079
0ba976e8
LP
6080 fd = open(path, O_PATH|O_CLOEXEC);
6081 if (fd < 0)
6082 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6083 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6084 path);
6085
6086 if (fstat(fd, &st) < 0)
6087 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6088
6089 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6090 * if things are a character device, since a proper check either means we'd have to open the TTY and
6091 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6092 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6093 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6094 if (!S_ISCHR(st.st_mode))
6095 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6096
6097 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6098 if (r < 0)
6099 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6100}
6101
4c2f5842
LP
6102int exec_context_get_clean_directories(
6103 ExecContext *c,
6104 char **prefix,
6105 ExecCleanMask mask,
6106 char ***ret) {
6107
6108 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6109 int r;
6110
6111 assert(c);
6112 assert(prefix);
6113 assert(ret);
6114
5b10116e 6115 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6116 if (!FLAGS_SET(mask, 1U << t))
6117 continue;
6118
6119 if (!prefix[t])
6120 continue;
6121
211a3d87 6122 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6123 char *j;
6124
211a3d87 6125 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6126 if (!j)
6127 return -ENOMEM;
6128
6129 r = strv_consume(&l, j);
6130 if (r < 0)
6131 return r;
7f622a19
YW
6132
6133 /* Also remove private directories unconditionally. */
6134 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6135 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6136 if (!j)
6137 return -ENOMEM;
6138
6139 r = strv_consume(&l, j);
6140 if (r < 0)
6141 return r;
6142 }
6143
6144 char **symlink;
6145 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6146 j = path_join(prefix[t], *symlink);
7f622a19
YW
6147 if (!j)
6148 return -ENOMEM;
6149
6150 r = strv_consume(&l, j);
6151 if (r < 0)
6152 return r;
6153 }
4c2f5842
LP
6154 }
6155 }
6156
6157 *ret = TAKE_PTR(l);
6158 return 0;
6159}
6160
6161int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6162 ExecCleanMask mask = 0;
6163
6164 assert(c);
6165 assert(ret);
6166
6167 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6168 if (c->directories[t].n_items > 0)
4c2f5842
LP
6169 mask |= 1U << t;
6170
6171 *ret = mask;
6172 return 0;
6173}
6174
b58b4116 6175void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6176 assert(s);
5cb5a6ff 6177
2ed26ed0
LP
6178 *s = (ExecStatus) {
6179 .pid = pid,
6180 };
6181
b58b4116
LP
6182 dual_timestamp_get(&s->start_timestamp);
6183}
6184
34cf6c43 6185void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6186 assert(s);
6187
d46b79bb 6188 if (s->pid != pid)
2ed26ed0
LP
6189 *s = (ExecStatus) {
6190 .pid = pid,
6191 };
b58b4116 6192
63983207 6193 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6194
034c6ed7
LP
6195 s->code = code;
6196 s->status = status;
169c1bda 6197
6f765baf
LP
6198 if (context && context->utmp_id)
6199 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6200}
6201
6a1d4d9f
LP
6202void exec_status_reset(ExecStatus *s) {
6203 assert(s);
6204
6205 *s = (ExecStatus) {};
6206}
6207
34cf6c43 6208void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6209 assert(s);
6210 assert(f);
6211
9fb86720
LP
6212 if (s->pid <= 0)
6213 return;
6214
4c940960
LP
6215 prefix = strempty(prefix);
6216
9fb86720 6217 fprintf(f,
ccd06097
ZJS
6218 "%sPID: "PID_FMT"\n",
6219 prefix, s->pid);
9fb86720 6220
af9d16e1 6221 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6222 fprintf(f,
6223 "%sStart Timestamp: %s\n",
04f5c018 6224 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6225
af9d16e1 6226 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6227 fprintf(f,
6228 "%sExit Timestamp: %s\n"
6229 "%sExit Code: %s\n"
6230 "%sExit Status: %i\n",
04f5c018 6231 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6232 prefix, sigchld_code_to_string(s->code),
6233 prefix, s->status);
5cb5a6ff 6234}
44d8db9e 6235
34cf6c43 6236static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6237 _cleanup_free_ char *cmd = NULL;
4c940960 6238 const char *prefix2;
44d8db9e
LP
6239
6240 assert(c);
6241 assert(f);
6242
4c940960 6243 prefix = strempty(prefix);
63c372cb 6244 prefix2 = strjoina(prefix, "\t");
44d8db9e 6245
4ef15008 6246 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
44d8db9e
LP
6247 fprintf(f,
6248 "%sCommand Line: %s\n",
7c248223 6249 prefix, cmd ?: strerror_safe(ENOMEM));
44d8db9e 6250
9fb86720 6251 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6252}
6253
6254void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6255 assert(f);
6256
4c940960 6257 prefix = strempty(prefix);
44d8db9e
LP
6258
6259 LIST_FOREACH(command, c, c)
6260 exec_command_dump(c, f, prefix);
6261}
94f04347 6262
a6a80b4f
LP
6263void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6264 ExecCommand *end;
6265
6266 assert(l);
6267 assert(e);
6268
6269 if (*l) {
35b8ca3a 6270 /* It's kind of important, that we keep the order here */
71fda00f
LP
6271 LIST_FIND_TAIL(command, *l, end);
6272 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6273 } else
6274 *l = e;
6275}
6276
26fd040d
LP
6277int exec_command_set(ExecCommand *c, const char *path, ...) {
6278 va_list ap;
6279 char **l, *p;
6280
6281 assert(c);
6282 assert(path);
6283
6284 va_start(ap, path);
6285 l = strv_new_ap(path, ap);
6286 va_end(ap);
6287
6288 if (!l)
6289 return -ENOMEM;
6290
250a918d
LP
6291 p = strdup(path);
6292 if (!p) {
26fd040d
LP
6293 strv_free(l);
6294 return -ENOMEM;
6295 }
6296
6897dfe8 6297 free_and_replace(c->path, p);
26fd040d 6298
130d3d22 6299 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6300}
6301
86b23b07 6302int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6303 _cleanup_strv_free_ char **l = NULL;
86b23b07 6304 va_list ap;
86b23b07
JS
6305 int r;
6306
6307 assert(c);
6308 assert(path);
6309
6310 va_start(ap, path);
6311 l = strv_new_ap(path, ap);
6312 va_end(ap);
6313
6314 if (!l)
6315 return -ENOMEM;
6316
e287086b 6317 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6318 if (r < 0)
86b23b07 6319 return r;
86b23b07
JS
6320
6321 return 0;
6322}
6323
e8a565cb
YW
6324static void *remove_tmpdir_thread(void *p) {
6325 _cleanup_free_ char *path = p;
86b23b07 6326
e8a565cb
YW
6327 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6328 return NULL;
6329}
6330
6331static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6332 int r;
6333
6334 if (!rt)
6335 return NULL;
6336
6337 if (rt->manager)
6338 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6339
6340 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6341
6342 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6343 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6344
6345 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6346 if (r < 0)
e8a565cb 6347 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6348 else
6349 rt->tmp_dir = NULL;
e8a565cb 6350 }
613b411c 6351
56a13a49 6352 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6353 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6354
6355 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6356 if (r < 0)
e8a565cb 6357 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6358 else
6359 rt->var_tmp_dir = NULL;
e8a565cb
YW
6360 }
6361
6362 rt->id = mfree(rt->id);
6363 rt->tmp_dir = mfree(rt->tmp_dir);
6364 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6365 safe_close_pair(rt->netns_storage_socket);
a70581ff 6366 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6367 return mfree(rt);
6368}
6369
6370static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6371 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6372}
6373
56a13a49
ZJS
6374static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6375 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6376 ExecRuntime *n;
613b411c 6377
8e8009dc 6378 assert(ret);
613b411c 6379
56a13a49
ZJS
6380 id_copy = strdup(id);
6381 if (!id_copy)
6382 return -ENOMEM;
6383
8e8009dc
LP
6384 n = new(ExecRuntime, 1);
6385 if (!n)
613b411c
LP
6386 return -ENOMEM;
6387
8e8009dc 6388 *n = (ExecRuntime) {
56a13a49 6389 .id = TAKE_PTR(id_copy),
8e8009dc 6390 .netns_storage_socket = { -1, -1 },
a70581ff 6391 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6392 };
6393
6394 *ret = n;
613b411c
LP
6395 return 0;
6396}
6397
e8a565cb
YW
6398static int exec_runtime_add(
6399 Manager *m,
6400 const char *id,
56a13a49
ZJS
6401 char **tmp_dir,
6402 char **var_tmp_dir,
6403 int netns_storage_socket[2],
a70581ff 6404 int ipcns_storage_socket[2],
e8a565cb
YW
6405 ExecRuntime **ret) {
6406
6407 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6408 int r;
6409
e8a565cb 6410 assert(m);
613b411c
LP
6411 assert(id);
6412
a70581ff 6413 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6414
56a13a49 6415 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6416 if (r < 0)
6417 return r;
6418
63083706 6419 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6420 if (r < 0)
6421 return r;
e8a565cb 6422
56a13a49
ZJS
6423 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6424 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6425 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6426
6427 if (netns_storage_socket) {
56a13a49
ZJS
6428 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6429 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6430 }
6431
a70581ff
XR
6432 if (ipcns_storage_socket) {
6433 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6434 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6435 }
6436
e8a565cb
YW
6437 rt->manager = m;
6438
6439 if (ret)
6440 *ret = rt;
e8a565cb 6441 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6442 TAKE_PTR(rt);
e8a565cb
YW
6443 return 0;
6444}
6445
74aaf59b
LP
6446static int exec_runtime_make(
6447 Manager *m,
6448 const ExecContext *c,
6449 const char *id,
6450 ExecRuntime **ret) {
6451
56a13a49 6452 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6453 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6454 int r;
6455
6456 assert(m);
6457 assert(c);
6458 assert(id);
6459
6460 /* It is not necessary to create ExecRuntime object. */
a70581ff 6461 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6462 *ret = NULL;
e8a565cb 6463 return 0;
74aaf59b 6464 }
e8a565cb 6465
efa2f3a1
TM
6466 if (c->private_tmp &&
6467 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6468 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6469 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6470 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6471 if (r < 0)
6472 return r;
6473 }
6474
a8d08f39 6475 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6476 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6477 return -errno;
6478 }
6479
a70581ff
XR
6480 if (c->private_ipc || c->ipc_namespace_path) {
6481 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6482 return -errno;
6483 }
6484
6485 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6486 if (r < 0)
6487 return r;
6488
613b411c
LP
6489 return 1;
6490}
6491
e8a565cb
YW
6492int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6493 ExecRuntime *rt;
6494 int r;
613b411c 6495
e8a565cb
YW
6496 assert(m);
6497 assert(id);
6498 assert(ret);
6499
6500 rt = hashmap_get(m->exec_runtime_by_id, id);
6501 if (rt)
387f6955 6502 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6503 goto ref;
6504
74aaf59b
LP
6505 if (!create) {
6506 *ret = NULL;
e8a565cb 6507 return 0;
74aaf59b 6508 }
e8a565cb
YW
6509
6510 /* If not found, then create a new object. */
6511 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6512 if (r < 0)
e8a565cb 6513 return r;
74aaf59b
LP
6514 if (r == 0) {
6515 /* When r == 0, it is not necessary to create ExecRuntime object. */
6516 *ret = NULL;
6517 return 0;
6518 }
613b411c 6519
e8a565cb
YW
6520ref:
6521 /* increment reference counter. */
6522 rt->n_ref++;
6523 *ret = rt;
6524 return 1;
6525}
613b411c 6526
e8a565cb
YW
6527ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6528 if (!rt)
613b411c
LP
6529 return NULL;
6530
e8a565cb 6531 assert(rt->n_ref > 0);
613b411c 6532
e8a565cb
YW
6533 rt->n_ref--;
6534 if (rt->n_ref > 0)
f2341e0a
LP
6535 return NULL;
6536
e8a565cb 6537 return exec_runtime_free(rt, destroy);
613b411c
LP
6538}
6539
e8a565cb
YW
6540int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6541 ExecRuntime *rt;
e8a565cb
YW
6542
6543 assert(m);
613b411c
LP
6544 assert(f);
6545 assert(fds);
6546
90e74a66 6547 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6548 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6549
e8a565cb
YW
6550 if (rt->tmp_dir)
6551 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6552
e8a565cb
YW
6553 if (rt->var_tmp_dir)
6554 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6555
e8a565cb
YW
6556 if (rt->netns_storage_socket[0] >= 0) {
6557 int copy;
613b411c 6558
e8a565cb
YW
6559 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6560 if (copy < 0)
6561 return copy;
613b411c 6562
e8a565cb
YW
6563 fprintf(f, " netns-socket-0=%i", copy);
6564 }
613b411c 6565
e8a565cb
YW
6566 if (rt->netns_storage_socket[1] >= 0) {
6567 int copy;
613b411c 6568
e8a565cb
YW
6569 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6570 if (copy < 0)
6571 return copy;
613b411c 6572
e8a565cb
YW
6573 fprintf(f, " netns-socket-1=%i", copy);
6574 }
6575
a70581ff
XR
6576 if (rt->ipcns_storage_socket[0] >= 0) {
6577 int copy;
6578
6579 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6580 if (copy < 0)
6581 return copy;
6582
6583 fprintf(f, " ipcns-socket-0=%i", copy);
6584 }
6585
6586 if (rt->ipcns_storage_socket[1] >= 0) {
6587 int copy;
6588
6589 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6590 if (copy < 0)
6591 return copy;
6592
6593 fprintf(f, " ipcns-socket-1=%i", copy);
6594 }
6595
e8a565cb 6596 fputc('\n', f);
613b411c
LP
6597 }
6598
6599 return 0;
6600}
6601
e8a565cb
YW
6602int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6603 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6604 ExecRuntime *rt;
613b411c
LP
6605 int r;
6606
e8a565cb
YW
6607 /* This is for the migration from old (v237 or earlier) deserialization text.
6608 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6609 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6610 * so or not from the serialized text, then we always creates a new object owned by this. */
6611
6612 assert(u);
613b411c
LP
6613 assert(key);
6614 assert(value);
6615
e8a565cb
YW
6616 /* Manager manages ExecRuntime objects by the unit id.
6617 * So, we omit the serialized text when the unit does not have id (yet?)... */
6618 if (isempty(u->id)) {
6619 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6620 return 0;
6621 }
613b411c 6622
cbc165d1
ZJS
6623 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6624 return log_oom();
e8a565cb
YW
6625
6626 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6627 if (!rt) {
cbc165d1 6628 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6629 return log_oom();
613b411c 6630
e8a565cb
YW
6631 rt = rt_create;
6632 }
6633
6634 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6635 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6636 return -ENOMEM;
613b411c
LP
6637
6638 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6639 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6640 return -ENOMEM;
613b411c
LP
6641
6642 } else if (streq(key, "netns-socket-0")) {
6643 int fd;
6644
e8a565cb 6645 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6646 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6647 return 0;
613b411c 6648 }
e8a565cb
YW
6649
6650 safe_close(rt->netns_storage_socket[0]);
6651 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6652
613b411c
LP
6653 } else if (streq(key, "netns-socket-1")) {
6654 int fd;
6655
e8a565cb 6656 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6657 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6658 return 0;
613b411c 6659 }
e8a565cb
YW
6660
6661 safe_close(rt->netns_storage_socket[1]);
6662 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6663
613b411c
LP
6664 } else
6665 return 0;
6666
e8a565cb
YW
6667 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6668 if (rt_create) {
6669 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6670 if (r < 0) {
3fe91079 6671 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6672 return 0;
6673 }
613b411c 6674
e8a565cb 6675 rt_create->manager = u->manager;
613b411c 6676
e8a565cb 6677 /* Avoid cleanup */
56a13a49 6678 TAKE_PTR(rt_create);
e8a565cb 6679 }
98b47d54 6680
e8a565cb
YW
6681 return 1;
6682}
613b411c 6683
56a13a49
ZJS
6684int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6685 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6686 char *id = NULL;
a70581ff 6687 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6688 const char *p, *v = value;
6689 size_t n;
613b411c 6690
e8a565cb
YW
6691 assert(m);
6692 assert(value);
6693 assert(fds);
98b47d54 6694
e8a565cb 6695 n = strcspn(v, " ");
2f82562b 6696 id = strndupa_safe(v, n);
e8a565cb
YW
6697 if (v[n] != ' ')
6698 goto finalize;
6699 p = v + n + 1;
6700
6701 v = startswith(p, "tmp-dir=");
6702 if (v) {
6703 n = strcspn(v, " ");
56a13a49
ZJS
6704 tmp_dir = strndup(v, n);
6705 if (!tmp_dir)
6706 return log_oom();
e8a565cb
YW
6707 if (v[n] != ' ')
6708 goto finalize;
6709 p = v + n + 1;
6710 }
6711
6712 v = startswith(p, "var-tmp-dir=");
6713 if (v) {
6714 n = strcspn(v, " ");
56a13a49
ZJS
6715 var_tmp_dir = strndup(v, n);
6716 if (!var_tmp_dir)
6717 return log_oom();
e8a565cb
YW
6718 if (v[n] != ' ')
6719 goto finalize;
6720 p = v + n + 1;
6721 }
6722
6723 v = startswith(p, "netns-socket-0=");
6724 if (v) {
6725 char *buf;
6726
6727 n = strcspn(v, " ");
2f82562b 6728 buf = strndupa_safe(v, n);
c413bb28 6729
a70581ff 6730 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6731 if (r < 0)
6732 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6733 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6734 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6735 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6736 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6737 if (v[n] != ' ')
6738 goto finalize;
6739 p = v + n + 1;
613b411c
LP
6740 }
6741
e8a565cb
YW
6742 v = startswith(p, "netns-socket-1=");
6743 if (v) {
6744 char *buf;
98b47d54 6745
e8a565cb 6746 n = strcspn(v, " ");
2f82562b 6747 buf = strndupa_safe(v, n);
a70581ff
XR
6748
6749 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6750 if (r < 0)
6751 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6752 if (!fdset_contains(fds, netns_fdpair[1]))
6753 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6754 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6755 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6756 if (v[n] != ' ')
6757 goto finalize;
6758 p = v + n + 1;
6759 }
6760
6761 v = startswith(p, "ipcns-socket-0=");
6762 if (v) {
6763 char *buf;
6764
6765 n = strcspn(v, " ");
2f82562b 6766 buf = strndupa_safe(v, n);
a70581ff
XR
6767
6768 r = safe_atoi(buf, &ipcns_fdpair[0]);
6769 if (r < 0)
6770 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6771 if (!fdset_contains(fds, ipcns_fdpair[0]))
6772 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6773 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6774 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6775 if (v[n] != ' ')
6776 goto finalize;
6777 p = v + n + 1;
6778 }
6779
6780 v = startswith(p, "ipcns-socket-1=");
6781 if (v) {
6782 char *buf;
6783
6784 n = strcspn(v, " ");
2f82562b 6785 buf = strndupa_safe(v, n);
a70581ff
XR
6786
6787 r = safe_atoi(buf, &ipcns_fdpair[1]);
6788 if (r < 0)
6789 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6790 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6791 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6792 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6793 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6794 }
98b47d54 6795
e8a565cb 6796finalize:
a70581ff 6797 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6798 if (r < 0)
56a13a49
ZJS
6799 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6800 return 0;
e8a565cb 6801}
613b411c 6802
e8a565cb
YW
6803void exec_runtime_vacuum(Manager *m) {
6804 ExecRuntime *rt;
e8a565cb
YW
6805
6806 assert(m);
6807
6808 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6809
90e74a66 6810 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6811 if (rt->n_ref > 0)
6812 continue;
6813
6814 (void) exec_runtime_free(rt, false);
6815 }
613b411c
LP
6816}
6817
b9c04eaf
YW
6818void exec_params_clear(ExecParameters *p) {
6819 if (!p)
6820 return;
6821
c3f8a065
LP
6822 p->environment = strv_free(p->environment);
6823 p->fd_names = strv_free(p->fd_names);
6824 p->fds = mfree(p->fds);
6825 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6826}
6827
bb0c0d6f
LP
6828ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6829 if (!sc)
6830 return NULL;
6831
6832 free(sc->id);
6833 free(sc->data);
6834 return mfree(sc);
6835}
6836
43144be4
LP
6837ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6838 if (!lc)
6839 return NULL;
6840
6841 free(lc->id);
6842 free(lc->path);
6843 return mfree(lc);
6844}
6845
211a3d87
LB
6846void exec_directory_done(ExecDirectory *d) {
6847 if (!d)
6848 return;
6849
6850 for (size_t i = 0; i < d->n_items; i++) {
6851 free(d->items[i].path);
6852 strv_free(d->items[i].symlinks);
6853 }
6854
6855 d->items = mfree(d->items);
6856 d->n_items = 0;
6857 d->mode = 0755;
6858}
6859
6860int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
6861 _cleanup_strv_free_ char **s = NULL;
6862 _cleanup_free_ char *p = NULL;
6863
6864 assert(d);
6865 assert(n);
6866 assert(path);
6867
6868 p = strdup(path);
6869 if (!p)
6870 return -ENOMEM;
6871
6872 if (symlinks) {
6873 s = strv_copy(symlinks);
6874 if (!s)
6875 return -ENOMEM;
6876 }
6877
6878 if (!GREEDY_REALLOC(*d, *n + 1))
6879 return -ENOMEM;
6880
6881 (*d)[(*n) ++] = (ExecDirectoryItem) {
6882 .path = TAKE_PTR(p),
6883 .symlinks = TAKE_PTR(s),
6884 };
6885
6886 return 0;
6887}
6888
bb0c0d6f 6889DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 6890DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 6891
80876c20
LP
6892static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6893 [EXEC_INPUT_NULL] = "null",
6894 [EXEC_INPUT_TTY] = "tty",
6895 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6896 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6897 [EXEC_INPUT_SOCKET] = "socket",
6898 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6899 [EXEC_INPUT_DATA] = "data",
2038c3f5 6900 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6901};
6902
8a0867d6
LP
6903DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6904
94f04347 6905static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6906 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6907 [EXEC_OUTPUT_NULL] = "null",
80876c20 6908 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6909 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6910 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6911 [EXEC_OUTPUT_JOURNAL] = "journal",
6912 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6913 [EXEC_OUTPUT_SOCKET] = "socket",
6914 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6915 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6916 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6917 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6918};
6919
6920DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6921
6922static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6923 [EXEC_UTMP_INIT] = "init",
6924 [EXEC_UTMP_LOGIN] = "login",
6925 [EXEC_UTMP_USER] = "user",
6926};
6927
6928DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6929
6930static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6931 [EXEC_PRESERVE_NO] = "no",
6932 [EXEC_PRESERVE_YES] = "yes",
6933 [EXEC_PRESERVE_RESTART] = "restart",
6934};
6935
6936DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6937
6b7b2ed9 6938/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6939static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6940 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6941 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6942 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6943 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6944 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6945};
6946
6947DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6948
211a3d87
LB
6949/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
6950static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6951 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
6952 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
6953 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
6954 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
6955 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
6956};
6957
6958DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
6959
6b7b2ed9
LP
6960/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6961 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6962 * directories, specifically .timer units with their timestamp touch file. */
6963static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6964 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6965 [EXEC_DIRECTORY_STATE] = "state",
6966 [EXEC_DIRECTORY_CACHE] = "cache",
6967 [EXEC_DIRECTORY_LOGS] = "logs",
6968 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6969};
6970
6971DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6972
6973/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6974 * the service payload in. */
fb2042dd
YW
6975static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6976 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6977 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6978 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6979 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6980 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6981};
6982
6983DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6984
b1edf445
LP
6985static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6986 [EXEC_KEYRING_INHERIT] = "inherit",
6987 [EXEC_KEYRING_PRIVATE] = "private",
6988 [EXEC_KEYRING_SHARED] = "shared",
6989};
6990
6991DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);