]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
shell-completion: add journalctl --facility
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
b1994387 44#include "bpf-lsm.h"
8dd4c05b 45#include "cap-list.h"
430f0182 46#include "capability-util.h"
fdb3deca 47#include "cgroup-setup.h"
f4351959 48#include "chase-symlinks.h"
bb0c0d6f 49#include "chown-recursive.h"
da681e1b 50#include "cpu-set-util.h"
43144be4 51#include "creds-util.h"
6a818c3c 52#include "data-fd-util.h"
f6a6225e 53#include "def.h"
686d13b9 54#include "env-file.h"
4d1a6904 55#include "env-util.h"
17df7223 56#include "errno-list.h"
8a62620e 57#include "escape.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
bb0c0d6f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
a1164ae3 66#include "label.h"
8dd4c05b
LP
67#include "log.h"
68#include "macro.h"
e8a565cb 69#include "manager.h"
2a341bb9 70#include "manager-dump.h"
0a970718 71#include "memory-util.h"
f5947a5e 72#include "missing_fs.h"
5bead76e 73#include "missing_ioprio.h"
35cd0ba5 74#include "mkdir-label.h"
21935150 75#include "mount-util.h"
bb0c0d6f 76#include "mountpoint-util.h"
8dd4c05b 77#include "namespace.h"
6bedfcbb 78#include "parse-util.h"
8dd4c05b 79#include "path-util.h"
0b452006 80#include "process-util.h"
d3dcf4e3 81#include "random-util.h"
78f22b97 82#include "rlimit-util.h"
8dd4c05b 83#include "rm-rf.h"
349cc4a5 84#if HAVE_SECCOMP
3ffd4af2
LP
85#include "seccomp-util.h"
86#endif
07d46372 87#include "securebits-util.h"
8dd4c05b 88#include "selinux-util.h"
24882e06 89#include "signal-util.h"
8dd4c05b 90#include "smack-util.h"
57b7a260 91#include "socket-util.h"
fd63e712 92#include "special.h"
949befd3 93#include "stat-util.h"
8b43440b 94#include "string-table.h"
07630cea 95#include "string-util.h"
8dd4c05b 96#include "strv.h"
7ccbd1ae 97#include "syslog-util.h"
8dd4c05b 98#include "terminal-util.h"
bb0c0d6f 99#include "tmpfile-util.h"
566b7d23 100#include "umask-util.h"
2d3b784d 101#include "unit-serialize.h"
b1d4f8e1 102#include "user-util.h"
8dd4c05b 103#include "utmp-wtmp.h"
5cb5a6ff 104
e056b01d 105#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 106#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 107
531dca78
LP
108#define SNDBUF_SIZE (8*1024*1024)
109
/* Renumbers the passed file descriptors so that fds[i] ends up as fd number i+3, i.e. the array
 * becomes a contiguous range starting right after stdin/stdout/stderr.
 *
 * Note that the array is modified in place: each entry is replaced by the number its fd was moved to.
 *
 * Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, moved;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        /* F_DUPFD hands out the lowest free fd that is >= wanted */
                        moved = fcntl(fds[idx], F_DUPFD, wanted);
                        if (moved < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = moved;

                        /* The target slot was still occupied by some other fd. Remember the first index
                         * where that happened, so that the next pass starts from there. */
                        if (retry_at < 0 && moved != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
149
/* Prepares the fds that shall be passed to the child: the first n_socket_fds entries get O_NONBLOCK
 * set or cleared according to 'nonblock', and FD_CLOEXEC is turned off on all entries (socket fds
 * followed by storage fds).
 *
 * Returns 0 on success, or the negative error of the first helper call that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                bool is_socket = k < n_socket_fds;

                /* O_NONBLOCK is only adjusted for the socket activation fds */
                if (is_socket) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped for every fd, after all we want the child to inherit them */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
182
1e22b5cd 183static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
184 assert(context);
185
1e22b5cd
LP
186 if (context->stdio_as_fds)
187 return NULL;
188
80876c20
LP
189 if (context->tty_path)
190 return context->tty_path;
191
192 return "/dev/console";
193}
194
1e22b5cd
LP
195static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
196 const char *path;
197
6ea832a2
LP
198 assert(context);
199
1e22b5cd 200 path = exec_context_tty_path(context);
6ea832a2 201
1e22b5cd
LP
202 if (context->tty_vhangup) {
203 if (p && p->stdin_fd >= 0)
204 (void) terminal_vhangup_fd(p->stdin_fd);
205 else if (path)
206 (void) terminal_vhangup(path);
207 }
6ea832a2 208
1e22b5cd
LP
209 if (context->tty_reset) {
210 if (p && p->stdin_fd >= 0)
211 (void) reset_terminal_fd(p->stdin_fd, true);
212 else if (path)
213 (void) reset_terminal(path);
214 }
215
51462135
DDM
216 if (p && p->stdin_fd >= 0)
217 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
218
1e22b5cd
LP
219 if (context->tty_vt_disallocate && path)
220 (void) vt_disallocate(path);
6ea832a2
LP
221}
222
6af760f3
LP
223static bool is_terminal_input(ExecInput i) {
224 return IN_SET(i,
225 EXEC_INPUT_TTY,
226 EXEC_INPUT_TTY_FORCE,
227 EXEC_INPUT_TTY_FAIL);
228}
229
3a1286b6 230static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
231 return IN_SET(o,
232 EXEC_OUTPUT_TTY,
6af760f3
LP
233 EXEC_OUTPUT_KMSG_AND_CONSOLE,
234 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
235}
236
aac8c0c3
LP
237static bool is_kmsg_output(ExecOutput o) {
238 return IN_SET(o,
239 EXEC_OUTPUT_KMSG,
240 EXEC_OUTPUT_KMSG_AND_CONSOLE);
241}
242
6af760f3
LP
243static bool exec_context_needs_term(const ExecContext *c) {
244 assert(c);
245
246 /* Return true if the execution context suggests we should set $TERM to something useful. */
247
248 if (is_terminal_input(c->std_input))
249 return true;
250
251 if (is_terminal_output(c->std_output))
252 return true;
253
254 if (is_terminal_output(c->std_error))
255 return true;
256
257 return !!c->tty_path;
3a1286b6
MS
258}
259
/* Opens /dev/null with the specified access flags (plus O_NOCTTY) and moves the resulting fd to fd
 * number 'nfd'. Returns the result of move_fd(), or a negative errno if opening failed. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
271
91dd5f7c
LP
272static int connect_journal_socket(
273 int fd,
274 const char *log_namespace,
275 uid_t uid,
276 gid_t gid) {
277
f36a9d59
ZJS
278 union sockaddr_union sa;
279 socklen_t sa_len;
524daa8c
ZJS
280 uid_t olduid = UID_INVALID;
281 gid_t oldgid = GID_INVALID;
91dd5f7c 282 const char *j;
524daa8c
ZJS
283 int r;
284
91dd5f7c
LP
285 j = log_namespace ?
286 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
287 "/run/systemd/journal/stdout";
288 r = sockaddr_un_set_path(&sa.un, j);
289 if (r < 0)
290 return r;
f36a9d59 291 sa_len = r;
91dd5f7c 292
cad93f29 293 if (gid_is_valid(gid)) {
524daa8c
ZJS
294 oldgid = getgid();
295
92a17af9 296 if (setegid(gid) < 0)
524daa8c
ZJS
297 return -errno;
298 }
299
cad93f29 300 if (uid_is_valid(uid)) {
524daa8c
ZJS
301 olduid = getuid();
302
92a17af9 303 if (seteuid(uid) < 0) {
524daa8c
ZJS
304 r = -errno;
305 goto restore_gid;
306 }
307 }
308
7c248223 309 r = RET_NERRNO(connect(fd, &sa.sa, sa_len));
524daa8c
ZJS
310
311 /* If we fail to restore the uid or gid, things will likely
312 fail later on. This should only happen if an LSM interferes. */
313
cad93f29 314 if (uid_is_valid(uid))
524daa8c
ZJS
315 (void) seteuid(olduid);
316
317 restore_gid:
cad93f29 318 if (gid_is_valid(gid))
524daa8c
ZJS
319 (void) setegid(oldgid);
320
321 return r;
322}
323
fd1f9c89 324static int connect_logger_as(
34cf6c43 325 const Unit *unit,
fd1f9c89 326 const ExecContext *context,
af635cf3 327 const ExecParameters *params,
fd1f9c89
LP
328 ExecOutput output,
329 const char *ident,
fd1f9c89
LP
330 int nfd,
331 uid_t uid,
332 gid_t gid) {
333
2ac1ff68
EV
334 _cleanup_close_ int fd = -1;
335 int r;
071830ff
LP
336
337 assert(context);
af635cf3 338 assert(params);
80876c20
LP
339 assert(output < _EXEC_OUTPUT_MAX);
340 assert(ident);
341 assert(nfd >= 0);
071830ff 342
54fe0cdb
LP
343 fd = socket(AF_UNIX, SOCK_STREAM, 0);
344 if (fd < 0)
80876c20 345 return -errno;
071830ff 346
91dd5f7c 347 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
348 if (r < 0)
349 return r;
071830ff 350
2ac1ff68 351 if (shutdown(fd, SHUT_RD) < 0)
80876c20 352 return -errno;
071830ff 353
fd1f9c89 354 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 355
2ac1ff68 356 if (dprintf(fd,
62bca2c6 357 "%s\n"
80876c20
LP
358 "%s\n"
359 "%i\n"
54fe0cdb
LP
360 "%i\n"
361 "%i\n"
362 "%i\n"
4f4a1dbf 363 "%i\n",
c867611e 364 context->syslog_identifier ?: ident,
af635cf3 365 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
366 context->syslog_priority,
367 !!context->syslog_level_prefix,
f3dc6af2 368 false,
aac8c0c3 369 is_kmsg_output(output),
2ac1ff68
EV
370 is_terminal_output(output)) < 0)
371 return -errno;
80876c20 372
2ac1ff68 373 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 374}
2ac1ff68 375
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves the resulting fd to
 * fd number 'nfd'. Returns the result of move_fd(), or the error returned by open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
071830ff 388
2038c3f5 389static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
390 union sockaddr_union sa;
391 socklen_t sa_len;
15a3e96f 392 _cleanup_close_ int fd = -1;
86fca584 393 int r;
071830ff 394
80876c20 395 assert(path);
071830ff 396
2038c3f5
LP
397 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
398 flags |= O_CREAT;
399
400 fd = open(path, flags|O_NOCTTY, mode);
401 if (fd >= 0)
15a3e96f 402 return TAKE_FD(fd);
071830ff 403
2038c3f5
LP
404 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
405 return -errno;
2038c3f5
LP
406
407 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
408
86fca584
ZJS
409 r = sockaddr_un_set_path(&sa.un, path);
410 if (r < 0)
411 return r == -EINVAL ? -ENXIO : r;
412 sa_len = r;
413
2038c3f5
LP
414 fd = socket(AF_UNIX, SOCK_STREAM, 0);
415 if (fd < 0)
416 return -errno;
417
86fca584 418 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 419 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 420 * indication that this wasn't an AF_UNIX socket after all */
071830ff 421
2038c3f5
LP
422 if ((flags & O_ACCMODE) == O_RDONLY)
423 r = shutdown(fd, SHUT_WR);
424 else if ((flags & O_ACCMODE) == O_WRONLY)
425 r = shutdown(fd, SHUT_RD);
426 else
86fca584 427 r = 0;
15a3e96f 428 if (r < 0)
2038c3f5 429 return -errno;
2038c3f5 430
15a3e96f 431 return TAKE_FD(fd);
80876c20 432}
071830ff 433
08f3be7a
LP
434static int fixup_input(
435 const ExecContext *context,
436 int socket_fd,
437 bool apply_tty_stdin) {
438
439 ExecInput std_input;
440
441 assert(context);
442
443 std_input = context->std_input;
1e3ad081
LP
444
445 if (is_terminal_input(std_input) && !apply_tty_stdin)
446 return EXEC_INPUT_NULL;
071830ff 447
03fd9c49 448 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
449 return EXEC_INPUT_NULL;
450
08f3be7a
LP
451 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
452 return EXEC_INPUT_NULL;
453
03fd9c49 454 return std_input;
4f2d528d
LP
455}
456
7966a916 457static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 458
7966a916 459 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
460 return EXEC_OUTPUT_INHERIT;
461
7966a916 462 return output;
4f2d528d
LP
463}
464
a34ceba6
LP
465static int setup_input(
466 const ExecContext *context,
467 const ExecParameters *params,
52c239d7 468 int socket_fd,
2caa38e9 469 const int named_iofds[static 3]) {
a34ceba6 470
4f2d528d 471 ExecInput i;
51462135 472 int r;
4f2d528d
LP
473
474 assert(context);
a34ceba6 475 assert(params);
2caa38e9 476 assert(named_iofds);
a34ceba6
LP
477
478 if (params->stdin_fd >= 0) {
479 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
480 return -errno;
481
482 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
483 if (isatty(STDIN_FILENO)) {
484 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
485 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 486 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 487 }
a34ceba6
LP
488
489 return STDIN_FILENO;
490 }
4f2d528d 491
08f3be7a 492 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
493
494 switch (i) {
071830ff 495
80876c20
LP
496 case EXEC_INPUT_NULL:
497 return open_null_as(O_RDONLY, STDIN_FILENO);
498
499 case EXEC_INPUT_TTY:
500 case EXEC_INPUT_TTY_FORCE:
501 case EXEC_INPUT_TTY_FAIL: {
046a82c1 502 int fd;
071830ff 503
1e22b5cd 504 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
505 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
506 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
507 ACQUIRE_TERMINAL_WAIT,
3a43da28 508 USEC_INFINITY);
970edce6 509 if (fd < 0)
80876c20
LP
510 return fd;
511
51462135
DDM
512 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
513 if (r < 0)
514 return r;
515
046a82c1 516 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
517 }
518
4f2d528d 519 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
520 assert(socket_fd >= 0);
521
7c248223 522 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 523
52c239d7 524 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
525 assert(named_iofds[STDIN_FILENO] >= 0);
526
52c239d7 527 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 528 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 529
08f3be7a
LP
530 case EXEC_INPUT_DATA: {
531 int fd;
532
533 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
534 if (fd < 0)
535 return fd;
536
537 return move_fd(fd, STDIN_FILENO, false);
538 }
539
2038c3f5
LP
540 case EXEC_INPUT_FILE: {
541 bool rw;
542 int fd;
543
544 assert(context->stdio_file[STDIN_FILENO]);
545
546 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
547 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
548
549 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
550 if (fd < 0)
551 return fd;
552
553 return move_fd(fd, STDIN_FILENO, false);
554 }
555
80876c20 556 default:
04499a70 557 assert_not_reached();
80876c20
LP
558 }
559}
560
41fc585a
LP
561static bool can_inherit_stderr_from_stdout(
562 const ExecContext *context,
563 ExecOutput o,
564 ExecOutput e) {
565
566 assert(context);
567
568 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
569 * stderr fd */
570
571 if (e == EXEC_OUTPUT_INHERIT)
572 return true;
573 if (e != o)
574 return false;
575
576 if (e == EXEC_OUTPUT_NAMED_FD)
577 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
578
8d7dab1f 579 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
580 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
581
582 return true;
583}
584
a34ceba6 585static int setup_output(
34cf6c43 586 const Unit *unit,
a34ceba6
LP
587 const ExecContext *context,
588 const ExecParameters *params,
589 int fileno,
590 int socket_fd,
2caa38e9 591 const int named_iofds[static 3],
a34ceba6 592 const char *ident,
7bce046b
LP
593 uid_t uid,
594 gid_t gid,
595 dev_t *journal_stream_dev,
596 ino_t *journal_stream_ino) {
a34ceba6 597
4f2d528d
LP
598 ExecOutput o;
599 ExecInput i;
47c1d80d 600 int r;
4f2d528d 601
f2341e0a 602 assert(unit);
80876c20 603 assert(context);
a34ceba6 604 assert(params);
80876c20 605 assert(ident);
7bce046b
LP
606 assert(journal_stream_dev);
607 assert(journal_stream_ino);
80876c20 608
a34ceba6
LP
609 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
610
611 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
612 return -errno;
613
614 return STDOUT_FILENO;
615 }
616
617 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
618 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
619 return -errno;
620
621 return STDERR_FILENO;
622 }
623
08f3be7a 624 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 625 o = fixup_output(context->std_output, socket_fd);
4f2d528d 626
eb17e935
MS
627 if (fileno == STDERR_FILENO) {
628 ExecOutput e;
629 e = fixup_output(context->std_error, socket_fd);
80876c20 630
eb17e935
MS
631 /* This expects the input and output are already set up */
632
633 /* Don't change the stderr file descriptor if we inherit all
634 * the way and are not on a tty */
635 if (e == EXEC_OUTPUT_INHERIT &&
636 o == EXEC_OUTPUT_INHERIT &&
637 i == EXEC_INPUT_NULL &&
638 !is_terminal_input(context->std_input) &&
7966a916 639 getppid() != 1)
eb17e935
MS
640 return fileno;
641
642 /* Duplicate from stdout if possible */
41fc585a 643 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 644 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 645
eb17e935 646 o = e;
80876c20 647
eb17e935 648 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
649 /* If input got downgraded, inherit the original value */
650 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 651 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 652
08f3be7a
LP
653 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
654 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 655 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 656
acb591e4
LP
657 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
658 if (getppid() != 1)
eb17e935 659 return fileno;
94f04347 660
eb17e935
MS
661 /* We need to open /dev/null here anew, to get the right access mode. */
662 return open_null_as(O_WRONLY, fileno);
071830ff 663 }
94f04347 664
eb17e935 665 switch (o) {
80876c20
LP
666
667 case EXEC_OUTPUT_NULL:
eb17e935 668 return open_null_as(O_WRONLY, fileno);
80876c20
LP
669
670 case EXEC_OUTPUT_TTY:
4f2d528d 671 if (is_terminal_input(i))
7c248223 672 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
673
674 /* We don't reset the terminal if this is just about output */
1e22b5cd 675 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 676
9a6bca7a 677 case EXEC_OUTPUT_KMSG:
28dbc1e8 678 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
679 case EXEC_OUTPUT_JOURNAL:
680 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 681 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 682 if (r < 0) {
7966a916
ZJS
683 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
684 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 685 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
686 } else {
687 struct stat st;
688
689 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
690 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
691 * services to detect whether they are connected to the journal or not.
692 *
693 * If both stdout and stderr are connected to a stream then let's make sure to store the data
694 * about STDERR as that's usually the best way to do logging. */
7bce046b 695
ab2116b1
LP
696 if (fstat(fileno, &st) >= 0 &&
697 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
698 *journal_stream_dev = st.st_dev;
699 *journal_stream_ino = st.st_ino;
700 }
47c1d80d
MS
701 }
702 return r;
4f2d528d
LP
703
704 case EXEC_OUTPUT_SOCKET:
705 assert(socket_fd >= 0);
e75a9ed1 706
7c248223 707 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 708
52c239d7 709 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
710 assert(named_iofds[fileno] >= 0);
711
52c239d7 712 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 713 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 714
566b7d23 715 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
716 case EXEC_OUTPUT_FILE_APPEND:
717 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 718 bool rw;
566b7d23 719 int fd, flags;
2038c3f5
LP
720
721 assert(context->stdio_file[fileno]);
722
723 rw = context->std_input == EXEC_INPUT_FILE &&
724 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
725
726 if (rw)
7c248223 727 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 728
566b7d23
ZD
729 flags = O_WRONLY;
730 if (o == EXEC_OUTPUT_FILE_APPEND)
731 flags |= O_APPEND;
8d7dab1f
LW
732 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
733 flags |= O_TRUNC;
566b7d23
ZD
734
735 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
736 if (fd < 0)
737 return fd;
738
566b7d23 739 return move_fd(fd, fileno, 0);
2038c3f5
LP
740 }
741
94f04347 742 default:
04499a70 743 assert_not_reached();
94f04347 744 }
071830ff
LP
745}
746
02a51aba 747static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 748 int r;
02a51aba
LP
749
750 assert(fd >= 0);
02a51aba 751
1ff74fb6 752 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
753 if (isatty(fd) < 1) {
754 if (IN_SET(errno, EINVAL, ENOTTY))
755 return 0; /* not a tty */
1ff74fb6 756
02a51aba 757 return -errno;
4b3b5bc7 758 }
02a51aba 759
4b3b5bc7 760 /* This might fail. What matters are the results. */
f2df231f 761 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
762 if (r < 0)
763 return r;
02a51aba 764
4b3b5bc7 765 return 1;
02a51aba
LP
766}
767
aedec452 768static int setup_confirm_stdio(
51462135 769 const ExecContext *context,
aedec452
LP
770 const char *vc,
771 int *ret_saved_stdin,
772 int *ret_saved_stdout) {
773
3d18b167
LP
774 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
775 int r;
80876c20 776
aedec452
LP
777 assert(ret_saved_stdin);
778 assert(ret_saved_stdout);
80876c20 779
af6da548
LP
780 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
781 if (saved_stdin < 0)
782 return -errno;
80876c20 783
af6da548 784 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
785 if (saved_stdout < 0)
786 return -errno;
80876c20 787
8854d795 788 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
789 if (fd < 0)
790 return fd;
80876c20 791
af6da548
LP
792 r = chown_terminal(fd, getuid());
793 if (r < 0)
3d18b167 794 return r;
02a51aba 795
3d18b167
LP
796 r = reset_terminal_fd(fd, true);
797 if (r < 0)
798 return r;
80876c20 799
51462135
DDM
800 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
801 if (r < 0)
802 return r;
803
aedec452
LP
804 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
805 TAKE_FD(fd);
2b33ab09
LP
806 if (r < 0)
807 return r;
80876c20 808
aedec452
LP
809 *ret_saved_stdin = TAKE_FD(saved_stdin);
810 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 811 return 0;
80876c20
LP
812}
813
63d77c92 814static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
815 assert(err < 0);
816
817 if (err == -ETIMEDOUT)
63d77c92 818 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
819 else {
820 errno = -err;
63d77c92 821 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
822 }
823}
824
63d77c92 825static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 826 _cleanup_close_ int fd = -1;
80876c20 827
3b20f877 828 assert(vc);
80876c20 829
7d5ceb64 830 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 831 if (fd < 0)
3b20f877 832 return;
80876c20 833
63d77c92 834 write_confirm_error_fd(err, fd, u);
af6da548 835}
80876c20 836
/* Counterpart of setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout fds back
 * in place (where valid) and closes the saved copies, resetting *saved_stdin and *saved_stdout to the
 * value safe_close() returns.
 *
 * Returns 0 on success, or the negative errno of the last dup2() call that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        /* The copies are closed in any case, also if restoring failed */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
858
3b20f877
FB
/* Possible outcomes of the confirmation question */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1,   /* go ahead and execute the command */
};
864
51462135 865static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 866 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 867 _cleanup_free_ char *e = NULL;
3b20f877 868 char c;
af6da548 869
3b20f877 870 /* For any internal errors, assume a positive response. */
51462135 871 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 872 if (r < 0) {
63d77c92 873 write_confirm_error(r, vc, u);
3b20f877
FB
874 return CONFIRM_EXECUTE;
875 }
af6da548 876
b0eb2944
FB
877 /* confirm_spawn might have been disabled while we were sleeping. */
878 if (manager_is_confirm_spawn_disabled(u->manager)) {
879 r = 1;
880 goto restore_stdio;
881 }
af6da548 882
2bcd3c26
FB
883 e = ellipsize(cmdline, 60, 100);
884 if (!e) {
885 log_oom();
886 r = CONFIRM_EXECUTE;
887 goto restore_stdio;
888 }
af6da548 889
d172b175 890 for (;;) {
539622bd 891 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 892 if (r < 0) {
63d77c92 893 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
894 r = CONFIRM_EXECUTE;
895 goto restore_stdio;
896 }
af6da548 897
d172b175 898 switch (c) {
b0eb2944
FB
899 case 'c':
900 printf("Resuming normal execution.\n");
901 manager_disable_confirm_spawn();
902 r = 1;
903 break;
dd6f9ac0
FB
904 case 'D':
905 unit_dump(u, stdout, " ");
906 continue; /* ask again */
d172b175
FB
907 case 'f':
908 printf("Failing execution.\n");
909 r = CONFIRM_PRETEND_FAILURE;
910 break;
911 case 'h':
b0eb2944
FB
912 printf(" c - continue, proceed without asking anymore\n"
913 " D - dump, show the state of the unit\n"
dd6f9ac0 914 " f - fail, don't execute the command and pretend it failed\n"
d172b175 915 " h - help\n"
eedf223a 916 " i - info, show a short summary of the unit\n"
56fde33a 917 " j - jobs, show jobs that are in progress\n"
d172b175
FB
918 " s - skip, don't execute the command and pretend it succeeded\n"
919 " y - yes, execute the command\n");
dd6f9ac0 920 continue; /* ask again */
eedf223a
FB
921 case 'i':
922 printf(" Description: %s\n"
923 " Unit: %s\n"
924 " Command: %s\n",
925 u->id, u->description, cmdline);
926 continue; /* ask again */
56fde33a
FB
927 case 'j':
928 manager_dump_jobs(u->manager, stdout, " ");
929 continue; /* ask again */
539622bd
FB
930 case 'n':
931 /* 'n' was removed in favor of 'f'. */
932 printf("Didn't understand 'n', did you mean 'f'?\n");
933 continue; /* ask again */
d172b175
FB
934 case 's':
935 printf("Skipping execution.\n");
936 r = CONFIRM_PRETEND_SUCCESS;
937 break;
938 case 'y':
939 r = CONFIRM_EXECUTE;
940 break;
941 default:
04499a70 942 assert_not_reached();
d172b175 943 }
3b20f877 944 break;
3b20f877 945 }
af6da548 946
3b20f877 947restore_stdio:
af6da548 948 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 949 return r;
80876c20
LP
950}
951
4d885bd3
DH
952static int get_fixed_user(const ExecContext *c, const char **user,
953 uid_t *uid, gid_t *gid,
954 const char **home, const char **shell) {
81a2b7ce 955 int r;
4d885bd3 956 const char *name;
81a2b7ce 957
4d885bd3 958 assert(c);
81a2b7ce 959
23deef88
LP
960 if (!c->user)
961 return 0;
962
4d885bd3
DH
963 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
964 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 965
23deef88 966 name = c->user;
fafff8f1 967 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
968 if (r < 0)
969 return r;
81a2b7ce 970
4d885bd3
DH
971 *user = name;
972 return 0;
973}
974
975static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
976 int r;
977 const char *name;
978
979 assert(c);
980
981 if (!c->group)
982 return 0;
983
984 name = c->group;
fafff8f1 985 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
986 if (r < 0)
987 return r;
988
989 *group = name;
990 return 0;
991}
992
cdc5d5c5
DH
993static int get_supplementary_groups(const ExecContext *c, const char *user,
994 const char *group, gid_t gid,
995 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
996 char **i;
997 int r, k = 0;
998 int ngroups_max;
999 bool keep_groups = false;
1000 gid_t *groups = NULL;
1001 _cleanup_free_ gid_t *l_gids = NULL;
1002
1003 assert(c);
1004
bbeea271
DH
1005 /*
1006 * If user is given, then lookup GID and supplementary groups list.
1007 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1008 * here and as early as possible so we keep the list of supplementary
1009 * groups of the caller.
bbeea271
DH
1010 */
1011 if (user && gid_is_valid(gid) && gid != 0) {
1012 /* First step, initialize groups from /etc/groups */
1013 if (initgroups(user, gid) < 0)
1014 return -errno;
1015
1016 keep_groups = true;
1017 }
1018
ac6e8be6 1019 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1020 return 0;
1021
366ddd25
DH
1022 /*
1023 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1024 * be positive, otherwise fail.
1025 */
1026 errno = 0;
1027 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1028 if (ngroups_max <= 0)
1029 return errno_or_else(EOPNOTSUPP);
366ddd25 1030
4d885bd3
DH
1031 l_gids = new(gid_t, ngroups_max);
1032 if (!l_gids)
1033 return -ENOMEM;
81a2b7ce 1034
4d885bd3
DH
1035 if (keep_groups) {
1036 /*
1037 * Lookup the list of groups that the user belongs to, we
1038 * avoid NSS lookups here too for gid=0.
1039 */
1040 k = ngroups_max;
1041 if (getgrouplist(user, gid, l_gids, &k) < 0)
1042 return -EINVAL;
1043 } else
1044 k = 0;
81a2b7ce 1045
4d885bd3
DH
1046 STRV_FOREACH(i, c->supplementary_groups) {
1047 const char *g;
81a2b7ce 1048
4d885bd3
DH
1049 if (k >= ngroups_max)
1050 return -E2BIG;
81a2b7ce 1051
4d885bd3 1052 g = *i;
fafff8f1 1053 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1054 if (r < 0)
1055 return r;
81a2b7ce 1056
4d885bd3
DH
1057 k++;
1058 }
81a2b7ce 1059
4d885bd3
DH
1060 /*
1061 * Sets ngids to zero to drop all supplementary groups, happens
1062 * when we are under root and SupplementaryGroups= is empty.
1063 */
1064 if (k == 0) {
1065 *ngids = 0;
1066 return 0;
1067 }
81a2b7ce 1068
4d885bd3
DH
1069 /* Otherwise get the final list of supplementary groups */
1070 groups = memdup(l_gids, sizeof(gid_t) * k);
1071 if (!groups)
1072 return -ENOMEM;
1073
1074 *supplementary_gids = groups;
1075 *ngids = k;
1076
1077 groups = NULL;
1078
1079 return 0;
1080}
1081
/* Applies the supplementary group list (if it is non-empty) via maybe_setgroups() and then, if the GID is
 * valid, switches the real, effective and saved GID to it. Returns 0 on success, negative on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups if there is something to set */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* Then switch over to the primary GID */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1100
dbdc4098
TK
/* Clears the securebits selected by 'mask' and sets those in 'bits' on the calling process.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value, and -errno if
 * either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Drop everything covered by the mask, then add the requested bits */
        int wanted = (before & ~mask) | bits;

        /* Only issue the second prctl() if there is actually something to change */
        if (wanted != before && prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return wanted != before;
}
1114
81a2b7ce 1115static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1116 assert(context);
dbdc4098 1117 int r;
81a2b7ce 1118
4d885bd3
DH
1119 if (!uid_is_valid(uid))
1120 return 0;
1121
479050b3 1122 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1123 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1124 * required, so we also need keep-caps in this case.
1125 */
81a2b7ce 1126
dbdc4098 1127 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1128
1129 /* First step: If we need to keep capabilities but
1130 * drop privileges we need to make sure we keep our
cbb21cca 1131 * caps, while we drop privileges. */
693ced48 1132 if (uid != 0) {
dbdc4098
TK
1133 /* Add KEEP_CAPS to the securebits */
1134 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1135 if (r < 0)
1136 return r;
693ced48 1137 }
81a2b7ce
LP
1138 }
1139
479050b3 1140 /* Second step: actually set the uids */
81a2b7ce
LP
1141 if (setresuid(uid, uid, uid) < 0)
1142 return -errno;
1143
1144 /* At this point we should have all necessary capabilities but
1145 are otherwise a normal user. However, the caps might got
1146 corrupted due to the setresuid() so we need clean them up
1147 later. This is done outside of this call. */
1148
1149 return 0;
1150}
1151
349cc4a5 1152#if HAVE_PAM
5b6319dc
LP
1153
1154static int null_conv(
1155 int num_msg,
1156 const struct pam_message **msg,
1157 struct pam_response **resp,
1158 void *appdata_ptr) {
1159
1160 /* We don't support conversations */
1161
1162 return PAM_CONV_ERR;
1163}
1164
cefc33ae
LP
1165#endif
1166
5b6319dc
LP
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process "(sd-pam)" whose
 * job is to tear the session down again once it receives SIGTERM or its parent is gone.
 *
 * The environment in *env is fed into PAM; on success *env is replaced by the environment list PAM hands
 * back. 'fds' are file descriptors that must not stay open in the helper. 'uid'/'gid' are the credentials
 * the helper tries to drop to. If 'tty' is NULL the TTY name is derived from stdin when possible.
 *
 * Returns 0 on success and a negative errno-style value on failure (-EPERM for PAM-level errors). When
 * built without PAM support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *pamh = NULL;
        sigset_t saved_mask;
        int pam_code = PAM_SUCCESS, r;
        char **var, **session_env = NULL;
        bool session_opened = false;
        pid_t pam_pid = 0, parent_pid;
        int pam_flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* PAM is set up in the parent process, which then forks. The child sticks around until it is
         * killed, either through the signal requested with PR_SET_PDEATHSIG or by systemd through the
         * cgroup logic, and then removes the PAM session again. The parent goes on to exec() the actual
         * daemon. Doing it this way round ensures the daemon's main PID is the one initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                pam_flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &pamh);
        if (pam_code != PAM_SUCCESS) {
                pamh = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* No TTY was passed explicitly, but the fd we got as stdin might be one. If so, read
                 * the TTY name off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(pamh, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment to the PAM modules */
        STRV_FOREACH(var, *env) {
                pam_code = pam_putenv(pamh, *var);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* A failure to establish credentials is logged at debug level, but not treated as fatal */
        pam_code = pam_setcred(pamh, PAM_ESTABLISH_CRED | pam_flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(pamh, pam_code));

        pam_code = pam_open_session(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_opened = true;

        session_env = pam_getenvlist(pamh);
        if (!session_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM before forking, so that it cannot get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* Child: our only job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Don't keep the passed fds open in this child. The assumption is that apart from
                 * those only fds opened by PAM are open here. */
                (void) close_many(fds, n_fds);

                /* Drop privileges: none are needed for pam_close_session(), and doing so makes
                 * PR_SET_PDEATHSIG work in most cases. Failures are only logged, but then sd-pam
                 * threads should be expected not to exit normally. */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Ask for SIGTERM when our parent dies. This only works if the setresuid() above
                 * succeeded, as otherwise the kernel does not let unprivileged parents kill their
                 * privileged children this way. The control group kill logic covers the rest. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is complete. This matters in particular for the
                 * privilege dropping: otherwise unit setup might race against our setresuid(2) call.
                 *
                 * Should the parent have aborted, that is noticed below, hence the return value is
                 * ignored here. */
                (void) barrier_place(&barrier);

                /* Only wait if our parent is still the process that forked us */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                if (sigwait(&ss, &sig) >= 0) {
                                        assert(sig == SIGTERM);
                                        break;
                                }

                                if (errno != EINTR)
                                        goto child_finish;
                        }
                }

                pam_code = pam_setcred(pamh, PAM_DELETE_CRED | pam_flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(pamh, pam_flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(pamh, pam_code | pam_flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* The child was forked off successfully and takes care of all cleanup, so let go of the handle
         * on our side. */
        pamh = NULL;

        /* Restore the signal mask, i.e. unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* PAM modules might have opened the log, and we don't want that fd around, hence close it
         * explicitly. */
        closelog();

        /* Wait synchronously until the child has initialized. There is no way to recover from a
         * failure here, so just complain loudly. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, session_env);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(pamh, pam_code));
                r = -EPERM; /* there is no mapping from PAM errors to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (pamh) {
                if (session_opened)
                        pam_code = pam_close_session(pamh, pam_flags);

                pam_end(pamh, pam_code | pam_flags);
        }

        strv_free(session_env);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1390
5d6b1584
LP
/* Renames the current process to "(NAME)", where NAME is the last (up to) eight characters of the file
 * name component of 'path'. If that component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        /* '(' + at most 8 characters + ')' + NUL. The result has to fit in 10 characters (the length of
         * "/sbin/init") to look pretty in /bin/ps. */
        char buf[11];
        const char *tail;
        size_t n;

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* Keep the end of the name: it tends to be the more interesting part, as the
                 * beginning may well just be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1421
469830d1
LP
1422static bool context_has_address_families(const ExecContext *c) {
1423 assert(c);
1424
6b000af4 1425 return c->address_families_allow_list ||
469830d1
LP
1426 !set_isempty(c->address_families);
1427}
1428
1429static bool context_has_syscall_filters(const ExecContext *c) {
1430 assert(c);
1431
6b000af4 1432 return c->syscall_allow_list ||
8cfa775f 1433 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1434}
1435
9df2cdd8
TM
1436static bool context_has_syscall_logs(const ExecContext *c) {
1437 assert(c);
1438
1439 return c->syscall_log_allow_list ||
1440 !hashmap_isempty(c->syscall_log);
1441}
1442
469830d1
LP
1443static bool context_has_no_new_privileges(const ExecContext *c) {
1444 assert(c);
1445
1446 if (c->no_new_privileges)
1447 return true;
1448
1449 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1450 return false;
1451
1452 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1453 return c->lock_personality ||
469830d1 1454 c->memory_deny_write_execute ||
0538d2a8 1455 c->private_devices ||
fc64760d 1456 c->protect_clock ||
0538d2a8 1457 c->protect_hostname ||
469830d1
LP
1458 c->protect_kernel_tunables ||
1459 c->protect_kernel_modules ||
84703040 1460 c->protect_kernel_logs ||
0538d2a8
YW
1461 context_has_address_families(c) ||
1462 exec_context_restrict_namespaces_set(c) ||
1463 c->restrict_realtime ||
1464 c->restrict_suid_sgid ||
78e864e5 1465 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1466 context_has_syscall_filters(c) ||
1467 context_has_syscall_logs(c);
469830d1
LP
1468}
1469
bb0c0d6f
LP
1470static bool exec_context_has_credentials(const ExecContext *context) {
1471
1472 assert(context);
1473
1474 return !hashmap_isempty(context->set_credentials) ||
43144be4 1475 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1476}
1477
349cc4a5 1478#if HAVE_SECCOMP
17df7223 1479
83f12b27 1480static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1481
1482 if (is_seccomp_available())
1483 return false;
1484
f673b62d 1485 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1486 return true;
83f12b27
FS
1487}
1488
165a31c0 1489static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1490 uint32_t negative_action, default_action, action;
165a31c0 1491 int r;
8351ceae 1492
469830d1 1493 assert(u);
c0467cf3 1494 assert(c);
8351ceae 1495
469830d1 1496 if (!context_has_syscall_filters(c))
83f12b27
FS
1497 return 0;
1498
469830d1
LP
1499 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1500 return 0;
e9642be2 1501
005bfaf1 1502 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1503
6b000af4 1504 if (c->syscall_allow_list) {
469830d1
LP
1505 default_action = negative_action;
1506 action = SCMP_ACT_ALLOW;
7c66bae2 1507 } else {
469830d1
LP
1508 default_action = SCMP_ACT_ALLOW;
1509 action = negative_action;
57183d11 1510 }
8351ceae 1511
165a31c0 1512 if (needs_ambient_hack) {
6b000af4 1513 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1514 if (r < 0)
1515 return r;
1516 }
1517
b54f36c6 1518 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1519}
1520
9df2cdd8
TM
1521static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1522#ifdef SCMP_ACT_LOG
1523 uint32_t default_action, action;
1524#endif
1525
1526 assert(u);
1527 assert(c);
1528
1529 if (!context_has_syscall_logs(c))
1530 return 0;
1531
1532#ifdef SCMP_ACT_LOG
1533 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1534 return 0;
1535
1536 if (c->syscall_log_allow_list) {
1537 /* Log nothing but the ones listed */
1538 default_action = SCMP_ACT_ALLOW;
1539 action = SCMP_ACT_LOG;
1540 } else {
1541 /* Log everything but the ones listed */
1542 default_action = SCMP_ACT_LOG;
1543 action = SCMP_ACT_ALLOW;
1544 }
1545
1546 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1547#else
1548 /* old libseccomp */
1549 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1550 return 0;
1551#endif
1552}
1553
469830d1
LP
1554static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1555 assert(u);
4298d0b5
LP
1556 assert(c);
1557
469830d1 1558 if (set_isempty(c->syscall_archs))
83f12b27
FS
1559 return 0;
1560
469830d1
LP
1561 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1562 return 0;
4298d0b5 1563
469830d1
LP
1564 return seccomp_restrict_archs(c->syscall_archs);
1565}
4298d0b5 1566
469830d1
LP
1567static int apply_address_families(const Unit* u, const ExecContext *c) {
1568 assert(u);
1569 assert(c);
4298d0b5 1570
469830d1
LP
1571 if (!context_has_address_families(c))
1572 return 0;
4298d0b5 1573
469830d1
LP
1574 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1575 return 0;
4298d0b5 1576
6b000af4 1577 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1578}
4298d0b5 1579
83f12b27 1580static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1581 assert(u);
f3e43635
TM
1582 assert(c);
1583
469830d1 1584 if (!c->memory_deny_write_execute)
83f12b27
FS
1585 return 0;
1586
469830d1
LP
1587 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1588 return 0;
f3e43635 1589
469830d1 1590 return seccomp_memory_deny_write_execute();
f3e43635
TM
1591}
1592
83f12b27 1593static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1594 assert(u);
f4170c67
LP
1595 assert(c);
1596
469830d1 1597 if (!c->restrict_realtime)
83f12b27
FS
1598 return 0;
1599
469830d1
LP
1600 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1601 return 0;
f4170c67 1602
469830d1 1603 return seccomp_restrict_realtime();
f4170c67
LP
1604}
1605
f69567cb
LP
1606static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1607 assert(u);
1608 assert(c);
1609
1610 if (!c->restrict_suid_sgid)
1611 return 0;
1612
1613 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1614 return 0;
1615
1616 return seccomp_restrict_suid_sgid();
1617}
1618
59e856c7 1619static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1620 assert(u);
59eeb84b
LP
1621 assert(c);
1622
1623 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1624 * let's protect even those systems where this is left on in the kernel. */
1625
469830d1 1626 if (!c->protect_kernel_tunables)
59eeb84b
LP
1627 return 0;
1628
469830d1
LP
1629 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1630 return 0;
59eeb84b 1631
469830d1 1632 return seccomp_protect_sysctl();
59eeb84b
LP
1633}
1634
59e856c7 1635static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1636 assert(u);
502d704e
DH
1637 assert(c);
1638
25a8d8a0 1639 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1640
469830d1
LP
1641 if (!c->protect_kernel_modules)
1642 return 0;
1643
502d704e
DH
1644 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1645 return 0;
1646
b54f36c6 1647 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1648}
1649
84703040
KK
1650static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1651 assert(u);
1652 assert(c);
1653
1654 if (!c->protect_kernel_logs)
1655 return 0;
1656
1657 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1658 return 0;
1659
1660 return seccomp_protect_syslog();
1661}
1662
daf8f72b 1663static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1664 assert(u);
1665 assert(c);
1666
1667 if (!c->protect_clock)
1668 return 0;
1669
1670 if (skip_seccomp_unavailable(u, "ProtectClock="))
1671 return 0;
1672
1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1674}
1675
59e856c7 1676static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1677 assert(u);
ba128bb8
LP
1678 assert(c);
1679
8f81a5f6 1680 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1681
469830d1
LP
1682 if (!c->private_devices)
1683 return 0;
1684
ba128bb8
LP
1685 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1686 return 0;
1687
b54f36c6 1688 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1689}
1690
34cf6c43 1691static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1692 assert(u);
add00535
LP
1693 assert(c);
1694
1695 if (!exec_context_restrict_namespaces_set(c))
1696 return 0;
1697
1698 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1699 return 0;
1700
1701 return seccomp_restrict_namespaces(c->restrict_namespaces);
1702}
1703
b1994387
ILG
1704#if HAVE_LIBBPF
1705static bool skip_lsm_bpf_unsupported(const Unit* u, const char* msg) {
1706 if (lsm_bpf_supported())
1707 return false;
1708
1709 log_unit_debug(u, "LSM BPF not supported, skipping %s", msg);
1710 return true;
1711}
1712
1713static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1714 assert(u);
1715 assert(c);
1716
1717 if (!exec_context_restrict_filesystems_set(c))
1718 return 0;
1719
1720 if (skip_lsm_bpf_unsupported(u, "RestrictFileSystems="))
1721 return 0;
1722
1723 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1724}
1725#endif
1726
78e864e5 1727static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1728 unsigned long personality;
1729 int r;
78e864e5
TM
1730
1731 assert(u);
1732 assert(c);
1733
1734 if (!c->lock_personality)
1735 return 0;
1736
1737 if (skip_seccomp_unavailable(u, "LockPersonality="))
1738 return 0;
1739
e8132d63
LP
1740 personality = c->personality;
1741
1742 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1743 if (personality == PERSONALITY_INVALID) {
1744
1745 r = opinionated_personality(&personality);
1746 if (r < 0)
1747 return r;
1748 }
78e864e5
TM
1749
1750 return seccomp_lock_personality(personality);
1751}
1752
c0467cf3 1753#endif
8351ceae 1754
daf8f72b 1755static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1756 assert(u);
1757 assert(c);
1758
1759 if (!c->protect_hostname)
1760 return 0;
1761
1762 if (ns_type_supported(NAMESPACE_UTS)) {
1763 if (unshare(CLONE_NEWUTS) < 0) {
1764 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1765 *ret_exit_status = EXIT_NAMESPACE;
1766 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1767 }
1768
1769 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1770 }
1771 } else
1772 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1773
1774#if HAVE_SECCOMP
8f3e342f
ZJS
1775 int r;
1776
daf8f72b
LP
1777 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1778 return 0;
1779
1780 r = seccomp_protect_hostname();
1781 if (r < 0) {
1782 *ret_exit_status = EXIT_SECCOMP;
1783 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1784 }
1785#endif
1786
1787 return 0;
1788}
1789
3042bbeb 1790static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1791 assert(idle_pipe);
1792
54eb2300
LP
1793 idle_pipe[1] = safe_close(idle_pipe[1]);
1794 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1795
1796 if (idle_pipe[0] >= 0) {
1797 int r;
1798
1799 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1800
1801 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1802 ssize_t n;
1803
31a7eb86 1804 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1805 n = write(idle_pipe[3], "x", 1);
1806 if (n > 0)
cd972d69 1807 /* Wait for systemd to react to the signal above. */
54756dce 1808 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1809 }
1810
54eb2300 1811 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1812
1813 }
1814
54eb2300 1815 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1816}
1817
fb2042dd
YW
1818static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1819
7cae38c4 1820static int build_environment(
34cf6c43 1821 const Unit *u,
9fa95f85 1822 const ExecContext *c,
1e22b5cd 1823 const ExecParameters *p,
da6053d0 1824 size_t n_fds,
7cae38c4
LP
1825 const char *home,
1826 const char *username,
1827 const char *shell,
7bce046b
LP
1828 dev_t journal_stream_dev,
1829 ino_t journal_stream_ino,
7cae38c4
LP
1830 char ***ret) {
1831
1832 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1833 size_t n_env = 0;
7cae38c4
LP
1834 char *x;
1835
4b58153d 1836 assert(u);
7cae38c4 1837 assert(c);
7c1cb6f1 1838 assert(p);
7cae38c4
LP
1839 assert(ret);
1840
dc4e2940 1841#define N_ENV_VARS 17
8d5bb13d 1842 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1843 if (!our_env)
1844 return -ENOMEM;
1845
1846 if (n_fds > 0) {
8dd4c05b
LP
1847 _cleanup_free_ char *joined = NULL;
1848
df0ff127 1849 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1850 return -ENOMEM;
1851 our_env[n_env++] = x;
1852
da6053d0 1853 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1854 return -ENOMEM;
1855 our_env[n_env++] = x;
8dd4c05b 1856
1e22b5cd 1857 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1858 if (!joined)
1859 return -ENOMEM;
1860
605405c6 1861 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1862 if (!x)
1863 return -ENOMEM;
1864 our_env[n_env++] = x;
7cae38c4
LP
1865 }
1866
b08af3b1 1867 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1868 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1869 return -ENOMEM;
1870 our_env[n_env++] = x;
1871
1e22b5cd 1872 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1873 return -ENOMEM;
1874 our_env[n_env++] = x;
1875 }
1876
fd63e712
LP
1877 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1878 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1879 * check the database directly. */
ac647978 1880 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1881 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1882 if (!x)
1883 return -ENOMEM;
1884 our_env[n_env++] = x;
1885 }
1886
7cae38c4 1887 if (home) {
b910cc72 1888 x = strjoin("HOME=", home);
7cae38c4
LP
1889 if (!x)
1890 return -ENOMEM;
7bbead1d 1891
4ff361cc 1892 path_simplify(x + 5);
7cae38c4
LP
1893 our_env[n_env++] = x;
1894 }
1895
1896 if (username) {
b910cc72 1897 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1898 if (!x)
1899 return -ENOMEM;
1900 our_env[n_env++] = x;
1901
b910cc72 1902 x = strjoin("USER=", username);
7cae38c4
LP
1903 if (!x)
1904 return -ENOMEM;
1905 our_env[n_env++] = x;
1906 }
1907
1908 if (shell) {
b910cc72 1909 x = strjoin("SHELL=", shell);
7cae38c4
LP
1910 if (!x)
1911 return -ENOMEM;
7bbead1d 1912
4ff361cc 1913 path_simplify(x + 6);
7cae38c4
LP
1914 our_env[n_env++] = x;
1915 }
1916
4b58153d
LP
1917 if (!sd_id128_is_null(u->invocation_id)) {
1918 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1919 return -ENOMEM;
1920
1921 our_env[n_env++] = x;
1922 }
1923
6af760f3
LP
1924 if (exec_context_needs_term(c)) {
1925 const char *tty_path, *term = NULL;
1926
1927 tty_path = exec_context_tty_path(c);
1928
e8cf09b2
LP
1929 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1930 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1931 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1932
e8cf09b2 1933 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1934 term = getenv("TERM");
e8cf09b2 1935
6af760f3
LP
1936 if (!term)
1937 term = default_term_for_tty(tty_path);
7cae38c4 1938
b910cc72 1939 x = strjoin("TERM=", term);
7cae38c4
LP
1940 if (!x)
1941 return -ENOMEM;
1942 our_env[n_env++] = x;
1943 }
1944
7bce046b
LP
1945 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1946 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1947 return -ENOMEM;
1948
1949 our_env[n_env++] = x;
1950 }
1951
91dd5f7c
LP
1952 if (c->log_namespace) {
1953 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1954 if (!x)
1955 return -ENOMEM;
1956
1957 our_env[n_env++] = x;
1958 }
1959
5b10116e 1960 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1961 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1962 const char *n;
1963
1964 if (!p->prefix[t])
1965 continue;
1966
211a3d87 1967 if (c->directories[t].n_items == 0)
fb2042dd
YW
1968 continue;
1969
1970 n = exec_directory_env_name_to_string(t);
1971 if (!n)
1972 continue;
1973
211a3d87
LB
1974 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1975 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1976
211a3d87
LB
1977 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1978 if (!prefixed)
1979 return -ENOMEM;
1980
1981 if (!strextend_with_separator(&joined, ":", prefixed))
1982 return -ENOMEM;
1983 }
fb2042dd
YW
1984
1985 x = strjoin(n, "=", joined);
1986 if (!x)
1987 return -ENOMEM;
1988
1989 our_env[n_env++] = x;
1990 }
1991
bb0c0d6f
LP
1992 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1993 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1994 if (!x)
1995 return -ENOMEM;
1996
1997 our_env[n_env++] = x;
1998 }
1999
dc4e2940
YW
2000 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2001 return -ENOMEM;
2002
2003 our_env[n_env++] = x;
2004
7cae38c4 2005 our_env[n_env++] = NULL;
8d5bb13d
LP
2006 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
2007#undef N_ENV_VARS
7cae38c4 2008
ae2a15bc 2009 *ret = TAKE_PTR(our_env);
7cae38c4
LP
2010
2011 return 0;
2012}
2013
b4c14404
FB
2014static int build_pass_environment(const ExecContext *c, char ***ret) {
2015 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2016 size_t n_env = 0;
b4c14404
FB
2017 char **i;
2018
2019 STRV_FOREACH(i, c->pass_environment) {
2020 _cleanup_free_ char *x = NULL;
2021 char *v;
2022
2023 v = getenv(*i);
2024 if (!v)
2025 continue;
605405c6 2026 x = strjoin(*i, "=", v);
b4c14404
FB
2027 if (!x)
2028 return -ENOMEM;
00819cc1 2029
319a4f4b 2030 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2031 return -ENOMEM;
00819cc1 2032
1cc6c93a 2033 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2034 pass_env[n_env] = NULL;
b4c14404
FB
2035 }
2036
ae2a15bc 2037 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2038
2039 return 0;
2040}
2041
5e8deb94 2042bool exec_needs_mount_namespace(
8b44a3d2
LP
2043 const ExecContext *context,
2044 const ExecParameters *params,
4657abb5 2045 const ExecRuntime *runtime) {
8b44a3d2
LP
2046
2047 assert(context);
8b44a3d2 2048
915e6d16
LP
2049 if (context->root_image)
2050 return true;
2051
2a624c36
AP
2052 if (!strv_isempty(context->read_write_paths) ||
2053 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2054 !strv_isempty(context->inaccessible_paths) ||
2055 !strv_isempty(context->exec_paths) ||
2056 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2057 return true;
2058
42b1d8e0 2059 if (context->n_bind_mounts > 0)
d2d6c096
LP
2060 return true;
2061
2abd4e38
YW
2062 if (context->n_temporary_filesystems > 0)
2063 return true;
2064
b3d13314
LB
2065 if (context->n_mount_images > 0)
2066 return true;
2067
93f59701
LB
2068 if (context->n_extension_images > 0)
2069 return true;
2070
37ed15d7 2071 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2072 return true;
2073
2074 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2075 return true;
2076
8b44a3d2 2077 if (context->private_devices ||
228af36f 2078 context->private_mounts ||
8b44a3d2 2079 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2080 context->protect_home != PROTECT_HOME_NO ||
2081 context->protect_kernel_tunables ||
c575770b 2082 context->protect_kernel_modules ||
94a7b275 2083 context->protect_kernel_logs ||
4e399953
LP
2084 context->protect_control_groups ||
2085 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2086 context->proc_subset != PROC_SUBSET_ALL ||
2087 context->private_ipc ||
2088 context->ipc_namespace_path)
8b44a3d2
LP
2089 return true;
2090
37c56f89 2091 if (context->root_directory) {
5e98086d 2092 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2093 return true;
2094
5b10116e 2095 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2096 if (params && !params->prefix[t])
37c56f89
YW
2097 continue;
2098
211a3d87 2099 if (context->directories[t].n_items > 0)
37c56f89
YW
2100 return true;
2101 }
2102 }
5d997827 2103
42b1d8e0 2104 if (context->dynamic_user &&
211a3d87
LB
2105 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2106 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2107 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2108 return true;
2109
91dd5f7c
LP
2110 if (context->log_namespace)
2111 return true;
2112
8b44a3d2
LP
2113 return false;
2114}
2115
5749f855 2116static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2117 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2118 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2119 _cleanup_close_ int unshare_ready_fd = -1;
2120 _cleanup_(sigkill_waitp) pid_t pid = 0;
2121 uint64_t c = 1;
d251207d
LP
2122 ssize_t n;
2123 int r;
2124
5749f855
AZ
2125 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2126 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2127 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2128 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2129 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2130 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2131 * continues execution normally.
2132 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2133 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2134
5749f855
AZ
2135 /* Can only set up multiple mappings with CAP_SETUID. */
2136 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2137 r = asprintf(&uid_map,
5749f855 2138 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2139 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2140 ouid, ouid, uid, uid);
2141 else
2142 r = asprintf(&uid_map,
2143 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2144 ouid, ouid);
d251207d 2145
5749f855
AZ
2146 if (r < 0)
2147 return -ENOMEM;
2148
2149 /* Can only set up multiple mappings with CAP_SETGID. */
2150 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2151 r = asprintf(&gid_map,
5749f855 2152 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2153 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2154 ogid, ogid, gid, gid);
2155 else
2156 r = asprintf(&gid_map,
2157 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2158 ogid, ogid);
2159
2160 if (r < 0)
2161 return -ENOMEM;
d251207d
LP
2162
2163 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2164 * namespace. */
2165 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2166 if (unshare_ready_fd < 0)
2167 return -errno;
2168
2169 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2170 * failed. */
2171 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2172 return -errno;
2173
4c253ed1
LP
2174 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2175 if (r < 0)
2176 return r;
2177 if (r == 0) {
d251207d
LP
2178 _cleanup_close_ int fd = -1;
2179 const char *a;
2180 pid_t ppid;
2181
2182 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2183 * here, after the parent opened its own user namespace. */
2184
2185 ppid = getppid();
2186 errno_pipe[0] = safe_close(errno_pipe[0]);
2187
2188 /* Wait until the parent unshared the user namespace */
2189 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2190 r = -errno;
2191 goto child_fail;
2192 }
2193
2194 /* Disable the setgroups() system call in the child user namespace, for good. */
2195 a = procfs_file_alloca(ppid, "setgroups");
2196 fd = open(a, O_WRONLY|O_CLOEXEC);
2197 if (fd < 0) {
2198 if (errno != ENOENT) {
2199 r = -errno;
2200 goto child_fail;
2201 }
2202
2203 /* If the file is missing the kernel is too old, let's continue anyway. */
2204 } else {
2205 if (write(fd, "deny\n", 5) < 0) {
2206 r = -errno;
2207 goto child_fail;
2208 }
2209
2210 fd = safe_close(fd);
2211 }
2212
2213 /* First write the GID map */
2214 a = procfs_file_alloca(ppid, "gid_map");
2215 fd = open(a, O_WRONLY|O_CLOEXEC);
2216 if (fd < 0) {
2217 r = -errno;
2218 goto child_fail;
2219 }
2220 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2221 r = -errno;
2222 goto child_fail;
2223 }
2224 fd = safe_close(fd);
2225
2226 /* The write the UID map */
2227 a = procfs_file_alloca(ppid, "uid_map");
2228 fd = open(a, O_WRONLY|O_CLOEXEC);
2229 if (fd < 0) {
2230 r = -errno;
2231 goto child_fail;
2232 }
2233 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2234 r = -errno;
2235 goto child_fail;
2236 }
2237
2238 _exit(EXIT_SUCCESS);
2239
2240 child_fail:
2241 (void) write(errno_pipe[1], &r, sizeof(r));
2242 _exit(EXIT_FAILURE);
2243 }
2244
2245 errno_pipe[1] = safe_close(errno_pipe[1]);
2246
2247 if (unshare(CLONE_NEWUSER) < 0)
2248 return -errno;
2249
2250 /* Let the child know that the namespace is ready now */
2251 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2252 return -errno;
2253
2254 /* Try to read an error code from the child */
2255 n = read(errno_pipe[0], &r, sizeof(r));
2256 if (n < 0)
2257 return -errno;
2258 if (n == sizeof(r)) { /* an error code was sent to us */
2259 if (r < 0)
2260 return r;
2261 return -EIO;
2262 }
2263 if (n != 0) /* on success we should have read 0 bytes */
2264 return -EIO;
2265
8f03de53 2266 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2267 if (r < 0)
2268 return r;
2e87a1fd 2269 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2270 return -EIO;
2271
2272 return 0;
2273}
2274
494d0247
YW
2275static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2276 if (!context->dynamic_user)
2277 return false;
2278
2279 if (type == EXEC_DIRECTORY_CONFIGURATION)
2280 return false;
2281
2282 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2283 return false;
2284
2285 return true;
2286}
2287
211a3d87
LB
2288static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2289 _cleanup_free_ char *src_abs = NULL;
2290 char **dst;
2291 int r;
2292
2293 assert(source);
2294
2295 src_abs = path_join(root, source);
2296 if (!src_abs)
2297 return -ENOMEM;
2298
2299 STRV_FOREACH(dst, symlinks) {
2300 _cleanup_free_ char *dst_abs = NULL;
2301
2302 dst_abs = path_join(root, *dst);
2303 if (!dst_abs)
2304 return -ENOMEM;
2305
2306 r = mkdir_parents_label(dst_abs, 0755);
2307 if (r < 0)
2308 return r;
2309
2310 r = symlink_idempotent(src_abs, dst_abs, true);
2311 if (r < 0)
2312 return r;
2313 }
2314
2315 return 0;
2316}
2317
3536f49e 2318static int setup_exec_directory(
07689d5d
LP
2319 const ExecContext *context,
2320 const ExecParameters *params,
2321 uid_t uid,
3536f49e 2322 gid_t gid,
3536f49e 2323 ExecDirectoryType type,
211a3d87 2324 bool needs_mount_namespace,
3536f49e 2325 int *exit_status) {
07689d5d 2326
72fd1768 2327 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2328 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2329 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2330 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2331 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2332 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2333 };
07689d5d
LP
2334 int r;
2335
2336 assert(context);
2337 assert(params);
72fd1768 2338 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2339 assert(exit_status);
07689d5d 2340
3536f49e
YW
2341 if (!params->prefix[type])
2342 return 0;
2343
8679efde 2344 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2345 if (!uid_is_valid(uid))
2346 uid = 0;
2347 if (!gid_is_valid(gid))
2348 gid = 0;
2349 }
2350
211a3d87 2351 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2352 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2353
211a3d87 2354 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2355 if (!p) {
2356 r = -ENOMEM;
2357 goto fail;
2358 }
07689d5d 2359
23a7448e
YW
2360 r = mkdir_parents_label(p, 0755);
2361 if (r < 0)
3536f49e 2362 goto fail;
23a7448e 2363
494d0247 2364 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2365 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2366 * case we want to avoid leaving a directory around fully accessible that is owned by
2367 * a dynamic user whose UID is later on reused. To lock this down we use the same
2368 * trick used by container managers to prohibit host users to get access to files of
2369 * the same UID in containers: we place everything inside a directory that has an
2370 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2371 * for unprivileged host code. We then use fs namespacing to make this directory
2372 * permeable for the service itself.
6c47cd7d 2373 *
3f5b1508
LP
2374 * Specifically: for a service which wants a special directory "foo/" we first create
2375 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2376 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2377 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2378 * unprivileged host users can't look into it. Inside of the namespace of the unit
2379 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2380 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2381 * for the service and making sure it only gets access to the dirs it needs but no
2382 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2383 *
3f5b1508
LP
2384 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2385 * to be owned by the service itself.
2386 *
2387 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2388 * for sharing files or sockets with other services. */
6c47cd7d 2389
4ede9802
LP
2390 pp = path_join(params->prefix[type], "private");
2391 if (!pp) {
6c47cd7d
LP
2392 r = -ENOMEM;
2393 goto fail;
2394 }
2395
2396 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2397 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2398 if (r < 0)
2399 goto fail;
2400
211a3d87 2401 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2402 r = -ENOMEM;
2403 goto fail;
2404 }
2405
2406 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2407 r = mkdir_parents_label(pp, 0755);
2408 if (r < 0)
2409 goto fail;
2410
949befd3
LP
2411 if (is_dir(p, false) > 0 &&
2412 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2413
2414 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2415 * it over. Most likely the service has been upgraded from one that didn't use
2416 * DynamicUser=1, to one that does. */
2417
cf52c45d
LP
2418 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2419 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2420 exec_directory_type_to_string(type), p, pp);
2421
949befd3
LP
2422 if (rename(p, pp) < 0) {
2423 r = -errno;
2424 goto fail;
2425 }
2426 } else {
2427 /* Otherwise, create the actual directory for the service */
2428
2429 r = mkdir_label(pp, context->directories[type].mode);
2430 if (r < 0 && r != -EEXIST)
2431 goto fail;
2432 }
6c47cd7d 2433
df61e79a
LB
2434 /* And link it up from the original place. Note that if a mount namespace is going to be
2435 * used, then this symlink remains on the host, and a new one for the child namespace will
2436 * be created later. */
6c9c51e5 2437 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2438 if (r < 0)
2439 goto fail;
2440
6c47cd7d 2441 } else {
5c6d40d1
LP
2442 _cleanup_free_ char *target = NULL;
2443
2444 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2445 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2446 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2447
2448 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2449 * by DynamicUser=1 (see above)?
2450 *
2451 * We do this for all directory types except for ConfigurationDirectory=,
2452 * since they all support the private/ symlink logic at least in some
2453 * configurations, see above. */
5c6d40d1 2454
578dc69f
YW
2455 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2456 if (r < 0)
2457 goto fail;
2458
211a3d87 2459 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2460 if (!q) {
2461 r = -ENOMEM;
2462 goto fail;
2463 }
2464
578dc69f
YW
2465 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2466 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2467 if (r < 0)
2468 goto fail;
2469
2470 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2471
2472 /* Hmm, apparently DynamicUser= was once turned on for this service,
2473 * but is no longer. Let's move the directory back up. */
2474
cf52c45d
LP
2475 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2476 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2477 exec_directory_type_to_string(type), q, p);
2478
5c6d40d1
LP
2479 if (unlink(p) < 0) {
2480 r = -errno;
2481 goto fail;
2482 }
2483
2484 if (rename(q, p) < 0) {
2485 r = -errno;
2486 goto fail;
2487 }
2488 }
2489 }
2490
6c47cd7d 2491 r = mkdir_label(p, context->directories[type].mode);
d484580c 2492 if (r < 0) {
d484580c
LP
2493 if (r != -EEXIST)
2494 goto fail;
2495
206e9864
LP
2496 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2497 struct stat st;
2498
2499 /* Don't change the owner/access mode of the configuration directory,
2500 * as in the common case it is not written to by a service, and shall
2501 * not be writable. */
2502
2503 if (stat(p, &st) < 0) {
2504 r = -errno;
2505 goto fail;
2506 }
2507
2508 /* Still complain if the access mode doesn't match */
2509 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2510 log_warning("%s \'%s\' already exists but the mode is different. "
2511 "(File system: %o %sMode: %o)",
211a3d87 2512 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2513 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2514
6cff72eb 2515 continue;
206e9864 2516 }
6cff72eb 2517 }
a1164ae3 2518 }
07689d5d 2519
206e9864 2520 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2521 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2522 * current UID/GID ownership.) */
2523 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2524 if (r < 0)
2525 goto fail;
c71b2eb7 2526
607b358e
LP
2527 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2528 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2529 * assignments to exist. */
607b358e 2530 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2531 if (r < 0)
3536f49e 2532 goto fail;
07689d5d
LP
2533 }
2534
211a3d87
LB
2535 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2536 * they are set up later, to allow configuring empty var/run/etc. */
2537 if (!needs_mount_namespace)
2538 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2539 r = create_many_symlinks(params->prefix[type],
2540 context->directories[type].items[i].path,
2541 context->directories[type].items[i].symlinks);
2542 if (r < 0)
2543 goto fail;
2544 }
2545
07689d5d 2546 return 0;
3536f49e
YW
2547
2548fail:
2549 *exit_status = exit_status_table[type];
3536f49e 2550 return r;
07689d5d
LP
2551}
2552
bb0c0d6f
LP
2553static int write_credential(
2554 int dfd,
2555 const char *id,
2556 const void *data,
2557 size_t size,
2558 uid_t uid,
2559 bool ownership_ok) {
2560
2561 _cleanup_(unlink_and_freep) char *tmp = NULL;
2562 _cleanup_close_ int fd = -1;
2563 int r;
2564
2565 r = tempfn_random_child("", "cred", &tmp);
2566 if (r < 0)
2567 return r;
2568
2569 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2570 if (fd < 0) {
2571 tmp = mfree(tmp);
2572 return -errno;
2573 }
2574
43144be4 2575 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2576 if (r < 0)
2577 return r;
2578
2579 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2580 return -errno;
2581
2582 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2583 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2584 if (r < 0) {
2585 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2586 return r;
2587
2588 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2589 * to express: that the user gets read access and nothing
2590 * else. But if the backing fs can't support that (e.g. ramfs)
2591 * then we can use file ownership instead. But that's only safe if
2592 * we can then re-mount the whole thing read-only, so that the
2593 * user can no longer chmod() the file to gain write access. */
2594 return r;
2595
f5fbe71d 2596 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2597 return -errno;
2598 }
2599 }
2600
2601 if (renameat(dfd, tmp, dfd, id) < 0)
2602 return -errno;
2603
2604 tmp = mfree(tmp);
2605 return 0;
2606}
2607
bb0c0d6f
LP
2608static int acquire_credentials(
2609 const ExecContext *context,
2610 const ExecParameters *params,
d3dcf4e3 2611 const char *unit,
bb0c0d6f
LP
2612 const char *p,
2613 uid_t uid,
2614 bool ownership_ok) {
2615
43144be4 2616 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2617 _cleanup_close_ int dfd = -1;
43144be4 2618 ExecLoadCredential *lc;
bb0c0d6f 2619 ExecSetCredential *sc;
bb0c0d6f
LP
2620 int r;
2621
2622 assert(context);
2623 assert(p);
2624
2625 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2626 if (dfd < 0)
2627 return -errno;
2628
43144be4
LP
2629 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2630 HASHMAP_FOREACH(lc, context->load_credentials) {
2631 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
bb0c0d6f 2632 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2633 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2634 bool missing_ok = true;
bb0c0d6f
LP
2635 const char *source;
2636 size_t size, add;
2637
43144be4 2638 if (path_is_absolute(lc->path)) {
bb0c0d6f 2639 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
43144be4 2640 source = lc->path;
bb0c0d6f 2641 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2642
2643 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2644 * via the source socket address in case we read off an AF_UNIX socket. */
43144be4 2645 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
d3dcf4e3
LP
2646 return -ENOMEM;
2647
fc682be2
LP
2648 missing_ok = false;
2649
bb0c0d6f
LP
2650 } else if (params->received_credentials) {
2651 /* If this is a relative path, take it relative to the credentials we received
2652 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2653 * on a credential store, i.e. this is guaranteed to be regular files. */
43144be4 2654 j = path_join(params->received_credentials, lc->path);
bb0c0d6f
LP
2655 if (!j)
2656 return -ENOMEM;
2657
2658 source = j;
2659 } else
2660 source = NULL;
2661
2662 if (source)
43144be4
LP
2663 r = read_full_file_full(
2664 AT_FDCWD, source,
2665 UINT64_MAX,
2666 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2667 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2668 bindname,
2669 &data, &size);
bb0c0d6f
LP
2670 else
2671 r = -ENOENT;
43144be4 2672 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
fc682be2
LP
2673 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2674 * will get clear errors if we don't pass such a missing credential on as they
2675 * themselves will get ENOENT when trying to read them, which should not be much
2676 * worse than when we handle the error here and make it fatal.
2677 *
43144be4
LP
2678 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2679 * we are fine, too. */
2680 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
bb0c0d6f 2681 continue;
fc682be2 2682 }
bb0c0d6f 2683 if (r < 0)
43144be4
LP
2684 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2685
2686 if (lc->encrypted) {
2687 _cleanup_free_ void *plaintext = NULL;
2688 size_t plaintext_size = 0;
2689
2690 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2691 if (r < 0)
2692 return r;
bb0c0d6f 2693
43144be4
LP
2694 free_and_replace(data, plaintext);
2695 size = plaintext_size;
2696 }
2697
2698 add = strlen(lc->id) + size;
bb0c0d6f
LP
2699 if (add > left)
2700 return -E2BIG;
2701
43144be4 2702 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
bb0c0d6f
LP
2703 if (r < 0)
2704 return r;
2705
2706 left -= add;
2707 }
2708
43144be4
LP
2709 /* First we use the literally specified credentials. Note that they might be overridden again below,
2710 * and thus act as a "default" if the same credential is specified multiple times */
2711 HASHMAP_FOREACH(sc, context->set_credentials) {
2712 _cleanup_(erase_and_freep) void *plaintext = NULL;
2713 const char *data;
2714 size_t size, add;
2715
2716 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2717 continue;
2718 if (errno != ENOENT)
2719 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2720
2721 if (sc->encrypted) {
2722 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2723 if (r < 0)
2724 return r;
2725
2726 data = plaintext;
2727 } else {
2728 data = sc->data;
2729 size = sc->size;
2730 }
2731
2732 add = strlen(sc->id) + size;
2733 if (add > left)
2734 return -E2BIG;
2735
2736 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2737 if (r < 0)
2738 return r;
2739
2740
2741 left -= add;
2742 }
2743
bb0c0d6f
LP
2744 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2745 return -errno;
2746
2747 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2748 * accessible */
2749
2750 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2751 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2752 if (r < 0) {
2753 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2754 return r;
2755
2756 if (!ownership_ok)
2757 return r;
2758
f5fbe71d 2759 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2760 return -errno;
2761 }
2762 }
2763
2764 return 0;
2765}
2766
2767static int setup_credentials_internal(
2768 const ExecContext *context,
2769 const ExecParameters *params,
d3dcf4e3 2770 const char *unit,
bb0c0d6f
LP
2771 const char *final, /* This is where the credential store shall eventually end up at */
2772 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2773 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2774 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2775 uid_t uid) {
2776
2777 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2778 * if we mounted something; false if we definitely can't mount anything */
2779 bool final_mounted;
2780 const char *where;
2781
2782 assert(context);
2783 assert(final);
2784 assert(workspace);
2785
2786 if (reuse_workspace) {
2787 r = path_is_mount_point(workspace, NULL, 0);
2788 if (r < 0)
2789 return r;
2790 if (r > 0)
2791 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2792 else
2793 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2794 } else
2795 workspace_mounted = -1; /* ditto */
2796
2797 r = path_is_mount_point(final, NULL, 0);
2798 if (r < 0)
2799 return r;
2800 if (r > 0) {
2801 /* If the final place already has something mounted, we use that. If the workspace also has
2802 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2803 * different). */
2804 final_mounted = true;
2805
2806 if (workspace_mounted < 0) {
2807 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2808 * the final version to the workspace, and make it writable, so that we can make
2809 * changes */
2810
21935150
LP
2811 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2812 if (r < 0)
2813 return r;
bb0c0d6f 2814
21935150
LP
2815 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2816 if (r < 0)
2817 return r;
bb0c0d6f
LP
2818
2819 workspace_mounted = true;
2820 }
2821 } else
2822 final_mounted = false;
2823
2824 if (workspace_mounted < 0) {
2825 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2826 for (int try = 0;; try++) {
2827
2828 if (try == 0) {
2829 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
2830 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2831 if (r >= 0) {
bb0c0d6f
LP
2832 workspace_mounted = true;
2833 break;
2834 }
2835
2836 } else if (try == 1) {
2837 _cleanup_free_ char *opts = NULL;
2838
43144be4 2839 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
2840 return -ENOMEM;
2841
2842 /* Fall back to "tmpfs" otherwise */
21935150
LP
2843 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2844 if (r >= 0) {
bb0c0d6f
LP
2845 workspace_mounted = true;
2846 break;
2847 }
2848
2849 } else {
2850 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
2851 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2852 if (r < 0) {
2853 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2854 return r;
bb0c0d6f
LP
2855
2856 if (must_mount) /* If we it's not OK to use the plain directory
2857 * fallback, propagate all errors too */
21935150 2858 return r;
bb0c0d6f
LP
2859
2860 /* If we lack privileges to bind mount stuff, then let's gracefully
2861 * proceed for compat with container envs, and just use the final dir
2862 * as is. */
2863
2864 workspace_mounted = false;
2865 break;
2866 }
2867
2868 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
2869 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2870 if (r < 0)
2871 return r;
bb0c0d6f
LP
2872
2873 workspace_mounted = true;
2874 break;
2875 }
2876 }
2877 }
2878
2879 assert(!must_mount || workspace_mounted > 0);
2880 where = workspace_mounted ? workspace : final;
2881
e3a0a862
CG
2882 (void) label_fix_container(where, final, 0);
2883
d3dcf4e3 2884 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
2885 if (r < 0)
2886 return r;
2887
2888 if (workspace_mounted) {
2889 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
2890 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2891 if (r < 0)
2892 return r;
bb0c0d6f
LP
2893
2894 /* And mount it to the final place, read-only */
21935150
LP
2895 if (final_mounted)
2896 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2897 else
2898 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2899 if (r < 0)
2900 return r;
bb0c0d6f
LP
2901 } else {
2902 _cleanup_free_ char *parent = NULL;
2903
2904 /* If we do not have our own mount put used the plain directory fallback, then we need to
2905 * open access to the top-level credential directory and the per-service directory now */
2906
2907 parent = dirname_malloc(final);
2908 if (!parent)
2909 return -ENOMEM;
2910 if (chmod(parent, 0755) < 0)
2911 return -errno;
2912 }
2913
2914 return 0;
2915}
2916
2917static int setup_credentials(
2918 const ExecContext *context,
2919 const ExecParameters *params,
2920 const char *unit,
2921 uid_t uid) {
2922
2923 _cleanup_free_ char *p = NULL, *q = NULL;
2924 const char *i;
2925 int r;
2926
2927 assert(context);
2928 assert(params);
2929
2930 if (!exec_context_has_credentials(context))
2931 return 0;
2932
2933 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2934 return -EINVAL;
2935
2936 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2937 * and the subdir we mount over with a read-only file system readable by the service's user */
2938 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2939 if (!q)
2940 return -ENOMEM;
2941
2942 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2943 if (r < 0 && r != -EEXIST)
2944 return r;
2945
2946 p = path_join(q, unit);
2947 if (!p)
2948 return -ENOMEM;
2949
2950 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2951 if (r < 0 && r != -EEXIST)
2952 return r;
2953
2954 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2955 if (r < 0) {
2956 _cleanup_free_ char *t = NULL, *u = NULL;
2957
2958 /* If this is not a privilege or support issue then propagate the error */
2959 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2960 return r;
2961
2962 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2963 * it into place, so that users can't access half-initialized credential stores. */
2964 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2965 if (!t)
2966 return -ENOMEM;
2967
2968 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2969 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2970 * after it is fully set up */
2971 u = path_join(t, unit);
2972 if (!u)
2973 return -ENOMEM;
2974
2975 FOREACH_STRING(i, t, u) {
2976 r = mkdir_label(i, 0700);
2977 if (r < 0 && r != -EEXIST)
2978 return r;
2979 }
2980
2981 r = setup_credentials_internal(
2982 context,
2983 params,
d3dcf4e3 2984 unit,
bb0c0d6f
LP
2985 p, /* final mount point */
2986 u, /* temporary workspace to overmount */
2987 true, /* reuse the workspace if it is already a mount */
2988 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2989 uid);
2990
2991 (void) rmdir(u); /* remove the workspace again if we can. */
2992
2993 if (r < 0)
2994 return r;
2995
2996 } else if (r == 0) {
2997
2998 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2999 * we can use the same directory for all cases, after turning off propagation. Question
3000 * though is: where do we turn off propagation exactly, and where do we place the workspace
3001 * directory? We need some place that is guaranteed to be a mount point in the host, and
3002 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3003 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3004 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3005 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3006 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3007 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3008 * propagation on the former, and then overmount the latter.
3009 *
3010 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3011 * for this purpose, but there are few other candidates that work equally well for us, and
3012 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3013 * that no one else sees this should be OK to do. */
bb0c0d6f 3014
21935150
LP
3015 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3016 if (r < 0)
bb0c0d6f
LP
3017 goto child_fail;
3018
3019 r = setup_credentials_internal(
3020 context,
3021 params,
d3dcf4e3 3022 unit,
bb0c0d6f
LP
3023 p, /* final mount point */
3024 "/dev/shm", /* temporary workspace to overmount */
3025 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3026 true, /* insist that something is mounted, do not allow fallback to plain directory */
3027 uid);
3028 if (r < 0)
3029 goto child_fail;
3030
3031 _exit(EXIT_SUCCESS);
3032
3033 child_fail:
3034 _exit(EXIT_FAILURE);
3035 }
3036
3037 return 0;
3038}
3039
#if ENABLE_SMACK
/* Applies the SMACK label to the calling process: the explicitly configured one if there is any, otherwise
 * (only if a default label is compiled in) the exec label read from the executable, falling back to the
 * compiled-in default. Returns 0 on success, negative errno on failure. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        _cleanup_free_ char *exec_label = NULL;
        const char *label;

        /* A missing attribute or missing SMACK support is fine, we use the default label then. */
        r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
        if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                return r;

        label = exec_label ? exec_label : SMACK_DEFAULT_PROCESS_LABEL;

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;
#endif

        return 0;
}
#endif
cefc33ae 3071
6c47cd7d
LP
3072static int compile_bind_mounts(
3073 const ExecContext *context,
3074 const ExecParameters *params,
3075 BindMount **ret_bind_mounts,
da6053d0 3076 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3077 char ***ret_empty_directories) {
3078
3079 _cleanup_strv_free_ char **empty_directories = NULL;
3080 BindMount *bind_mounts;
5b10116e 3081 size_t n, h = 0;
6c47cd7d
LP
3082 int r;
3083
3084 assert(context);
3085 assert(params);
3086 assert(ret_bind_mounts);
3087 assert(ret_n_bind_mounts);
3088 assert(ret_empty_directories);
3089
3090 n = context->n_bind_mounts;
5b10116e 3091 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3092 if (!params->prefix[t])
3093 continue;
3094
211a3d87 3095 n += context->directories[t].n_items;
6c47cd7d
LP
3096 }
3097
3098 if (n <= 0) {
3099 *ret_bind_mounts = NULL;
3100 *ret_n_bind_mounts = 0;
3101 *ret_empty_directories = NULL;
3102 return 0;
3103 }
3104
3105 bind_mounts = new(BindMount, n);
3106 if (!bind_mounts)
3107 return -ENOMEM;
3108
5b10116e 3109 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3110 BindMount *item = context->bind_mounts + i;
3111 char *s, *d;
3112
3113 s = strdup(item->source);
3114 if (!s) {
3115 r = -ENOMEM;
3116 goto finish;
3117 }
3118
3119 d = strdup(item->destination);
3120 if (!d) {
3121 free(s);
3122 r = -ENOMEM;
3123 goto finish;
3124 }
3125
3126 bind_mounts[h++] = (BindMount) {
3127 .source = s,
3128 .destination = d,
3129 .read_only = item->read_only,
3130 .recursive = item->recursive,
3131 .ignore_enoent = item->ignore_enoent,
3132 };
3133 }
3134
5b10116e 3135 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3136 if (!params->prefix[t])
3137 continue;
3138
211a3d87 3139 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3140 continue;
3141
494d0247 3142 if (exec_directory_is_private(context, t) &&
74e12520 3143 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3144 char *private_root;
3145
3146 /* So this is for a dynamic user, and we need to make sure the process can access its own
3147 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3148 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3149
657ee2d8 3150 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3151 if (!private_root) {
3152 r = -ENOMEM;
3153 goto finish;
3154 }
3155
3156 r = strv_consume(&empty_directories, private_root);
a635a7ae 3157 if (r < 0)
6c47cd7d 3158 goto finish;
6c47cd7d
LP
3159 }
3160
211a3d87 3161 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3162 char *s, *d;
3163
494d0247 3164 if (exec_directory_is_private(context, t))
211a3d87 3165 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3166 else
211a3d87 3167 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3168 if (!s) {
3169 r = -ENOMEM;
3170 goto finish;
3171 }
3172
494d0247 3173 if (exec_directory_is_private(context, t) &&
74e12520 3174 exec_context_with_rootfs(context))
5609f688
YW
3175 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3176 * directory is not created on the root directory. So, let's bind-mount the directory
3177 * on the 'non-private' place. */
211a3d87 3178 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3179 else
3180 d = strdup(s);
6c47cd7d
LP
3181 if (!d) {
3182 free(s);
3183 r = -ENOMEM;
3184 goto finish;
3185 }
3186
3187 bind_mounts[h++] = (BindMount) {
3188 .source = s,
3189 .destination = d,
3190 .read_only = false,
9ce4e4b0 3191 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3192 .recursive = true,
3193 .ignore_enoent = false,
3194 };
3195 }
3196 }
3197
3198 assert(h == n);
3199
3200 *ret_bind_mounts = bind_mounts;
3201 *ret_n_bind_mounts = n;
ae2a15bc 3202 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3203
3204 return (int) n;
3205
3206finish:
3207 bind_mount_free_many(bind_mounts, h);
3208 return r;
3209}
3210
df61e79a
LB
3211/* ret_symlinks will contain a list of pairs src:dest that describes
3212 * the symlinks to create later on. For example, the symlinks needed
3213 * to safely give private directories to DynamicUser=1 users. */
3214static int compile_symlinks(
3215 const ExecContext *context,
3216 const ExecParameters *params,
3217 char ***ret_symlinks) {
3218
3219 _cleanup_strv_free_ char **symlinks = NULL;
3220 int r;
3221
3222 assert(context);
3223 assert(params);
3224 assert(ret_symlinks);
3225
3226 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3227 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3228 _cleanup_free_ char *private_path = NULL, *path = NULL;
3229 char **symlink;
df61e79a 3230
211a3d87
LB
3231 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3232 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3233
211a3d87
LB
3234 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3235 dst_abs = path_join(params->prefix[dt], *symlink);
3236 if (!src_abs || !dst_abs)
3237 return -ENOMEM;
df61e79a 3238
211a3d87
LB
3239 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3240 if (r < 0)
3241 return r;
3242 }
3243
3244 if (!exec_directory_is_private(context, dt))
3245 continue;
3246
3247 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3248 if (!private_path)
3249 return -ENOMEM;
3250
211a3d87 3251 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3252 if (!path)
3253 return -ENOMEM;
3254
3255 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3256 if (r < 0)
3257 return r;
3258 }
3259 }
3260
3261 *ret_symlinks = TAKE_PTR(symlinks);
3262
3263 return 0;
3264}
3265
4e677599
LP
3266static bool insist_on_sandboxing(
3267 const ExecContext *context,
3268 const char *root_dir,
3269 const char *root_image,
3270 const BindMount *bind_mounts,
3271 size_t n_bind_mounts) {
3272
4e677599
LP
3273 assert(context);
3274 assert(n_bind_mounts == 0 || bind_mounts);
3275
3276 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3277 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3278 * rearrange stuff in a way we cannot ignore gracefully. */
3279
3280 if (context->n_temporary_filesystems > 0)
3281 return true;
3282
3283 if (root_dir || root_image)
3284 return true;
3285
b3d13314
LB
3286 if (context->n_mount_images > 0)
3287 return true;
3288
4e677599
LP
3289 if (context->dynamic_user)
3290 return true;
3291
3292 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3293 * essential. */
5b10116e 3294 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3295 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3296 return true;
3297
91dd5f7c
LP
3298 if (context->log_namespace)
3299 return true;
3300
4e677599
LP
3301 return false;
3302}
3303
6818c54c 3304static int apply_mount_namespace(
34cf6c43 3305 const Unit *u,
9f71ba8d 3306 ExecCommandFlags command_flags,
6818c54c
LP
3307 const ExecContext *context,
3308 const ExecParameters *params,
7cc5ef5f
ZJS
3309 const ExecRuntime *runtime,
3310 char **error_path) {
6818c54c 3311
df61e79a 3312 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3313 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3314 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3315 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3316 NamespaceInfo ns_info;
165a31c0 3317 bool needs_sandboxing;
6c47cd7d 3318 BindMount *bind_mounts = NULL;
da6053d0 3319 size_t n_bind_mounts = 0;
6818c54c 3320 int r;
93c6bb51 3321
2b3c1b9e
DH
3322 assert(context);
3323
915e6d16
LP
3324 if (params->flags & EXEC_APPLY_CHROOT) {
3325 root_image = context->root_image;
3326
3327 if (!root_image)
3328 root_dir = context->root_directory;
3329 }
93c6bb51 3330
6c47cd7d
LP
3331 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3332 if (r < 0)
3333 return r;
3334
211a3d87 3335 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3336 r = compile_symlinks(context, params, &symlinks);
3337 if (r < 0)
3338 return r;
3339
9f71ba8d 3340 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3341 if (needs_sandboxing) {
3342 /* The runtime struct only contains the parent of the private /tmp,
3343 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3344 * that is sticky, and that's the one we want to use here.
3345 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3346
3347 if (context->private_tmp && runtime) {
56a13a49
ZJS
3348 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3349 tmp_dir = runtime->tmp_dir;
3350 else if (runtime->tmp_dir)
3351 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3352
3353 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3354 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3355 else if (runtime->var_tmp_dir)
56a13a49 3356 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3357 }
3358
b5a33299
YW
3359 ns_info = (NamespaceInfo) {
3360 .ignore_protect_paths = false,
3361 .private_dev = context->private_devices,
3362 .protect_control_groups = context->protect_control_groups,
3363 .protect_kernel_tunables = context->protect_kernel_tunables,
3364 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3365 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3366 .protect_hostname = context->protect_hostname,
5e98086d 3367 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3368 .private_mounts = context->private_mounts,
52b3d652
LP
3369 .protect_home = context->protect_home,
3370 .protect_system = context->protect_system,
4e399953
LP
3371 .protect_proc = context->protect_proc,
3372 .proc_subset = context->proc_subset,
80271a44 3373 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3374 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3375 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3376 };
ecf63c91 3377 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3378 /*
3379 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3380 * sandbox info, otherwise enforce it, don't ignore protected paths and
3381 * fail if we are enable to apply the sandbox inside the mount namespace.
3382 */
3383 ns_info = (NamespaceInfo) {
3384 .ignore_protect_paths = true,
3385 };
3386 else
3387 ns_info = (NamespaceInfo) {};
b5a33299 3388
37ed15d7
FB
3389 if (context->mount_flags == MS_SHARED)
3390 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3391
a631cbfa
LP
3392 if (exec_context_has_credentials(context) &&
3393 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3394 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3395 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3396 if (!creds_path) {
3397 r = -ENOMEM;
3398 goto finalize;
3399 }
bbb4e7f3
LP
3400 }
3401
5e8deb94
LB
3402 if (MANAGER_IS_SYSTEM(u->manager)) {
3403 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3404 if (!propagate_dir) {
3405 r = -ENOMEM;
3406 goto finalize;
3407 }
3408
5e8deb94 3409 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3410 if (!incoming_dir) {
3411 r = -ENOMEM;
3412 goto finalize;
3413 }
5e8deb94
LB
3414 }
3415
18d73705 3416 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3417 &ns_info, context->read_write_paths,
165a31c0
LP
3418 needs_sandboxing ? context->read_only_paths : NULL,
3419 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3420 needs_sandboxing ? context->exec_paths : NULL,
3421 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3422 empty_directories,
df61e79a 3423 symlinks,
6c47cd7d
LP
3424 bind_mounts,
3425 n_bind_mounts,
2abd4e38
YW
3426 context->temporary_filesystems,
3427 context->n_temporary_filesystems,
b3d13314
LB
3428 context->mount_images,
3429 context->n_mount_images,
56a13a49
ZJS
3430 tmp_dir,
3431 var_tmp_dir,
bbb4e7f3 3432 creds_path,
91dd5f7c 3433 context->log_namespace,
915e6d16 3434 context->mount_flags,
d4d55b0d
LB
3435 context->root_hash, context->root_hash_size, context->root_hash_path,
3436 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3437 context->root_verity,
93f59701
LB
3438 context->extension_images,
3439 context->n_extension_images,
5e8deb94
LB
3440 propagate_dir,
3441 incoming_dir,
3bdc25a4 3442 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3443 error_path);
93c6bb51 3444
1beab8b0 3445 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3446 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3447 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3448 * completely different execution environment. */
aca835ed 3449 if (r == -ENOANO) {
4e677599
LP
3450 if (insist_on_sandboxing(
3451 context,
3452 root_dir, root_image,
3453 bind_mounts,
3454 n_bind_mounts)) {
3455 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3456 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3457 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3458
3459 r = -EOPNOTSUPP;
3460 } else {
aca835ed 3461 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3462 r = 0;
aca835ed 3463 }
93c6bb51
DH
3464 }
3465
8062e643 3466finalize:
4e677599 3467 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3468 return r;
3469}
3470
915e6d16
LP
3471static int apply_working_directory(
3472 const ExecContext *context,
3473 const ExecParameters *params,
3474 const char *home,
376fecf6 3475 int *exit_status) {
915e6d16 3476
6732edab 3477 const char *d, *wd;
2b3c1b9e
DH
3478
3479 assert(context);
376fecf6 3480 assert(exit_status);
2b3c1b9e 3481
6732edab
LP
3482 if (context->working_directory_home) {
3483
376fecf6
LP
3484 if (!home) {
3485 *exit_status = EXIT_CHDIR;
6732edab 3486 return -ENXIO;
376fecf6 3487 }
6732edab 3488
2b3c1b9e 3489 wd = home;
6732edab 3490
14eb3285
LP
3491 } else
3492 wd = empty_to_root(context->working_directory);
e7f1e7c6 3493
fa97f630 3494 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3495 d = wd;
fa97f630 3496 else
3b0e5bb5 3497 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3498
376fecf6
LP
3499 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3500 *exit_status = EXIT_CHDIR;
2b3c1b9e 3501 return -errno;
376fecf6 3502 }
e7f1e7c6
DH
3503
3504 return 0;
3505}
3506
fa97f630
JB
3507static int apply_root_directory(
3508 const ExecContext *context,
3509 const ExecParameters *params,
3510 const bool needs_mount_ns,
3511 int *exit_status) {
3512
3513 assert(context);
3514 assert(exit_status);
3515
5b10116e 3516 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3517 if (!needs_mount_ns && context->root_directory)
3518 if (chroot(context->root_directory) < 0) {
3519 *exit_status = EXIT_CHROOT;
3520 return -errno;
3521 }
fa97f630
JB
3522
3523 return 0;
3524}
3525
b1edf445 3526static int setup_keyring(
34cf6c43 3527 const Unit *u,
b1edf445
LP
3528 const ExecContext *context,
3529 const ExecParameters *p,
3530 uid_t uid, gid_t gid) {
3531
74dd6b51 3532 key_serial_t keyring;
e64c2d0b
DJL
3533 int r = 0;
3534 uid_t saved_uid;
3535 gid_t saved_gid;
74dd6b51
LP
3536
3537 assert(u);
b1edf445 3538 assert(context);
74dd6b51
LP
3539 assert(p);
3540
3541 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3542 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3543 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3544 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3545 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3546 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3547
b1edf445
LP
3548 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3549 return 0;
3550
e64c2d0b
DJL
3551 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3552 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3553 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3554 * & group is just as nasty as acquiring a reference to the user keyring. */
3555
3556 saved_uid = getuid();
3557 saved_gid = getgid();
3558
3559 if (gid_is_valid(gid) && gid != saved_gid) {
3560 if (setregid(gid, -1) < 0)
3561 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3562 }
3563
3564 if (uid_is_valid(uid) && uid != saved_uid) {
3565 if (setreuid(uid, -1) < 0) {
3566 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3567 goto out;
3568 }
3569 }
3570
74dd6b51
LP
3571 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3572 if (keyring == -1) {
3573 if (errno == ENOSYS)
8002fb97 3574 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3575 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3576 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3577 else if (errno == EDQUOT)
8002fb97 3578 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3579 else
e64c2d0b 3580 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3581
e64c2d0b 3582 goto out;
74dd6b51
LP
3583 }
3584
e64c2d0b
DJL
3585 /* When requested link the user keyring into the session keyring. */
3586 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3587
3588 if (keyctl(KEYCTL_LINK,
3589 KEY_SPEC_USER_KEYRING,
3590 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3591 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3592 goto out;
3593 }
3594 }
3595
3596 /* Restore uid/gid back */
3597 if (uid_is_valid(uid) && uid != saved_uid) {
3598 if (setreuid(saved_uid, -1) < 0) {
3599 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3600 goto out;
3601 }
3602 }
3603
3604 if (gid_is_valid(gid) && gid != saved_gid) {
3605 if (setregid(saved_gid, -1) < 0)
3606 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3607 }
3608
3609 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3610 if (!sd_id128_is_null(u->invocation_id)) {
3611 key_serial_t key;
3612
3613 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3614 if (key == -1)
8002fb97 3615 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3616 else {
3617 if (keyctl(KEYCTL_SETPERM, key,
3618 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3619 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3620 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3621 }
3622 }
3623
e64c2d0b 3624out:
37b22b3b 3625 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3626 /* no extra logging, as only the first already reported error matters */
3627 if (getuid() != saved_uid)
3628 (void) setreuid(saved_uid, -1);
b1edf445 3629
e64c2d0b
DJL
3630 if (getgid() != saved_gid)
3631 (void) setregid(saved_gid, -1);
b1edf445 3632
e64c2d0b 3633 return r;
74dd6b51
LP
3634}
3635
/* Appends those fds of the pair that are valid (i.e. non-negative) to 'array', in order, advancing the fill
 * counter *n for each fd stored. The caller has to make sure the array has room for two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3646
a34ceba6
LP
3647static int close_remaining_fds(
3648 const ExecParameters *params,
34cf6c43
YW
3649 const ExecRuntime *runtime,
3650 const DynamicCreds *dcreds,
00d9ef85 3651 int user_lookup_fd,
a34ceba6 3652 int socket_fd,
5b8d1f6b 3653 const int *fds, size_t n_fds) {
a34ceba6 3654
da6053d0 3655 size_t n_dont_close = 0;
00d9ef85 3656 int dont_close[n_fds + 12];
a34ceba6
LP
3657
3658 assert(params);
3659
3660 if (params->stdin_fd >= 0)
3661 dont_close[n_dont_close++] = params->stdin_fd;
3662 if (params->stdout_fd >= 0)
3663 dont_close[n_dont_close++] = params->stdout_fd;
3664 if (params->stderr_fd >= 0)
3665 dont_close[n_dont_close++] = params->stderr_fd;
3666
3667 if (socket_fd >= 0)
3668 dont_close[n_dont_close++] = socket_fd;
3669 if (n_fds > 0) {
3670 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3671 n_dont_close += n_fds;
3672 }
3673
a70581ff 3674 if (runtime) {
29206d46 3675 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3676 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3677 }
29206d46
LP
3678
3679 if (dcreds) {
3680 if (dcreds->user)
3681 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3682 if (dcreds->group)
3683 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3684 }
3685
00d9ef85
LP
3686 if (user_lookup_fd >= 0)
3687 dont_close[n_dont_close++] = user_lookup_fd;
3688
a34ceba6
LP
3689 return close_all_fds(dont_close, n_dont_close);
3690}
3691
00d9ef85
LP
3692static int send_user_lookup(
3693 Unit *unit,
3694 int user_lookup_fd,
3695 uid_t uid,
3696 gid_t gid) {
3697
3698 assert(unit);
3699
3700 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3701 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3702 * specified. */
3703
3704 if (user_lookup_fd < 0)
3705 return 0;
3706
3707 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3708 return 0;
3709
3710 if (writev(user_lookup_fd,
3711 (struct iovec[]) {
e6a7ec4b
LP
3712 IOVEC_INIT(&uid, sizeof(uid)),
3713 IOVEC_INIT(&gid, sizeof(gid)),
3714 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3715 return -errno;
3716
3717 return 0;
3718}
3719
6732edab
LP
3720static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3721 int r;
3722
3723 assert(c);
3724 assert(home);
3725 assert(buf);
3726
3727 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3728
3729 if (*home)
3730 return 0;
3731
3732 if (!c->working_directory_home)
3733 return 0;
3734
6732edab
LP
3735 r = get_home_dir(buf);
3736 if (r < 0)
3737 return r;
3738
3739 *home = *buf;
3740 return 1;
3741}
3742
da50b85a
LP
3743static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3744 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3745 int r;
3746
3747 assert(c);
3748 assert(p);
3749 assert(ret);
3750
3751 assert(c->dynamic_user);
3752
3753 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3754 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3755 * directories. */
3756
5b10116e 3757 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3758 if (t == EXEC_DIRECTORY_CONFIGURATION)
3759 continue;
3760
3761 if (!p->prefix[t])
3762 continue;
3763
211a3d87 3764 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3765 char *e;
3766
494d0247 3767 if (exec_directory_is_private(c, t))
211a3d87 3768 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3769 else
211a3d87 3770 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3771 if (!e)
3772 return -ENOMEM;
3773
3774 r = strv_consume(&list, e);
3775 if (r < 0)
3776 return r;
3777 }
3778 }
3779
ae2a15bc 3780 *ret = TAKE_PTR(list);
da50b85a
LP
3781
3782 return 0;
3783}
3784
78f93209
LP
3785static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3786 bool using_subcgroup;
3787 char *p;
3788
3789 assert(params);
3790 assert(ret);
3791
3792 if (!params->cgroup_path)
3793 return -EINVAL;
3794
3795 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3796 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3797 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3798 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3799 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3800 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3801 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3802 * flag, which is only passed for the former statements, not for the latter. */
3803
3804 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3805 if (using_subcgroup)
657ee2d8 3806 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3807 else
3808 p = strdup(params->cgroup_path);
3809 if (!p)
3810 return -ENOMEM;
3811
3812 *ret = p;
3813 return using_subcgroup;
3814}
3815
e2b2fb7f
MS
3816static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3817 _cleanup_(cpu_set_reset) CPUSet s = {};
3818 int r;
3819
3820 assert(c);
3821 assert(ret);
3822
3823 if (!c->numa_policy.nodes.set) {
3824 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3825 return 0;
3826 }
3827
3828 r = numa_to_cpu_set(&c->numa_policy, &s);
3829 if (r < 0)
3830 return r;
3831
3832 cpu_set_reset(ret);
3833
3834 return cpu_set_add_all(ret, &s);
3835}
3836
3837bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3838 assert(c);
3839
3840 return c->cpu_affinity_from_numa;
3841}
3842
1da37e58
ZJS
3843static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3844 int r;
3845
3846 assert(fds);
3847 assert(n_fds);
3848 assert(*n_fds < fds_size);
3849 assert(ret_fd);
3850
3851 if (fd < 0) {
3852 *ret_fd = -1;
3853 return 0;
3854 }
3855
3856 if (fd < 3 + (int) *n_fds) {
3857 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3858 * the fds we pass to the process (or which are closed only during execve). */
3859
3860 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3861 if (r < 0)
3862 return -errno;
3863
3864 CLOSE_AND_REPLACE(fd, r);
3865 }
3866
3867 *ret_fd = fds[*n_fds] = fd;
3868 (*n_fds) ++;
3869 return 1;
3870}
3871
ff0af2a1 3872static int exec_child(
f2341e0a 3873 Unit *unit,
34cf6c43 3874 const ExecCommand *command,
ff0af2a1
LP
3875 const ExecContext *context,
3876 const ExecParameters *params,
3877 ExecRuntime *runtime,
29206d46 3878 DynamicCreds *dcreds,
ff0af2a1 3879 int socket_fd,
2caa38e9 3880 const int named_iofds[static 3],
4c47affc 3881 int *fds,
da6053d0 3882 size_t n_socket_fds,
25b583d7 3883 size_t n_storage_fds,
ff0af2a1 3884 char **files_env,
00d9ef85 3885 int user_lookup_fd,
12145637 3886 int *exit_status) {
d35fbf6b 3887
8c35c10d 3888 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3889 int r, ngids = 0, exec_fd;
4d885bd3
DH
3890 _cleanup_free_ gid_t *supplementary_gids = NULL;
3891 const char *username = NULL, *groupname = NULL;
5686391b 3892 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3893 const char *home = NULL, *shell = NULL;
7ca69792 3894 char **final_argv = NULL;
7bce046b
LP
3895 dev_t journal_stream_dev = 0;
3896 ino_t journal_stream_ino = 0;
5749f855 3897 bool userns_set_up = false;
165a31c0
LP
3898 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3899 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3900 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3901 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3902#if HAVE_SELINUX
7f59dd35 3903 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3904 bool use_selinux = false;
ecfbc84f 3905#endif
f9fa32f0 3906#if ENABLE_SMACK
43b1f709 3907 bool use_smack = false;
ecfbc84f 3908#endif
349cc4a5 3909#if HAVE_APPARMOR
43b1f709 3910 bool use_apparmor = false;
ecfbc84f 3911#endif
5749f855
AZ
3912 uid_t saved_uid = getuid();
3913 gid_t saved_gid = getgid();
fed1e721
LP
3914 uid_t uid = UID_INVALID;
3915 gid_t gid = GID_INVALID;
1da37e58
ZJS
3916 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3917 n_keep_fds; /* total number of fds not to close */
165a31c0 3918 int secure_bits;
afb11bf1
DG
3919 _cleanup_free_ gid_t *gids_after_pam = NULL;
3920 int ngids_after_pam = 0;
034c6ed7 3921
f2341e0a 3922 assert(unit);
5cb5a6ff
LP
3923 assert(command);
3924 assert(context);
d35fbf6b 3925 assert(params);
ff0af2a1 3926 assert(exit_status);
d35fbf6b
DM
3927
3928 rename_process_from_path(command->path);
3929
9c274488
LP
3930 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3931 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3932 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3933 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3934 SIGNALS_IGNORE);
d35fbf6b
DM
3935
3936 if (context->ignore_sigpipe)
9c274488 3937 (void) ignore_signals(SIGPIPE);
d35fbf6b 3938
ff0af2a1
LP
3939 r = reset_signal_mask();
3940 if (r < 0) {
3941 *exit_status = EXIT_SIGNAL_MASK;
12145637 3942 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3943 }
034c6ed7 3944
d35fbf6b
DM
3945 if (params->idle_pipe)
3946 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3947
2c027c62
LP
3948 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3949 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3950 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3951 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3952
d35fbf6b 3953 log_forget_fds();
2c027c62 3954 log_set_open_when_needed(true);
4f2d528d 3955
40a80078
LP
3956 /* In case anything used libc syslog(), close this here, too */
3957 closelog();
3958
b1994387 3959 int keep_fds[n_fds + 3];
1da37e58
ZJS
3960 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3961 n_keep_fds = n_fds;
3962
3963 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3964 if (r < 0) {
3965 *exit_status = EXIT_FDS;
3966 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3967 }
3968
b1994387
ILG
3969#if HAVE_LIBBPF
3970 if (MANAGER_IS_SYSTEM(unit->manager) && lsm_bpf_supported()) {
3971 int bpf_map_fd = -1;
3972
3973 bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
3974 if (bpf_map_fd < 0) {
3975 *exit_status = EXIT_FDS;
3976 return log_unit_error_errno(unit, r, "Failed to get restrict filesystems BPF map fd: %m");
3977 }
3978
3979 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
3980 if (r < 0) {
3981 *exit_status = EXIT_FDS;
3982 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3983 }
3984 }
3985#endif
3986
1da37e58 3987 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3988 if (r < 0) {
3989 *exit_status = EXIT_FDS;
12145637 3990 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3991 }
3992
0af07108
ZJS
3993 if (!context->same_pgrp &&
3994 setsid() < 0) {
3995 *exit_status = EXIT_SETSID;
3996 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3997 }
9e2f7c11 3998
1e22b5cd 3999 exec_context_tty_reset(context, params);
d35fbf6b 4000
c891efaf 4001 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4002 _cleanup_free_ char *cmdline = NULL;
4003
4ef15008 4004 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4005 if (!cmdline) {
0460aa5c 4006 *exit_status = EXIT_MEMORY;
12145637 4007 return log_oom();
3b20f877 4008 }
d35fbf6b 4009
4ef15008 4010 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4011 if (r != CONFIRM_EXECUTE) {
4012 if (r == CONFIRM_PRETEND_SUCCESS) {
4013 *exit_status = EXIT_SUCCESS;
4014 return 0;
4015 }
ff0af2a1 4016 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4017 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4018 "Execution cancelled by the user");
d35fbf6b
DM
4019 }
4020 }
1a63a750 4021
d521916d
LP
4022 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4023 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4024 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4025 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4026 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4027 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4028 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4029 *exit_status = EXIT_MEMORY;
4030 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4031 }
4032
29206d46 4033 if (context->dynamic_user && dcreds) {
da50b85a 4034 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4035
d521916d 4036 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4037 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4038 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4039 *exit_status = EXIT_USER;
12145637 4040 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4041 }
4042
da50b85a
LP
4043 r = compile_suggested_paths(context, params, &suggested_paths);
4044 if (r < 0) {
4045 *exit_status = EXIT_MEMORY;
4046 return log_oom();
4047 }
4048
4049 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4050 if (r < 0) {
4051 *exit_status = EXIT_USER;
d85ff944
YW
4052 if (r == -EILSEQ)
4053 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4054 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4055 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4056 }
524daa8c 4057
70dd455c 4058 if (!uid_is_valid(uid)) {
29206d46 4059 *exit_status = EXIT_USER;
d85ff944 4060 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4061 }
4062
4063 if (!gid_is_valid(gid)) {
4064 *exit_status = EXIT_USER;
d85ff944 4065 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4066 }
5bc7452b 4067
29206d46
LP
4068 if (dcreds->user)
4069 username = dcreds->user->name;
4070
4071 } else {
4d885bd3
DH
4072 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4073 if (r < 0) {
4074 *exit_status = EXIT_USER;
12145637 4075 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4076 }
5bc7452b 4077
4d885bd3
DH
4078 r = get_fixed_group(context, &groupname, &gid);
4079 if (r < 0) {
4080 *exit_status = EXIT_GROUP;
12145637 4081 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4082 }
cdc5d5c5 4083 }
29206d46 4084
cdc5d5c5
DH
4085 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4086 r = get_supplementary_groups(context, username, groupname, gid,
4087 &supplementary_gids, &ngids);
4088 if (r < 0) {
4089 *exit_status = EXIT_GROUP;
12145637 4090 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4091 }
5bc7452b 4092
00d9ef85
LP
4093 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4094 if (r < 0) {
4095 *exit_status = EXIT_USER;
12145637 4096 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4097 }
4098
4099 user_lookup_fd = safe_close(user_lookup_fd);
4100
6732edab
LP
4101 r = acquire_home(context, uid, &home, &home_buffer);
4102 if (r < 0) {
4103 *exit_status = EXIT_CHDIR;
12145637 4104 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4105 }
4106
d35fbf6b
DM
4107 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4108 * must make sure to drop O_NONBLOCK */
4109 if (socket_fd >= 0)
a34ceba6 4110 (void) fd_nonblock(socket_fd, false);
acbb0225 4111
4c70a4a7
MS
4112 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4113 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4114 if (params->cgroup_path) {
4115 _cleanup_free_ char *p = NULL;
4116
4117 r = exec_parameters_get_cgroup_path(params, &p);
4118 if (r < 0) {
4119 *exit_status = EXIT_CGROUP;
4120 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4121 }
4122
4123 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
4124 if (r < 0) {
4125 *exit_status = EXIT_CGROUP;
4126 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4127 }
4128 }
4129
a8d08f39 4130 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4131 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4132 if (r < 0) {
4133 *exit_status = EXIT_NETWORK;
4134 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4135 }
4136 }
4137
a70581ff
XR
4138 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4139 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4140 if (r < 0) {
4141 *exit_status = EXIT_NAMESPACE;
4142 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4143 }
4144 }
4145
52c239d7 4146 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4147 if (r < 0) {
4148 *exit_status = EXIT_STDIN;
12145637 4149 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4150 }
034c6ed7 4151
52c239d7 4152 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4153 if (r < 0) {
4154 *exit_status = EXIT_STDOUT;
12145637 4155 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4156 }
4157
52c239d7 4158 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4159 if (r < 0) {
4160 *exit_status = EXIT_STDERR;
12145637 4161 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4162 }
4163
d35fbf6b 4164 if (context->oom_score_adjust_set) {
9f8168eb
LP
4165 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4166 * prohibit write access to this file, and we shouldn't trip up over that. */
4167 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4168 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4169 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4170 else if (r < 0) {
ff0af2a1 4171 *exit_status = EXIT_OOM_ADJUST;
12145637 4172 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4173 }
d35fbf6b
DM
4174 }
4175
ad21e542
ZJS
4176 if (context->coredump_filter_set) {
4177 r = set_coredump_filter(context->coredump_filter);
4178 if (ERRNO_IS_PRIVILEGE(r))
4179 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4180 else if (r < 0)
4181 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4182 }
4183
39090201
DJL
4184 if (context->nice_set) {
4185 r = setpriority_closest(context->nice);
4186 if (r < 0)
4187 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4188 }
613b411c 4189
d35fbf6b
DM
4190 if (context->cpu_sched_set) {
4191 struct sched_param param = {
4192 .sched_priority = context->cpu_sched_priority,
4193 };
4194
ff0af2a1
LP
4195 r = sched_setscheduler(0,
4196 context->cpu_sched_policy |
4197 (context->cpu_sched_reset_on_fork ?
4198 SCHED_RESET_ON_FORK : 0),
4199 &param);
4200 if (r < 0) {
4201 *exit_status = EXIT_SETSCHEDULER;
12145637 4202 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4203 }
d35fbf6b 4204 }
fc9b2a84 4205
e2b2fb7f
MS
4206 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4207 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4208 const CPUSet *cpu_set;
4209
4210 if (context->cpu_affinity_from_numa) {
4211 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4212 if (r < 0) {
4213 *exit_status = EXIT_CPUAFFINITY;
4214 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4215 }
4216
4217 cpu_set = &converted_cpu_set;
4218 } else
4219 cpu_set = &context->cpu_set;
4220
4221 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4222 *exit_status = EXIT_CPUAFFINITY;
12145637 4223 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4224 }
e2b2fb7f 4225 }
034c6ed7 4226
b070c7c0
MS
4227 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4228 r = apply_numa_policy(&context->numa_policy);
4229 if (r == -EOPNOTSUPP)
33fe9e3f 4230 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4231 else if (r < 0) {
4232 *exit_status = EXIT_NUMA_POLICY;
4233 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4234 }
4235 }
4236
d35fbf6b
DM
4237 if (context->ioprio_set)
4238 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4239 *exit_status = EXIT_IOPRIO;
12145637 4240 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4241 }
da726a4d 4242
d35fbf6b
DM
4243 if (context->timer_slack_nsec != NSEC_INFINITY)
4244 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4245 *exit_status = EXIT_TIMERSLACK;
12145637 4246 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4247 }
9eba9da4 4248
21022b9d
LP
4249 if (context->personality != PERSONALITY_INVALID) {
4250 r = safe_personality(context->personality);
4251 if (r < 0) {
ff0af2a1 4252 *exit_status = EXIT_PERSONALITY;
12145637 4253 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4254 }
21022b9d 4255 }
94f04347 4256
33331d11
VB
4257 if (context->utmp_id) {
4258 const char *line = context->tty_path ?
4259 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4260 NULL;
df0ff127 4261 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4262 line,
023a4f67
LP
4263 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4264 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4265 USER_PROCESS,
6a93917d 4266 username);
33331d11 4267 }
d35fbf6b 4268
08f67696 4269 if (uid_is_valid(uid)) {
ff0af2a1
LP
4270 r = chown_terminal(STDIN_FILENO, uid);
4271 if (r < 0) {
4272 *exit_status = EXIT_STDIN;
12145637 4273 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4274 }
d35fbf6b 4275 }
8e274523 4276
4e1dfa45 4277 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4278 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4279 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4280 * touch a single hierarchy too. */
584b8688 4281 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4282 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4283 if (r < 0) {
4284 *exit_status = EXIT_CGROUP;
12145637 4285 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4286 }
d35fbf6b 4287 }
034c6ed7 4288
211a3d87
LB
4289 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4290
5b10116e 4291 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4292 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4293 if (r < 0)
4294 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4295 }
94f04347 4296
bb0c0d6f
LP
4297 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4298 r = setup_credentials(context, params, unit->id, uid);
4299 if (r < 0) {
4300 *exit_status = EXIT_CREDENTIALS;
4301 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4302 }
4303 }
4304
7bce046b 4305 r = build_environment(
fd63e712 4306 unit,
7bce046b
LP
4307 context,
4308 params,
4309 n_fds,
4310 home,
4311 username,
4312 shell,
4313 journal_stream_dev,
4314 journal_stream_ino,
4315 &our_env);
2065ca69
JW
4316 if (r < 0) {
4317 *exit_status = EXIT_MEMORY;
12145637 4318 return log_oom();
2065ca69
JW
4319 }
4320
4321 r = build_pass_environment(context, &pass_env);
4322 if (r < 0) {
4323 *exit_status = EXIT_MEMORY;
12145637 4324 return log_oom();
2065ca69
JW
4325 }
4326
8c35c10d 4327 /* The PATH variable is set to the default path in params->environment.
4328 * However, this is overridden if user specified fields have PATH set.
4329 * The intention is to also override PATH if the user does
4330 * not specify PATH and the user has specified ExecSearchPath
4331 */
4332
4333 if (!strv_isempty(context->exec_search_path)) {
4334 _cleanup_free_ char *joined = NULL;
4335
4336 joined = strv_join(context->exec_search_path, ":");
4337 if (!joined) {
4338 *exit_status = EXIT_MEMORY;
4339 return log_oom();
4340 }
4341
4342 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4343 if (r < 0) {
4344 *exit_status = EXIT_MEMORY;
4345 return log_oom();
4346 }
4347 }
4348
4ab3d29f 4349 accum_env = strv_env_merge(params->environment,
2065ca69 4350 our_env,
8c35c10d 4351 joined_exec_search_path,
2065ca69
JW
4352 pass_env,
4353 context->environment,
44e5d006 4354 files_env);
2065ca69
JW
4355 if (!accum_env) {
4356 *exit_status = EXIT_MEMORY;
12145637 4357 return log_oom();
2065ca69 4358 }
1280503b 4359 accum_env = strv_env_clean(accum_env);
2065ca69 4360
096424d1 4361 (void) umask(context->umask);
b213e1c1 4362
b1edf445 4363 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4364 if (r < 0) {
4365 *exit_status = EXIT_KEYRING;
12145637 4366 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4367 }
4368
165a31c0 4369 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4370 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4371
165a31c0
LP
4372 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4373 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4374
165a31c0
LP
4375 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4376 if (needs_ambient_hack)
4377 needs_setuid = false;
4378 else
4379 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4380
4381 if (needs_sandboxing) {
7f18ef0a
FK
4382 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4383 * present. The actual MAC context application will happen later, as late as possible, to avoid
4384 * impacting our own code paths. */
4385
349cc4a5 4386#if HAVE_SELINUX
43b1f709 4387 use_selinux = mac_selinux_use();
7f18ef0a 4388#endif
f9fa32f0 4389#if ENABLE_SMACK
43b1f709 4390 use_smack = mac_smack_use();
7f18ef0a 4391#endif
349cc4a5 4392#if HAVE_APPARMOR
43b1f709 4393 use_apparmor = mac_apparmor_use();
7f18ef0a 4394#endif
165a31c0 4395 }
7f18ef0a 4396
ce932d2d
LP
4397 if (needs_sandboxing) {
4398 int which_failed;
4399
4400 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4401 * is set here. (See below.) */
4402
4403 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4404 if (r < 0) {
4405 *exit_status = EXIT_LIMITS;
4406 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4407 }
4408 }
4409
0af07108 4410 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4411 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4412 * wins here. (See above.) */
4413
1da37e58 4414 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4415 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4416 if (r < 0) {
4417 *exit_status = EXIT_PAM;
4418 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4419 }
ac45f971 4420
0af07108
ZJS
4421 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4422 if (ngids_after_pam < 0) {
4423 *exit_status = EXIT_MEMORY;
4424 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4425 }
b213e1c1 4426 }
5749f855 4427
0af07108 4428 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4429 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4430 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4431 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4432
4433 userns_set_up = true;
4434 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4435 if (r < 0) {
4436 *exit_status = EXIT_USER;
4437 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4438 }
4439 }
4440
a8d08f39
LP
4441 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4442
6e2d7c4f 4443 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4444 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4445 if (r == -EPERM)
4446 log_unit_warning_errno(unit, r,
4447 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4448 else if (r < 0) {
6e2d7c4f
MS
4449 *exit_status = EXIT_NETWORK;
4450 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4451 }
a8d08f39
LP
4452 } else if (context->network_namespace_path) {
4453 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4454 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4455 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4456 } else
4457 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4458 }
169c1bda 4459
a70581ff
XR
4460 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4461
4462 if (ns_type_supported(NAMESPACE_IPC)) {
4463 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4464 if (r == -EPERM)
4465 log_unit_warning_errno(unit, r,
4466 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4467 else if (r < 0) {
4468 *exit_status = EXIT_NAMESPACE;
4469 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4470 }
4471 } else if (context->ipc_namespace_path) {
4472 *exit_status = EXIT_NAMESPACE;
4473 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4474 "IPCNamespacePath= is not supported, refusing.");
4475 } else
4476 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4477 }
4478
ee818b89 4479 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4480 _cleanup_free_ char *error_path = NULL;
4481
9f71ba8d 4482 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4483 if (r < 0) {
4484 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4485 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4486 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4487 }
d35fbf6b 4488 }
81a2b7ce 4489
daf8f72b
LP
4490 if (needs_sandboxing) {
4491 r = apply_protect_hostname(unit, context, exit_status);
4492 if (r < 0)
4493 return r;
aecd5ac6
TM
4494 }
4495
5749f855
AZ
4496 /* Drop groups as early as possible.
4497 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4498 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4499 if (needs_setuid) {
afb11bf1
DG
4500 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4501 int ngids_to_enforce = 0;
4502
4503 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4504 ngids,
4505 gids_after_pam,
4506 ngids_after_pam,
4507 &gids_to_enforce);
4508 if (ngids_to_enforce < 0) {
4509 *exit_status = EXIT_MEMORY;
4510 return log_unit_error_errno(unit,
4511 ngids_to_enforce,
4512 "Failed to merge group lists. Group membership might be incorrect: %m");
4513 }
4514
4515 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4516 if (r < 0) {
4517 *exit_status = EXIT_GROUP;
12145637 4518 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4519 }
165a31c0 4520 }
096424d1 4521
5749f855
AZ
4522 /* If the user namespace was not set up above, try to do it now.
4523 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4524 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4525 * case of mount namespaces being less privileged when the mount point list is copied from a
4526 * different user namespace). */
9008e1ac 4527
5749f855
AZ
4528 if (needs_sandboxing && context->private_users && !userns_set_up) {
4529 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4530 if (r < 0) {
4531 *exit_status = EXIT_USER;
4532 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4533 }
4534 }
4535
9f71ba8d
ZJS
4536 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4537 * shall execute. */
4538
4539 _cleanup_free_ char *executable = NULL;
b83d5050 4540 _cleanup_close_ int executable_fd = -1;
8c35c10d 4541 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4542 if (r < 0) {
4543 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4544 log_unit_struct_errno(unit, LOG_INFO, r,
4545 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4546 LOG_UNIT_INVOCATION_ID(unit),
4547 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4548 command->path),
4549 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4550 return 0;
4551 }
4552
4553 *exit_status = EXIT_EXEC;
c2503e35
RH
4554
4555 return log_unit_struct_errno(unit, LOG_INFO, r,
4556 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4557 LOG_UNIT_INVOCATION_ID(unit),
4558 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4559 command->path),
4560 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4561 }
4562
b83d5050
ZJS
4563 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4564 if (r < 0) {
4565 *exit_status = EXIT_FDS;
4566 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4567 }
4568
9f71ba8d 4569#if HAVE_SELINUX
49590d67
MS
4570 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4571 int fd = -1;
4572
4573 if (socket_fd >= 0)
4574 fd = socket_fd;
4575 else if (params->n_socket_fds == 1)
4576 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4577 * use context from that fd to compute the label. */
4578 fd = params->fds[0];
4579
4580 if (fd >= 0) {
4581 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4582 if (r < 0) {
4583 if (!context->selinux_context_ignore) {
4584 *exit_status = EXIT_SELINUX_CONTEXT;
4585 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4586 }
4587 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4588 }
9f71ba8d
ZJS
4589 }
4590 }
4591#endif
4592
165a31c0 4593 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4594 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4595 * however if we have it as we want to keep it open until the final execve(). */
4596
1da37e58 4597 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4598 if (r >= 0)
4599 r = shift_fds(fds, n_fds);
4600 if (r >= 0)
25b583d7 4601 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4602 if (r < 0) {
4603 *exit_status = EXIT_FDS;
12145637 4604 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4605 }
e66cf1a3 4606
5686391b
LP
4607 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4608 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4609 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4610 * came this far. */
4611
165a31c0 4612 secure_bits = context->secure_bits;
e66cf1a3 4613
165a31c0
LP
4614 if (needs_sandboxing) {
4615 uint64_t bset;
e66cf1a3 4616
ce932d2d
LP
4617 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4618 * requested. (Note this is placed after the general resource limit initialization, see
4619 * above, in order to take precedence.) */
f4170c67
LP
4620 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4621 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4622 *exit_status = EXIT_LIMITS;
12145637 4623 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4624 }
4625 }
4626
37ac2744
JB
4627#if ENABLE_SMACK
4628 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4629 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4630 if (use_smack) {
b83d5050 4631 r = setup_smack(context, executable_fd);
29ff6247 4632 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4633 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4634 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4635 }
4636 }
4637#endif
4638
165a31c0
LP
4639 bset = context->capability_bounding_set;
4640 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4641 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4642 * instead of us doing that */
4643 if (needs_ambient_hack)
4644 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4645 (UINT64_C(1) << CAP_SETUID) |
4646 (UINT64_C(1) << CAP_SETGID);
4647
4648 if (!cap_test_all(bset)) {
4649 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4650 if (r < 0) {
4651 *exit_status = EXIT_CAPABILITIES;
12145637 4652 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4653 }
4c2630eb 4654 }
3b8bddde 4655
16fcb191
TK
4656 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4657 * keep-caps set.
4658 * To be able to raise the ambient capabilities after setresuid() they have to be
4659 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4660 * After setresuid() the ambient capabilities can be raised as they are present in
4661 * the permitted and inheritable set. However it is possible that someone wants to
4662 * set ambient capabilities without changing the user, so we also set the ambient
4663 * capabilities here.
4664 * The requested ambient capabilities are raised in the inheritable set if the
4665 * second argument is true. */
943800f4 4666 if (!needs_ambient_hack) {
755d4b67
IP
4667 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4668 if (r < 0) {
4669 *exit_status = EXIT_CAPABILITIES;
12145637 4670 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4671 }
755d4b67 4672 }
165a31c0 4673 }
755d4b67 4674
fa97f630
JB
4675 /* chroot to root directory first, before we lose the ability to chroot */
4676 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4677 if (r < 0)
4678 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4679
165a31c0 4680 if (needs_setuid) {
08f67696 4681 if (uid_is_valid(uid)) {
ff0af2a1
LP
4682 r = enforce_user(context, uid);
4683 if (r < 0) {
4684 *exit_status = EXIT_USER;
12145637 4685 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4686 }
165a31c0
LP
4687
4688 if (!needs_ambient_hack &&
4689 context->capability_ambient_set != 0) {
755d4b67 4690
16fcb191 4691 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4692 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4693 if (r < 0) {
4694 *exit_status = EXIT_CAPABILITIES;
12145637 4695 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4696 }
755d4b67 4697 }
5b6319dc 4698 }
165a31c0 4699 }
d35fbf6b 4700
56ef8db9
JB
4701 /* Apply working directory here, because the working directory might be on NFS and only the user running
4702 * this service might have the correct privilege to change to the working directory */
fa97f630 4703 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4704 if (r < 0)
4705 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4706
165a31c0 4707 if (needs_sandboxing) {
37ac2744 4708 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4709 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4710 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4711 * are restricted. */
4712
349cc4a5 4713#if HAVE_SELINUX
43b1f709 4714 if (use_selinux) {
5cd9cd35
LP
4715 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4716
4717 if (exec_context) {
4718 r = setexeccon(exec_context);
006d1864
TM
4719 if (r < 0) {
4720 if (!context->selinux_context_ignore) {
4721 *exit_status = EXIT_SELINUX_CONTEXT;
4722 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4723 }
4724 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4725 }
4726 }
4727 }
4728#endif
4729
349cc4a5 4730#if HAVE_APPARMOR
43b1f709 4731 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4732 r = aa_change_onexec(context->apparmor_profile);
4733 if (r < 0 && !context->apparmor_profile_ignore) {
4734 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4735 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4736 }
4737 }
4738#endif
4739
165a31c0 4740 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4741 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4742 * CAP_SETPCAP. */
4743 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4744 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4745 * effective set here.
4746 * The effective set is overwritten during execve with the following values:
4747 * - ambient set (for non-root processes)
4748 * - (inheritable | bounding) set (for root processes)
4749 *
4750 * Hence there is no security impact to raise it in the effective set before execve
4751 */
4752 r = capability_gain_cap_setpcap(NULL);
4753 if (r < 0) {
4754 *exit_status = EXIT_CAPABILITIES;
4755 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4756 }
755d4b67 4757 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4758 *exit_status = EXIT_SECUREBITS;
12145637 4759 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4760 }
dbdc4098 4761 }
5b6319dc 4762
59eeb84b 4763 if (context_has_no_new_privileges(context))
d35fbf6b 4764 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4765 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4766 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4767 }
4768
349cc4a5 4769#if HAVE_SECCOMP
469830d1
LP
4770 r = apply_address_families(unit, context);
4771 if (r < 0) {
4772 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4773 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4774 }
04aa0cb9 4775
469830d1
LP
4776 r = apply_memory_deny_write_execute(unit, context);
4777 if (r < 0) {
4778 *exit_status = EXIT_SECCOMP;
12145637 4779 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4780 }
f4170c67 4781
469830d1
LP
4782 r = apply_restrict_realtime(unit, context);
4783 if (r < 0) {
4784 *exit_status = EXIT_SECCOMP;
12145637 4785 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4786 }
4787
f69567cb
LP
4788 r = apply_restrict_suid_sgid(unit, context);
4789 if (r < 0) {
4790 *exit_status = EXIT_SECCOMP;
4791 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4792 }
4793
add00535
LP
4794 r = apply_restrict_namespaces(unit, context);
4795 if (r < 0) {
4796 *exit_status = EXIT_SECCOMP;
12145637 4797 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4798 }
4799
469830d1
LP
4800 r = apply_protect_sysctl(unit, context);
4801 if (r < 0) {
4802 *exit_status = EXIT_SECCOMP;
12145637 4803 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4804 }
4805
469830d1
LP
4806 r = apply_protect_kernel_modules(unit, context);
4807 if (r < 0) {
4808 *exit_status = EXIT_SECCOMP;
12145637 4809 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4810 }
4811
84703040
KK
4812 r = apply_protect_kernel_logs(unit, context);
4813 if (r < 0) {
4814 *exit_status = EXIT_SECCOMP;
4815 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4816 }
4817
fc64760d
KK
4818 r = apply_protect_clock(unit, context);
4819 if (r < 0) {
4820 *exit_status = EXIT_SECCOMP;
4821 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4822 }
4823
469830d1
LP
4824 r = apply_private_devices(unit, context);
4825 if (r < 0) {
4826 *exit_status = EXIT_SECCOMP;
12145637 4827 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4828 }
4829
4830 r = apply_syscall_archs(unit, context);
4831 if (r < 0) {
4832 *exit_status = EXIT_SECCOMP;
12145637 4833 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4834 }
4835
78e864e5
TM
4836 r = apply_lock_personality(unit, context);
4837 if (r < 0) {
4838 *exit_status = EXIT_SECCOMP;
12145637 4839 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4840 }
4841
9df2cdd8
TM
4842 r = apply_syscall_log(unit, context);
4843 if (r < 0) {
4844 *exit_status = EXIT_SECCOMP;
4845 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4846 }
4847
5cd9cd35
LP
4848 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4849 * by the filter as little as possible. */
165a31c0 4850 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4851 if (r < 0) {
4852 *exit_status = EXIT_SECCOMP;
12145637 4853 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4854 }
4855#endif
b1994387
ILG
4856
4857#if HAVE_LIBBPF
4858 r = apply_restrict_filesystems(unit, context);
4859 if (r < 0) {
4860 *exit_status = EXIT_BPF;
4861 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
4862 }
4863#endif
4864
d35fbf6b 4865 }
034c6ed7 4866
00819cc1
LP
4867 if (!strv_isempty(context->unset_environment)) {
4868 char **ee = NULL;
4869
4870 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4871 if (!ee) {
4872 *exit_status = EXIT_MEMORY;
12145637 4873 return log_oom();
00819cc1
LP
4874 }
4875
130d3d22 4876 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4877 }
4878
7ca69792
AZ
4879 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4880 replaced_argv = replace_env_argv(command->argv, accum_env);
4881 if (!replaced_argv) {
4882 *exit_status = EXIT_MEMORY;
4883 return log_oom();
4884 }
4885 final_argv = replaced_argv;
4886 } else
4887 final_argv = command->argv;
034c6ed7 4888
f1d34068 4889 if (DEBUG_LOGGING) {
c2b2df60 4890 _cleanup_free_ char *line = NULL;
81a2b7ce 4891
4ef15008 4892 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
4893 if (!line) {
4894 *exit_status = EXIT_MEMORY;
4895 return log_oom();
4896 }
4897
4898 log_unit_struct(unit, LOG_DEBUG,
4899 "EXECUTABLE=%s", executable,
4900 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 4901 }
dd305ec9 4902
5686391b
LP
4903 if (exec_fd >= 0) {
4904 uint8_t hot = 1;
4905
4906 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4907 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4908
4909 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4910 *exit_status = EXIT_EXEC;
4911 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4912 }
4913 }
4914
a6d9111c 4915 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4916
4917 if (exec_fd >= 0) {
4918 uint8_t hot = 0;
4919
4920 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4921 * that POLLHUP on it no longer means execve() succeeded. */
4922
4923 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4924 *exit_status = EXIT_EXEC;
4925 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4926 }
4927 }
12145637 4928
ff0af2a1 4929 *exit_status = EXIT_EXEC;
9f71ba8d 4930 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4931}
81a2b7ce 4932
34cf6c43 4933static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4934static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4935
f2341e0a
LP
4936int exec_spawn(Unit *unit,
4937 ExecCommand *command,
d35fbf6b
DM
4938 const ExecContext *context,
4939 const ExecParameters *params,
4940 ExecRuntime *runtime,
29206d46 4941 DynamicCreds *dcreds,
d35fbf6b 4942 pid_t *ret) {
8351ceae 4943
ee39ca20 4944 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4945 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4946 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4947 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4948 _cleanup_free_ char *line = NULL;
d35fbf6b 4949 pid_t pid;
8351ceae 4950
f2341e0a 4951 assert(unit);
d35fbf6b
DM
4952 assert(command);
4953 assert(context);
4954 assert(ret);
4955 assert(params);
25b583d7 4956 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4957
d35fbf6b
DM
4958 if (context->std_input == EXEC_INPUT_SOCKET ||
4959 context->std_output == EXEC_OUTPUT_SOCKET ||
4960 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4961
d85ff944
YW
4962 if (params->n_socket_fds > 1)
4963 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4964
d85ff944
YW
4965 if (params->n_socket_fds == 0)
4966 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4967
d35fbf6b
DM
4968 socket_fd = params->fds[0];
4969 } else {
4970 socket_fd = -1;
4971 fds = params->fds;
9b141911 4972 n_socket_fds = params->n_socket_fds;
25b583d7 4973 n_storage_fds = params->n_storage_fds;
d35fbf6b 4974 }
94f04347 4975
34cf6c43 4976 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4977 if (r < 0)
4978 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4979
f2341e0a 4980 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4981 if (r < 0)
f2341e0a 4982 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4983
4ef15008 4984 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
4985 if (!line)
4986 return log_oom();
fab56fc5 4987
9f71ba8d
ZJS
4988 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4989 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4990 mac_selinux_maybe_reload();
4991
c2503e35
RH
4992 log_unit_struct(unit, LOG_DEBUG,
4993 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4994 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4995 the mount namespace in the child, but we want to log
4996 from the parent, so we need to use the (possibly
4997 inaccurate) path here. */
4998 LOG_UNIT_INVOCATION_ID(unit));
12145637 4999
78f93209
LP
5000 if (params->cgroup_path) {
5001 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5002 if (r < 0)
5003 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5004 if (r > 0) { /* We are using a child cgroup */
5005 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5006 if (r < 0)
5007 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
5008
5009 /* Normally we would not propagate the oomd xattrs to children but since we created this
5010 * sub-cgroup internally we should do it. */
5011 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
5012 }
5013 }
5014
d35fbf6b
DM
5015 pid = fork();
5016 if (pid < 0)
74129a12 5017 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5018
5019 if (pid == 0) {
12145637 5020 int exit_status = EXIT_SUCCESS;
ff0af2a1 5021
f2341e0a
LP
5022 r = exec_child(unit,
5023 command,
ff0af2a1
LP
5024 context,
5025 params,
5026 runtime,
29206d46 5027 dcreds,
ff0af2a1 5028 socket_fd,
52c239d7 5029 named_iofds,
4c47affc 5030 fds,
9b141911 5031 n_socket_fds,
25b583d7 5032 n_storage_fds,
ff0af2a1 5033 files_env,
00d9ef85 5034 unit->manager->user_lookup_fds[1],
12145637
LP
5035 &exit_status);
5036
e1714f02
ZJS
5037 if (r < 0) {
5038 const char *status =
5039 exit_status_to_string(exit_status,
e04ed6db 5040 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5041
c2503e35
RH
5042 log_unit_struct_errno(unit, LOG_ERR, r,
5043 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5044 LOG_UNIT_INVOCATION_ID(unit),
5045 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5046 status, command->path),
5047 "EXECUTABLE=%s", command->path);
e1714f02 5048 }
4c2630eb 5049
ff0af2a1 5050 _exit(exit_status);
034c6ed7
LP
5051 }
5052
f2341e0a 5053 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5054
78f93209
LP
5055 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5056 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5057 * process will be killed too). */
5058 if (subcgroup_path)
5059 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5060
b58b4116 5061 exec_status_start(&command->exec_status, pid);
9fb86720 5062
034c6ed7 5063 *ret = pid;
5cb5a6ff
LP
5064 return 0;
5065}
5066
034c6ed7
LP
5067void exec_context_init(ExecContext *c) {
5068 assert(c);
5069
4c12626c 5070 c->umask = 0022;
5bead76e 5071 c->ioprio = ioprio_prio_value(IOPRIO_CLASS_BE, 0);
94f04347 5072 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5073 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5074 c->syslog_level_prefix = true;
353e12c2 5075 c->ignore_sigpipe = true;
3a43da28 5076 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5077 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5078 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5079 c->directories[t].mode = 0755;
12213aed 5080 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5081 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
5082 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5083 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5084 c->log_level_max = -1;
005bfaf1
TM
5085#if HAVE_SECCOMP
5086 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5087#endif
51462135
DDM
5088 c->tty_rows = UINT_MAX;
5089 c->tty_cols = UINT_MAX;
b070c7c0 5090 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5091}
5092
613b411c 5093void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5094 assert(c);
5095
6796073e
LP
5096 c->environment = strv_free(c->environment);
5097 c->environment_files = strv_free(c->environment_files);
b4c14404 5098 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5099 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5100
31ce987c 5101 rlimit_free_all(c->rlimit);
034c6ed7 5102
5b10116e 5103 for (size_t l = 0; l < 3; l++) {
52c239d7 5104 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5105 c->stdio_file[l] = mfree(c->stdio_file[l]);
5106 }
52c239d7 5107
a1e58e8e
LP
5108 c->working_directory = mfree(c->working_directory);
5109 c->root_directory = mfree(c->root_directory);
915e6d16 5110 c->root_image = mfree(c->root_image);
18d73705 5111 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5112 c->root_hash = mfree(c->root_hash);
5113 c->root_hash_size = 0;
5114 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5115 c->root_hash_sig = mfree(c->root_hash_sig);
5116 c->root_hash_sig_size = 0;
5117 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5118 c->root_verity = mfree(c->root_verity);
93f59701 5119 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
5120 c->tty_path = mfree(c->tty_path);
5121 c->syslog_identifier = mfree(c->syslog_identifier);
5122 c->user = mfree(c->user);
5123 c->group = mfree(c->group);
034c6ed7 5124
6796073e 5125 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5126
a1e58e8e 5127 c->pam_name = mfree(c->pam_name);
5b6319dc 5128
2a624c36
AP
5129 c->read_only_paths = strv_free(c->read_only_paths);
5130 c->read_write_paths = strv_free(c->read_write_paths);
5131 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5132 c->exec_paths = strv_free(c->exec_paths);
5133 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5134 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5135
d2d6c096 5136 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5137 c->bind_mounts = NULL;
5138 c->n_bind_mounts = 0;
2abd4e38
YW
5139 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5140 c->temporary_filesystems = NULL;
5141 c->n_temporary_filesystems = 0;
b3d13314 5142 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5143
0985c7c4 5144 cpu_set_reset(&c->cpu_set);
b070c7c0 5145 numa_policy_reset(&c->numa_policy);
86a3475b 5146
a1e58e8e
LP
5147 c->utmp_id = mfree(c->utmp_id);
5148 c->selinux_context = mfree(c->selinux_context);
5149 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5150 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5151
b1994387
ILG
5152 c->restrict_filesystems = set_free(c->restrict_filesystems);
5153
8cfa775f 5154 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5155 c->syscall_archs = set_free(c->syscall_archs);
5156 c->address_families = set_free(c->address_families);
e66cf1a3 5157
5b10116e 5158 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5159 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5160
5161 c->log_level_max = -1;
5162
5163 exec_context_free_log_extra_fields(c);
08f3be7a 5164
5ac1530e
ZJS
5165 c->log_ratelimit_interval_usec = 0;
5166 c->log_ratelimit_burst = 0;
90fc172e 5167
08f3be7a
LP
5168 c->stdin_data = mfree(c->stdin_data);
5169 c->stdin_data_size = 0;
a8d08f39
LP
5170
5171 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5172 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5173
5174 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5175
43144be4 5176 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5177 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5178}
5179
34cf6c43 5180int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5181 assert(c);
5182
5183 if (!runtime_prefix)
5184 return 0;
5185
211a3d87 5186 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5187 _cleanup_free_ char *p = NULL;
e66cf1a3 5188
494d0247 5189 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5190 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5191 else
211a3d87 5192 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5193 if (!p)
5194 return -ENOMEM;
5195
7bc4bf4a
LP
5196 /* We execute this synchronously, since we need to be sure this is gone when we start the
5197 * service next. */
c6878637 5198 (void) rm_rf(p, REMOVE_ROOT);
211a3d87
LB
5199
5200 char **symlink;
5201 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5202 _cleanup_free_ char *symlink_abs = NULL;
5203
5204 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5205 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5206 else
5207 symlink_abs = path_join(runtime_prefix, *symlink);
5208 if (!symlink_abs)
5209 return -ENOMEM;
5210
5211 (void) unlink(symlink_abs);
5212 }
5213
e66cf1a3
LP
5214 }
5215
5216 return 0;
5cb5a6ff
LP
5217}
5218
bb0c0d6f
LP
5219int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5220 _cleanup_free_ char *p = NULL;
5221
5222 assert(c);
5223
5224 if (!runtime_prefix || !unit)
5225 return 0;
5226
5227 p = path_join(runtime_prefix, "credentials", unit);
5228 if (!p)
5229 return -ENOMEM;
5230
5231 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5232 * unmount it, and afterwards remove the mount point */
5233 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5234 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5235
5236 return 0;
5237}
5238
34cf6c43 5239static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5240 assert(c);
5241
a1e58e8e 5242 c->path = mfree(c->path);
6796073e 5243 c->argv = strv_free(c->argv);
43d0fcbd
LP
5244}
5245
da6053d0 5246void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5247 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5248 exec_command_done(c+i);
5249}
5250
f1acf85a 5251ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5252 ExecCommand *i;
5253
5254 while ((i = c)) {
71fda00f 5255 LIST_REMOVE(command, c, i);
43d0fcbd 5256 exec_command_done(i);
5cb5a6ff
LP
5257 free(i);
5258 }
f1acf85a
ZJS
5259
5260 return NULL;
5cb5a6ff
LP
5261}
5262
da6053d0 5263void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5264 for (size_t i = 0; i < n; i++)
f1acf85a 5265 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5266}
5267
6a1d4d9f 5268void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5269 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5270 exec_status_reset(&c[i].exec_status);
5271}
5272
5273void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5274 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5275 ExecCommand *z;
5276
5277 LIST_FOREACH(command, z, c[i])
5278 exec_status_reset(&z->exec_status);
5279 }
5280}
5281
039f0e70 5282typedef struct InvalidEnvInfo {
34cf6c43 5283 const Unit *unit;
039f0e70
LP
5284 const char *path;
5285} InvalidEnvInfo;
5286
5287static void invalid_env(const char *p, void *userdata) {
5288 InvalidEnvInfo *info = userdata;
5289
f2341e0a 5290 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5291}
5292
52c239d7
LB
5293const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5294 assert(c);
5295
5296 switch (fd_index) {
5073ff6b 5297
52c239d7
LB
5298 case STDIN_FILENO:
5299 if (c->std_input != EXEC_INPUT_NAMED_FD)
5300 return NULL;
5073ff6b 5301
52c239d7 5302 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5303
52c239d7
LB
5304 case STDOUT_FILENO:
5305 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5306 return NULL;
5073ff6b 5307
52c239d7 5308 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5309
52c239d7
LB
5310 case STDERR_FILENO:
5311 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5312 return NULL;
5073ff6b 5313
52c239d7 5314 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5315
52c239d7
LB
5316 default:
5317 return NULL;
5318 }
5319}
5320
2caa38e9
LP
5321static int exec_context_named_iofds(
5322 const ExecContext *c,
5323 const ExecParameters *p,
5324 int named_iofds[static 3]) {
5325
5b10116e 5326 size_t targets;
56fbd561 5327 const char* stdio_fdname[3];
da6053d0 5328 size_t n_fds;
52c239d7
LB
5329
5330 assert(c);
5331 assert(p);
2caa38e9 5332 assert(named_iofds);
52c239d7
LB
5333
5334 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5335 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5336 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5337
5b10116e 5338 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5339 stdio_fdname[i] = exec_context_fdname(c, i);
5340
4c47affc
FB
5341 n_fds = p->n_storage_fds + p->n_socket_fds;
5342
5b10116e 5343 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5344 if (named_iofds[STDIN_FILENO] < 0 &&
5345 c->std_input == EXEC_INPUT_NAMED_FD &&
5346 stdio_fdname[STDIN_FILENO] &&
5347 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5348
52c239d7
LB
5349 named_iofds[STDIN_FILENO] = p->fds[i];
5350 targets--;
56fbd561
ZJS
5351
5352 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5353 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5354 stdio_fdname[STDOUT_FILENO] &&
5355 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5356
52c239d7
LB
5357 named_iofds[STDOUT_FILENO] = p->fds[i];
5358 targets--;
56fbd561
ZJS
5359
5360 } else if (named_iofds[STDERR_FILENO] < 0 &&
5361 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5362 stdio_fdname[STDERR_FILENO] &&
5363 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5364
52c239d7
LB
5365 named_iofds[STDERR_FILENO] = p->fds[i];
5366 targets--;
5367 }
5368
56fbd561 5369 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5370}
5371
34cf6c43 5372static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5373 char **i, **r = NULL;
5374
5375 assert(c);
5376 assert(l);
5377
5378 STRV_FOREACH(i, c->environment_files) {
5379 char *fn;
52511fae 5380 int k;
8c7be95e
LP
5381 bool ignore = false;
5382 char **p;
7fd1b19b 5383 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5384
5385 fn = *i;
5386
5387 if (fn[0] == '-') {
5388 ignore = true;
313cefa1 5389 fn++;
8c7be95e
LP
5390 }
5391
5392 if (!path_is_absolute(fn)) {
8c7be95e
LP
5393 if (ignore)
5394 continue;
5395
5396 strv_free(r);
5397 return -EINVAL;
5398 }
5399
2bef10ab 5400 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5401 k = safe_glob(fn, 0, &pglob);
5402 if (k < 0) {
2bef10ab
PL
5403 if (ignore)
5404 continue;
8c7be95e 5405
2bef10ab 5406 strv_free(r);
d8c92e8b 5407 return k;
2bef10ab 5408 }
8c7be95e 5409
d8c92e8b
ZJS
5410 /* When we don't match anything, -ENOENT should be returned */
5411 assert(pglob.gl_pathc > 0);
5412
5b10116e 5413 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5414 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5415 if (k < 0) {
5416 if (ignore)
5417 continue;
8c7be95e 5418
2bef10ab 5419 strv_free(r);
2bef10ab 5420 return k;
e9c1ea9d 5421 }
ebc05a09 5422 /* Log invalid environment variables with filename */
039f0e70
LP
5423 if (p) {
5424 InvalidEnvInfo info = {
f2341e0a 5425 .unit = unit,
039f0e70
LP
5426 .path = pglob.gl_pathv[n]
5427 };
5428
5429 p = strv_env_clean_with_callback(p, invalid_env, &info);
5430 }
8c7be95e 5431
234519ae 5432 if (!r)
2bef10ab
PL
5433 r = p;
5434 else {
5435 char **m;
8c7be95e 5436
4ab3d29f 5437 m = strv_env_merge(r, p);
2bef10ab
PL
5438 strv_free(r);
5439 strv_free(p);
c84a9488 5440 if (!m)
2bef10ab 5441 return -ENOMEM;
2bef10ab
PL
5442
5443 r = m;
5444 }
8c7be95e
LP
5445 }
5446 }
5447
5448 *l = r;
5449
5450 return 0;
5451}
5452
6ac8fdc9 5453static bool tty_may_match_dev_console(const char *tty) {
7b912648 5454 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5455
1e22b5cd
LP
5456 if (!tty)
5457 return true;
5458
a119ec7c 5459 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5460
5461 /* trivial identity? */
5462 if (streq(tty, "console"))
5463 return true;
5464
7b912648
LP
5465 if (resolve_dev_console(&resolved) < 0)
5466 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5467
5468 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5469 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5470}
5471
6c0ae739
LP
5472static bool exec_context_may_touch_tty(const ExecContext *ec) {
5473 assert(ec);
1e22b5cd 5474
6c0ae739 5475 return ec->tty_reset ||
1e22b5cd
LP
5476 ec->tty_vhangup ||
5477 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5478 is_terminal_input(ec->std_input) ||
5479 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5480 is_terminal_output(ec->std_error);
5481}
5482
5483bool exec_context_may_touch_console(const ExecContext *ec) {
5484
5485 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5486 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5487}
5488
15ae422b
LP
5489static void strv_fprintf(FILE *f, char **l) {
5490 char **g;
5491
5492 assert(f);
5493
5494 STRV_FOREACH(g, l)
5495 fprintf(f, " %s", *g);
5496}
5497
ddc155b2
TM
/* Prints "<prefix><name>: item item ..." followed by a newline to f. Nothing is written when the
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5509
34cf6c43 5510void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5291f26d 5511 char **e, **d;
add00535 5512 int r;
9eba9da4 5513
5cb5a6ff
LP
5514 assert(c);
5515 assert(f);
5516
4ad49000 5517 prefix = strempty(prefix);
5cb5a6ff
LP
5518
5519 fprintf(f,
94f04347
LP
5520 "%sUMask: %04o\n"
5521 "%sWorkingDirectory: %s\n"
451a074f 5522 "%sRootDirectory: %s\n"
15ae422b 5523 "%sNonBlocking: %s\n"
64747e2d 5524 "%sPrivateTmp: %s\n"
7f112f50 5525 "%sPrivateDevices: %s\n"
59eeb84b 5526 "%sProtectKernelTunables: %s\n"
e66a2f65 5527 "%sProtectKernelModules: %s\n"
84703040 5528 "%sProtectKernelLogs: %s\n"
fc64760d 5529 "%sProtectClock: %s\n"
59eeb84b 5530 "%sProtectControlGroups: %s\n"
d251207d
LP
5531 "%sPrivateNetwork: %s\n"
5532 "%sPrivateUsers: %s\n"
1b8689f9
LP
5533 "%sProtectHome: %s\n"
5534 "%sProtectSystem: %s\n"
5d997827 5535 "%sMountAPIVFS: %s\n"
f3e43635 5536 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5537 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5538 "%sRestrictRealtime: %s\n"
f69567cb 5539 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5540 "%sKeyringMode: %s\n"
4e399953
LP
5541 "%sProtectHostname: %s\n"
5542 "%sProtectProc: %s\n"
5543 "%sProcSubset: %s\n",
5cb5a6ff 5544 prefix, c->umask,
14eb3285
LP
5545 prefix, empty_to_root(c->working_directory),
5546 prefix, empty_to_root(c->root_directory),
15ae422b 5547 prefix, yes_no(c->non_blocking),
64747e2d 5548 prefix, yes_no(c->private_tmp),
7f112f50 5549 prefix, yes_no(c->private_devices),
59eeb84b 5550 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5551 prefix, yes_no(c->protect_kernel_modules),
84703040 5552 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5553 prefix, yes_no(c->protect_clock),
59eeb84b 5554 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5555 prefix, yes_no(c->private_network),
5556 prefix, yes_no(c->private_users),
1b8689f9
LP
5557 prefix, protect_home_to_string(c->protect_home),
5558 prefix, protect_system_to_string(c->protect_system),
5e98086d 5559 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5560 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5561 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5562 prefix, yes_no(c->restrict_realtime),
f69567cb 5563 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5564 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5565 prefix, yes_no(c->protect_hostname),
5566 prefix, protect_proc_to_string(c->protect_proc),
5567 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5568
915e6d16
LP
5569 if (c->root_image)
5570 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5571
18d73705
LB
5572 if (c->root_image_options) {
5573 MountOptions *o;
5574
5575 fprintf(f, "%sRootImageOptions:", prefix);
5576 LIST_FOREACH(mount_options, o, c->root_image_options)
5577 if (!isempty(o->options))
9ece6444
LB
5578 fprintf(f, " %s:%s",
5579 partition_designator_to_string(o->partition_designator),
5580 o->options);
18d73705
LB
5581 fprintf(f, "\n");
5582 }
5583
0389f4fa
LB
5584 if (c->root_hash) {
5585 _cleanup_free_ char *encoded = NULL;
5586 encoded = hexmem(c->root_hash, c->root_hash_size);
5587 if (encoded)
5588 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5589 }
5590
5591 if (c->root_hash_path)
5592 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5593
d4d55b0d
LB
5594 if (c->root_hash_sig) {
5595 _cleanup_free_ char *encoded = NULL;
5596 ssize_t len;
5597 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5598 if (len)
5599 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5600 }
5601
5602 if (c->root_hash_sig_path)
5603 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5604
0389f4fa
LB
5605 if (c->root_verity)
5606 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5607
8c7be95e
LP
5608 STRV_FOREACH(e, c->environment)
5609 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5610
5611 STRV_FOREACH(e, c->environment_files)
5612 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5613
b4c14404
FB
5614 STRV_FOREACH(e, c->pass_environment)
5615 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5616
00819cc1
LP
5617 STRV_FOREACH(e, c->unset_environment)
5618 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5619
53f47dfc
YW
5620 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5621
5b10116e 5622 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5623 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5624
211a3d87
LB
5625 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5626 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5627
5628 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5629 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5630 }
3536f49e 5631 }
c2bbd90b 5632
5291f26d 5633 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5634
fb33a393 5635 if (c->nice_set)
5291f26d 5636 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5637
dd6c17b1 5638 if (c->oom_score_adjust_set)
5291f26d 5639 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5640
ad21e542 5641 if (c->coredump_filter_set)
5291f26d 5642 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5643
5b10116e 5644 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5645 if (c->rlimit[i]) {
4c3a2b84 5646 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5647 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5648 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5649 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5650 }
94f04347 5651
f8b69d1d 5652 if (c->ioprio_set) {
1756a011 5653 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5654
5bead76e 5655 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5656 if (r >= 0)
5657 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5658
5bead76e 5659 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5660 }
94f04347 5661
f8b69d1d 5662 if (c->cpu_sched_set) {
1756a011 5663 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5664
837df140
YW
5665 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5666 if (r >= 0)
5667 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5668
94f04347 5669 fprintf(f,
38b48754
LP
5670 "%sCPUSchedulingPriority: %i\n"
5671 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5672 prefix, c->cpu_sched_priority,
5673 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5674 }
94f04347 5675
0985c7c4 5676 if (c->cpu_set.set) {
e7fca352
MS
5677 _cleanup_free_ char *affinity = NULL;
5678
5679 affinity = cpu_set_to_range_string(&c->cpu_set);
5680 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5681 }
5682
b070c7c0
MS
5683 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5684 _cleanup_free_ char *nodes = NULL;
5685
5686 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5687 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5688 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5689 }
5690
3a43da28 5691 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5692 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5693
5694 fprintf(f,
80876c20
LP
5695 "%sStandardInput: %s\n"
5696 "%sStandardOutput: %s\n"
5697 "%sStandardError: %s\n",
5698 prefix, exec_input_to_string(c->std_input),
5699 prefix, exec_output_to_string(c->std_output),
5700 prefix, exec_output_to_string(c->std_error));
5701
befc4a80
LP
5702 if (c->std_input == EXEC_INPUT_NAMED_FD)
5703 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5704 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5705 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5706 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5707 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5708
5709 if (c->std_input == EXEC_INPUT_FILE)
5710 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5711 if (c->std_output == EXEC_OUTPUT_FILE)
5712 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5713 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5714 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5715 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5716 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5717 if (c->std_error == EXEC_OUTPUT_FILE)
5718 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5719 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5720 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5721 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5722 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5723
80876c20
LP
5724 if (c->tty_path)
5725 fprintf(f,
6ea832a2
LP
5726 "%sTTYPath: %s\n"
5727 "%sTTYReset: %s\n"
5728 "%sTTYVHangup: %s\n"
51462135
DDM
5729 "%sTTYVTDisallocate: %s\n"
5730 "%sTTYRows: %u\n"
5731 "%sTTYColumns: %u\n",
6ea832a2
LP
5732 prefix, c->tty_path,
5733 prefix, yes_no(c->tty_reset),
5734 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5735 prefix, yes_no(c->tty_vt_disallocate),
5736 prefix, c->tty_rows,
5737 prefix, c->tty_cols);
94f04347 5738
9f6444eb 5739 if (IN_SET(c->std_output,
9f6444eb
LP
5740 EXEC_OUTPUT_KMSG,
5741 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5742 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5743 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5744 IN_SET(c->std_error,
9f6444eb
LP
5745 EXEC_OUTPUT_KMSG,
5746 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5747 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5748 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5749
5ce70e5b 5750 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5751
837df140
YW
5752 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5753 if (r >= 0)
5754 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5755
837df140
YW
5756 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5757 if (r >= 0)
5758 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5759 }
94f04347 5760
d3070fbd
LP
5761 if (c->log_level_max >= 0) {
5762 _cleanup_free_ char *t = NULL;
5763
5764 (void) log_level_to_string_alloc(c->log_level_max, &t);
5765
5766 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5767 }
5768
5291f26d 5769 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5770 fprintf(f,
5771 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5772 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5773
5ac1530e
ZJS
5774 if (c->log_ratelimit_burst > 0)
5775 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5776
5b10116e
ZJS
5777 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5778 fprintf(f, "%sLogExtraFields: ", prefix);
5779 fwrite(c->log_extra_fields[j].iov_base,
5780 1, c->log_extra_fields[j].iov_len,
5781 f);
5782 fputc('\n', f);
d3070fbd
LP
5783 }
5784
91dd5f7c
LP
5785 if (c->log_namespace)
5786 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5787
07d46372
YW
5788 if (c->secure_bits) {
5789 _cleanup_free_ char *str = NULL;
5790
5791 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5792 if (r >= 0)
5793 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5794 }
94f04347 5795
a103496c 5796 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5797 _cleanup_free_ char *str = NULL;
94f04347 5798
dd1f5bd0
YW
5799 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5800 if (r >= 0)
5801 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5802 }
5803
5804 if (c->capability_ambient_set != 0) {
dd1f5bd0 5805 _cleanup_free_ char *str = NULL;
755d4b67 5806
dd1f5bd0
YW
5807 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5808 if (r >= 0)
5809 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5810 }
5811
5812 if (c->user)
f2d3769a 5813 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5814 if (c->group)
f2d3769a 5815 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5816
29206d46
LP
5817 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5818
ddc155b2 5819 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5820
5b6319dc 5821 if (c->pam_name)
f2d3769a 5822 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5823
ddc155b2
TM
5824 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5825 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5826 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5827 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5828 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 5829 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 5830
5b10116e
ZJS
5831 for (size_t i = 0; i < c->n_bind_mounts; i++)
5832 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5833 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5834 c->bind_mounts[i].ignore_enoent ? "-": "",
5835 c->bind_mounts[i].source,
5836 c->bind_mounts[i].destination,
5837 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5838
5b10116e
ZJS
5839 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5840 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5841
5b10116e
ZJS
5842 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5843 t->path,
5844 isempty(t->options) ? "" : ":",
5845 strempty(t->options));
5846 }
2abd4e38 5847
169c1bda
LP
5848 if (c->utmp_id)
5849 fprintf(f,
5850 "%sUtmpIdentifier: %s\n",
5851 prefix, c->utmp_id);
7b52a628
MS
5852
5853 if (c->selinux_context)
5854 fprintf(f,
5f8640fb
LP
5855 "%sSELinuxContext: %s%s\n",
5856 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5857
80c21aea
WC
5858 if (c->apparmor_profile)
5859 fprintf(f,
5860 "%sAppArmorProfile: %s%s\n",
5861 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5862
5863 if (c->smack_process_label)
5864 fprintf(f,
5865 "%sSmackProcessLabel: %s%s\n",
5866 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5867
050f7277 5868 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5869 fprintf(f,
5870 "%sPersonality: %s\n",
5871 prefix, strna(personality_to_string(c->personality)));
5872
78e864e5
TM
5873 fprintf(f,
5874 "%sLockPersonality: %s\n",
5875 prefix, yes_no(c->lock_personality));
5876
17df7223 5877 if (c->syscall_filter) {
349cc4a5 5878#if HAVE_SECCOMP
8cfa775f 5879 void *id, *val;
17df7223 5880 bool first = true;
351a19b1 5881#endif
17df7223
LP
5882
5883 fprintf(f,
57183d11 5884 "%sSystemCallFilter: ",
17df7223
LP
5885 prefix);
5886
6b000af4 5887 if (!c->syscall_allow_list)
17df7223
LP
5888 fputc('~', f);
5889
349cc4a5 5890#if HAVE_SECCOMP
90e74a66 5891 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5892 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5893 const char *errno_name = NULL;
5894 int num = PTR_TO_INT(val);
17df7223
LP
5895
5896 if (first)
5897 first = false;
5898 else
5899 fputc(' ', f);
5900
57183d11 5901 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5902 fputs(strna(name), f);
8cfa775f
YW
5903
5904 if (num >= 0) {
005bfaf1 5905 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5906 if (errno_name)
5907 fprintf(f, ":%s", errno_name);
5908 else
5909 fprintf(f, ":%d", num);
5910 }
17df7223 5911 }
351a19b1 5912#endif
17df7223
LP
5913
5914 fputc('\n', f);
5915 }
5916
57183d11 5917 if (c->syscall_archs) {
349cc4a5 5918#if HAVE_SECCOMP
57183d11
LP
5919 void *id;
5920#endif
5921
5922 fprintf(f,
5923 "%sSystemCallArchitectures:",
5924 prefix);
5925
349cc4a5 5926#if HAVE_SECCOMP
90e74a66 5927 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5928 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5929#endif
5930 fputc('\n', f);
5931 }
5932
add00535
LP
5933 if (exec_context_restrict_namespaces_set(c)) {
5934 _cleanup_free_ char *s = NULL;
5935
86c2a9f1 5936 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5937 if (r >= 0)
5938 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5939 prefix, strna(s));
add00535
LP
5940 }
5941
b1994387
ILG
5942#if HAVE_LIBBPF
5943 if (exec_context_restrict_filesystems_set(c))
5944 SET_FOREACH(e, c->restrict_filesystems)
5945 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, *e);
5946#endif
5947
a8d08f39
LP
5948 if (c->network_namespace_path)
5949 fprintf(f,
5950 "%sNetworkNamespacePath: %s\n",
5951 prefix, c->network_namespace_path);
5952
3df90f24 5953 if (c->syscall_errno > 0) {
005bfaf1 5954#if HAVE_SECCOMP
3df90f24 5955 const char *errno_name;
005bfaf1 5956#endif
3df90f24
YW
5957
5958 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5959
005bfaf1
TM
5960#if HAVE_SECCOMP
5961 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5962 if (errno_name)
005bfaf1 5963 fputs(errno_name, f);
3df90f24 5964 else
005bfaf1
TM
5965 fprintf(f, "%d", c->syscall_errno);
5966#endif
5967 fputc('\n', f);
3df90f24 5968 }
b3d13314 5969
5b10116e 5970 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5971 MountOptions *o;
5972
79e20ceb 5973 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5974 c->mount_images[i].ignore_enoent ? "-": "",
5975 c->mount_images[i].source,
79e20ceb 5976 c->mount_images[i].destination);
427353f6 5977 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5978 fprintf(f, ":%s:%s",
427353f6 5979 partition_designator_to_string(o->partition_designator),
79e20ceb 5980 strempty(o->options));
427353f6
LB
5981 fprintf(f, "\n");
5982 }
93f59701
LB
5983
5984 for (size_t i = 0; i < c->n_extension_images; i++) {
5985 MountOptions *o;
5986
5987 fprintf(f, "%sExtensionImages: %s%s", prefix,
5988 c->extension_images[i].ignore_enoent ? "-": "",
5989 c->extension_images[i].source);
5990 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5991 fprintf(f, ":%s:%s",
5992 partition_designator_to_string(o->partition_designator),
5993 strempty(o->options));
5994 fprintf(f, "\n");
5995 }
5cb5a6ff
LP
5996}
5997
34cf6c43 5998bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
5999 assert(c);
6000
61233823 6001 /* Returns true if the process forked off would run under
a931ad47
LP
6002 * an unchanged UID or as root. */
6003
6004 if (!c->user)
6005 return true;
6006
6007 if (streq(c->user, "root") || streq(c->user, "0"))
6008 return true;
6009
6010 return false;
6011}
6012
34cf6c43 6013int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6014 int p;
6015
6016 assert(c);
6017
6018 if (c->ioprio_set)
6019 return c->ioprio;
6020
6021 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6022 if (p < 0)
5bead76e 6023 return ioprio_prio_value(IOPRIO_CLASS_BE, 4);
7f452159
LP
6024
6025 return p;
6026}
6027
5e98086d
ZJS
6028bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6029 assert(c);
6030
61198784 6031 /* Explicit setting wins */
5e98086d
ZJS
6032 if (c->mount_apivfs_set)
6033 return c->mount_apivfs;
6034
61198784 6035 /* Default to "yes" if root directory or image are specified */
74e12520 6036 if (exec_context_with_rootfs(c))
61198784
ZJS
6037 return true;
6038
5e98086d
ZJS
6039 return false;
6040}
6041
d3070fbd 6042void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6043 assert(c);
6044
5b10116e 6045 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6046 free(c->log_extra_fields[l].iov_base);
6047 c->log_extra_fields = mfree(c->log_extra_fields);
6048 c->n_log_extra_fields = 0;
6049}
6050
6f765baf 6051void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
6052 _cleanup_close_ int fd = -1;
6053 const char *path;
6054 struct stat st;
6f765baf
LP
6055 int r;
6056
6057 assert(c);
6058
6059 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6060 exec_context_tty_reset(c, NULL);
6061
6062 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6063 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6064 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6065 if (!exec_context_may_touch_tty(c))
6066 return;
6f765baf 6067
0ba976e8
LP
6068 path = exec_context_tty_path(c);
6069 if (!path)
6070 return;
6f765baf 6071
0ba976e8
LP
6072 fd = open(path, O_PATH|O_CLOEXEC);
6073 if (fd < 0)
6074 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6075 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6076 path);
6077
6078 if (fstat(fd, &st) < 0)
6079 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6080
6081 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6082 * if things are a character device, since a proper check either means we'd have to open the TTY and
6083 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6084 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6085 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6086 if (!S_ISCHR(st.st_mode))
6087 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6088
6089 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6090 if (r < 0)
6091 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6092}
6093
4c2f5842
LP
6094int exec_context_get_clean_directories(
6095 ExecContext *c,
6096 char **prefix,
6097 ExecCleanMask mask,
6098 char ***ret) {
6099
6100 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6101 int r;
6102
6103 assert(c);
6104 assert(prefix);
6105 assert(ret);
6106
5b10116e 6107 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6108 if (!FLAGS_SET(mask, 1U << t))
6109 continue;
6110
6111 if (!prefix[t])
6112 continue;
6113
211a3d87 6114 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6115 char *j;
6116
211a3d87 6117 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6118 if (!j)
6119 return -ENOMEM;
6120
6121 r = strv_consume(&l, j);
6122 if (r < 0)
6123 return r;
7f622a19
YW
6124
6125 /* Also remove private directories unconditionally. */
6126 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6127 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6128 if (!j)
6129 return -ENOMEM;
6130
6131 r = strv_consume(&l, j);
6132 if (r < 0)
6133 return r;
6134 }
6135
6136 char **symlink;
6137 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6138 j = path_join(prefix[t], *symlink);
7f622a19
YW
6139 if (!j)
6140 return -ENOMEM;
6141
6142 r = strv_consume(&l, j);
6143 if (r < 0)
6144 return r;
6145 }
4c2f5842
LP
6146 }
6147 }
6148
6149 *ret = TAKE_PTR(l);
6150 return 0;
6151}
6152
6153int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6154 ExecCleanMask mask = 0;
6155
6156 assert(c);
6157 assert(ret);
6158
6159 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6160 if (c->directories[t].n_items > 0)
4c2f5842
LP
6161 mask |= 1U << t;
6162
6163 *ret = mask;
6164 return 0;
6165}
6166
b58b4116 6167void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6168 assert(s);
5cb5a6ff 6169
2ed26ed0
LP
6170 *s = (ExecStatus) {
6171 .pid = pid,
6172 };
6173
b58b4116
LP
6174 dual_timestamp_get(&s->start_timestamp);
6175}
6176
34cf6c43 6177void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6178 assert(s);
6179
d46b79bb 6180 if (s->pid != pid)
2ed26ed0
LP
6181 *s = (ExecStatus) {
6182 .pid = pid,
6183 };
b58b4116 6184
63983207 6185 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6186
034c6ed7
LP
6187 s->code = code;
6188 s->status = status;
169c1bda 6189
6f765baf
LP
6190 if (context && context->utmp_id)
6191 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6192}
6193
6a1d4d9f
LP
6194void exec_status_reset(ExecStatus *s) {
6195 assert(s);
6196
6197 *s = (ExecStatus) {};
6198}
6199
34cf6c43 6200void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6201 assert(s);
6202 assert(f);
6203
9fb86720
LP
6204 if (s->pid <= 0)
6205 return;
6206
4c940960
LP
6207 prefix = strempty(prefix);
6208
9fb86720 6209 fprintf(f,
ccd06097
ZJS
6210 "%sPID: "PID_FMT"\n",
6211 prefix, s->pid);
9fb86720 6212
af9d16e1 6213 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6214 fprintf(f,
6215 "%sStart Timestamp: %s\n",
04f5c018 6216 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6217
af9d16e1 6218 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6219 fprintf(f,
6220 "%sExit Timestamp: %s\n"
6221 "%sExit Code: %s\n"
6222 "%sExit Status: %i\n",
04f5c018 6223 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6224 prefix, sigchld_code_to_string(s->code),
6225 prefix, s->status);
5cb5a6ff 6226}
44d8db9e 6227
34cf6c43 6228static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6229 _cleanup_free_ char *cmd = NULL;
4c940960 6230 const char *prefix2;
44d8db9e
LP
6231
6232 assert(c);
6233 assert(f);
6234
4c940960 6235 prefix = strempty(prefix);
63c372cb 6236 prefix2 = strjoina(prefix, "\t");
44d8db9e 6237
4ef15008 6238 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
44d8db9e
LP
6239 fprintf(f,
6240 "%sCommand Line: %s\n",
7c248223 6241 prefix, cmd ?: strerror_safe(ENOMEM));
44d8db9e 6242
9fb86720 6243 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6244}
6245
6246void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6247 assert(f);
6248
4c940960 6249 prefix = strempty(prefix);
44d8db9e
LP
6250
6251 LIST_FOREACH(command, c, c)
6252 exec_command_dump(c, f, prefix);
6253}
94f04347 6254
a6a80b4f
LP
6255void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6256 ExecCommand *end;
6257
6258 assert(l);
6259 assert(e);
6260
6261 if (*l) {
35b8ca3a 6262 /* It's kind of important, that we keep the order here */
71fda00f
LP
6263 LIST_FIND_TAIL(command, *l, end);
6264 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6265 } else
6266 *l = e;
6267}
6268
26fd040d
LP
6269int exec_command_set(ExecCommand *c, const char *path, ...) {
6270 va_list ap;
6271 char **l, *p;
6272
6273 assert(c);
6274 assert(path);
6275
6276 va_start(ap, path);
6277 l = strv_new_ap(path, ap);
6278 va_end(ap);
6279
6280 if (!l)
6281 return -ENOMEM;
6282
250a918d
LP
6283 p = strdup(path);
6284 if (!p) {
26fd040d
LP
6285 strv_free(l);
6286 return -ENOMEM;
6287 }
6288
6897dfe8 6289 free_and_replace(c->path, p);
26fd040d 6290
130d3d22 6291 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6292}
6293
86b23b07 6294int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6295 _cleanup_strv_free_ char **l = NULL;
86b23b07 6296 va_list ap;
86b23b07
JS
6297 int r;
6298
6299 assert(c);
6300 assert(path);
6301
6302 va_start(ap, path);
6303 l = strv_new_ap(path, ap);
6304 va_end(ap);
6305
6306 if (!l)
6307 return -ENOMEM;
6308
e287086b 6309 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6310 if (r < 0)
86b23b07 6311 return r;
86b23b07
JS
6312
6313 return 0;
6314}
6315
e8a565cb
YW
6316static void *remove_tmpdir_thread(void *p) {
6317 _cleanup_free_ char *path = p;
86b23b07 6318
e8a565cb
YW
6319 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6320 return NULL;
6321}
6322
6323static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6324 int r;
6325
6326 if (!rt)
6327 return NULL;
6328
6329 if (rt->manager)
6330 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6331
6332 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6333
6334 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6335 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6336
6337 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6338 if (r < 0)
e8a565cb 6339 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6340 else
6341 rt->tmp_dir = NULL;
e8a565cb 6342 }
613b411c 6343
56a13a49 6344 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6345 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6346
6347 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6348 if (r < 0)
e8a565cb 6349 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6350 else
6351 rt->var_tmp_dir = NULL;
e8a565cb
YW
6352 }
6353
6354 rt->id = mfree(rt->id);
6355 rt->tmp_dir = mfree(rt->tmp_dir);
6356 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6357 safe_close_pair(rt->netns_storage_socket);
a70581ff 6358 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6359 return mfree(rt);
6360}
6361
6362static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6363 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6364}
6365
56a13a49
ZJS
6366static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6367 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6368 ExecRuntime *n;
613b411c 6369
8e8009dc 6370 assert(ret);
613b411c 6371
56a13a49
ZJS
6372 id_copy = strdup(id);
6373 if (!id_copy)
6374 return -ENOMEM;
6375
8e8009dc
LP
6376 n = new(ExecRuntime, 1);
6377 if (!n)
613b411c
LP
6378 return -ENOMEM;
6379
8e8009dc 6380 *n = (ExecRuntime) {
56a13a49 6381 .id = TAKE_PTR(id_copy),
8e8009dc 6382 .netns_storage_socket = { -1, -1 },
a70581ff 6383 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6384 };
6385
6386 *ret = n;
613b411c
LP
6387 return 0;
6388}
6389
e8a565cb
YW
6390static int exec_runtime_add(
6391 Manager *m,
6392 const char *id,
56a13a49
ZJS
6393 char **tmp_dir,
6394 char **var_tmp_dir,
6395 int netns_storage_socket[2],
a70581ff 6396 int ipcns_storage_socket[2],
e8a565cb
YW
6397 ExecRuntime **ret) {
6398
6399 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6400 int r;
6401
e8a565cb 6402 assert(m);
613b411c
LP
6403 assert(id);
6404
a70581ff 6405 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6406
56a13a49 6407 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6408 if (r < 0)
6409 return r;
6410
63083706 6411 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6412 if (r < 0)
6413 return r;
e8a565cb 6414
56a13a49
ZJS
6415 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6416 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6417 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6418
6419 if (netns_storage_socket) {
56a13a49
ZJS
6420 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6421 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6422 }
6423
a70581ff
XR
6424 if (ipcns_storage_socket) {
6425 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6426 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6427 }
6428
e8a565cb
YW
6429 rt->manager = m;
6430
6431 if (ret)
6432 *ret = rt;
e8a565cb 6433 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6434 TAKE_PTR(rt);
e8a565cb
YW
6435 return 0;
6436}
6437
74aaf59b
LP
6438static int exec_runtime_make(
6439 Manager *m,
6440 const ExecContext *c,
6441 const char *id,
6442 ExecRuntime **ret) {
6443
56a13a49 6444 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6445 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6446 int r;
6447
6448 assert(m);
6449 assert(c);
6450 assert(id);
6451
6452 /* It is not necessary to create ExecRuntime object. */
a70581ff 6453 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6454 *ret = NULL;
e8a565cb 6455 return 0;
74aaf59b 6456 }
e8a565cb 6457
efa2f3a1
TM
6458 if (c->private_tmp &&
6459 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6460 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6461 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6462 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6463 if (r < 0)
6464 return r;
6465 }
6466
a8d08f39 6467 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6468 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6469 return -errno;
6470 }
6471
a70581ff
XR
6472 if (c->private_ipc || c->ipc_namespace_path) {
6473 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6474 return -errno;
6475 }
6476
6477 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6478 if (r < 0)
6479 return r;
6480
613b411c
LP
6481 return 1;
6482}
6483
e8a565cb
YW
6484int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6485 ExecRuntime *rt;
6486 int r;
613b411c 6487
e8a565cb
YW
6488 assert(m);
6489 assert(id);
6490 assert(ret);
6491
6492 rt = hashmap_get(m->exec_runtime_by_id, id);
6493 if (rt)
387f6955 6494 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6495 goto ref;
6496
74aaf59b
LP
6497 if (!create) {
6498 *ret = NULL;
e8a565cb 6499 return 0;
74aaf59b 6500 }
e8a565cb
YW
6501
6502 /* If not found, then create a new object. */
6503 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6504 if (r < 0)
e8a565cb 6505 return r;
74aaf59b
LP
6506 if (r == 0) {
6507 /* When r == 0, it is not necessary to create ExecRuntime object. */
6508 *ret = NULL;
6509 return 0;
6510 }
613b411c 6511
e8a565cb
YW
6512ref:
6513 /* increment reference counter. */
6514 rt->n_ref++;
6515 *ret = rt;
6516 return 1;
6517}
613b411c 6518
e8a565cb
YW
6519ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6520 if (!rt)
613b411c
LP
6521 return NULL;
6522
e8a565cb 6523 assert(rt->n_ref > 0);
613b411c 6524
e8a565cb
YW
6525 rt->n_ref--;
6526 if (rt->n_ref > 0)
f2341e0a
LP
6527 return NULL;
6528
e8a565cb 6529 return exec_runtime_free(rt, destroy);
613b411c
LP
6530}
6531
e8a565cb
YW
6532int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6533 ExecRuntime *rt;
e8a565cb
YW
6534
6535 assert(m);
613b411c
LP
6536 assert(f);
6537 assert(fds);
6538
90e74a66 6539 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6540 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6541
e8a565cb
YW
6542 if (rt->tmp_dir)
6543 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6544
e8a565cb
YW
6545 if (rt->var_tmp_dir)
6546 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6547
e8a565cb
YW
6548 if (rt->netns_storage_socket[0] >= 0) {
6549 int copy;
613b411c 6550
e8a565cb
YW
6551 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6552 if (copy < 0)
6553 return copy;
613b411c 6554
e8a565cb
YW
6555 fprintf(f, " netns-socket-0=%i", copy);
6556 }
613b411c 6557
e8a565cb
YW
6558 if (rt->netns_storage_socket[1] >= 0) {
6559 int copy;
613b411c 6560
e8a565cb
YW
6561 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6562 if (copy < 0)
6563 return copy;
613b411c 6564
e8a565cb
YW
6565 fprintf(f, " netns-socket-1=%i", copy);
6566 }
6567
a70581ff
XR
6568 if (rt->ipcns_storage_socket[0] >= 0) {
6569 int copy;
6570
6571 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6572 if (copy < 0)
6573 return copy;
6574
6575 fprintf(f, " ipcns-socket-0=%i", copy);
6576 }
6577
6578 if (rt->ipcns_storage_socket[1] >= 0) {
6579 int copy;
6580
6581 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6582 if (copy < 0)
6583 return copy;
6584
6585 fprintf(f, " ipcns-socket-1=%i", copy);
6586 }
6587
e8a565cb 6588 fputc('\n', f);
613b411c
LP
6589 }
6590
6591 return 0;
6592}
6593
e8a565cb
YW
6594int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6595 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6596 ExecRuntime *rt;
613b411c
LP
6597 int r;
6598
e8a565cb
YW
6599 /* This is for the migration from old (v237 or earlier) deserialization text.
6600 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6601 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6602 * so or not from the serialized text, then we always creates a new object owned by this. */
6603
6604 assert(u);
613b411c
LP
6605 assert(key);
6606 assert(value);
6607
e8a565cb
YW
6608 /* Manager manages ExecRuntime objects by the unit id.
6609 * So, we omit the serialized text when the unit does not have id (yet?)... */
6610 if (isempty(u->id)) {
6611 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6612 return 0;
6613 }
613b411c 6614
cbc165d1
ZJS
6615 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6616 return log_oom();
e8a565cb
YW
6617
6618 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6619 if (!rt) {
cbc165d1 6620 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6621 return log_oom();
613b411c 6622
e8a565cb
YW
6623 rt = rt_create;
6624 }
6625
6626 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6627 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6628 return -ENOMEM;
613b411c
LP
6629
6630 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6631 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6632 return -ENOMEM;
613b411c
LP
6633
6634 } else if (streq(key, "netns-socket-0")) {
6635 int fd;
6636
e8a565cb 6637 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6638 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6639 return 0;
613b411c 6640 }
e8a565cb
YW
6641
6642 safe_close(rt->netns_storage_socket[0]);
6643 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6644
613b411c
LP
6645 } else if (streq(key, "netns-socket-1")) {
6646 int fd;
6647
e8a565cb 6648 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6649 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6650 return 0;
613b411c 6651 }
e8a565cb
YW
6652
6653 safe_close(rt->netns_storage_socket[1]);
6654 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6655
613b411c
LP
6656 } else
6657 return 0;
6658
e8a565cb
YW
6659 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6660 if (rt_create) {
6661 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6662 if (r < 0) {
3fe91079 6663 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6664 return 0;
6665 }
613b411c 6666
e8a565cb 6667 rt_create->manager = u->manager;
613b411c 6668
e8a565cb 6669 /* Avoid cleanup */
56a13a49 6670 TAKE_PTR(rt_create);
e8a565cb 6671 }
98b47d54 6672
e8a565cb
YW
6673 return 1;
6674}
613b411c 6675
56a13a49
ZJS
6676int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6677 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6678 char *id = NULL;
a70581ff 6679 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6680 const char *p, *v = value;
6681 size_t n;
613b411c 6682
e8a565cb
YW
6683 assert(m);
6684 assert(value);
6685 assert(fds);
98b47d54 6686
e8a565cb 6687 n = strcspn(v, " ");
2f82562b 6688 id = strndupa_safe(v, n);
e8a565cb
YW
6689 if (v[n] != ' ')
6690 goto finalize;
6691 p = v + n + 1;
6692
6693 v = startswith(p, "tmp-dir=");
6694 if (v) {
6695 n = strcspn(v, " ");
56a13a49
ZJS
6696 tmp_dir = strndup(v, n);
6697 if (!tmp_dir)
6698 return log_oom();
e8a565cb
YW
6699 if (v[n] != ' ')
6700 goto finalize;
6701 p = v + n + 1;
6702 }
6703
6704 v = startswith(p, "var-tmp-dir=");
6705 if (v) {
6706 n = strcspn(v, " ");
56a13a49
ZJS
6707 var_tmp_dir = strndup(v, n);
6708 if (!var_tmp_dir)
6709 return log_oom();
e8a565cb
YW
6710 if (v[n] != ' ')
6711 goto finalize;
6712 p = v + n + 1;
6713 }
6714
6715 v = startswith(p, "netns-socket-0=");
6716 if (v) {
6717 char *buf;
6718
6719 n = strcspn(v, " ");
2f82562b 6720 buf = strndupa_safe(v, n);
c413bb28 6721
a70581ff 6722 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6723 if (r < 0)
6724 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6725 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6726 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6727 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6728 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6729 if (v[n] != ' ')
6730 goto finalize;
6731 p = v + n + 1;
613b411c
LP
6732 }
6733
e8a565cb
YW
6734 v = startswith(p, "netns-socket-1=");
6735 if (v) {
6736 char *buf;
98b47d54 6737
e8a565cb 6738 n = strcspn(v, " ");
2f82562b 6739 buf = strndupa_safe(v, n);
a70581ff
XR
6740
6741 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6742 if (r < 0)
6743 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6744 if (!fdset_contains(fds, netns_fdpair[1]))
6745 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6746 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6747 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6748 if (v[n] != ' ')
6749 goto finalize;
6750 p = v + n + 1;
6751 }
6752
6753 v = startswith(p, "ipcns-socket-0=");
6754 if (v) {
6755 char *buf;
6756
6757 n = strcspn(v, " ");
2f82562b 6758 buf = strndupa_safe(v, n);
a70581ff
XR
6759
6760 r = safe_atoi(buf, &ipcns_fdpair[0]);
6761 if (r < 0)
6762 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6763 if (!fdset_contains(fds, ipcns_fdpair[0]))
6764 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6765 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6766 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6767 if (v[n] != ' ')
6768 goto finalize;
6769 p = v + n + 1;
6770 }
6771
6772 v = startswith(p, "ipcns-socket-1=");
6773 if (v) {
6774 char *buf;
6775
6776 n = strcspn(v, " ");
2f82562b 6777 buf = strndupa_safe(v, n);
a70581ff
XR
6778
6779 r = safe_atoi(buf, &ipcns_fdpair[1]);
6780 if (r < 0)
6781 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6782 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6783 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6784 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6785 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6786 }
98b47d54 6787
e8a565cb 6788finalize:
a70581ff 6789 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6790 if (r < 0)
56a13a49
ZJS
6791 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6792 return 0;
e8a565cb 6793}
613b411c 6794
e8a565cb
YW
6795void exec_runtime_vacuum(Manager *m) {
6796 ExecRuntime *rt;
e8a565cb
YW
6797
6798 assert(m);
6799
6800 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6801
90e74a66 6802 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6803 if (rt->n_ref > 0)
6804 continue;
6805
6806 (void) exec_runtime_free(rt, false);
6807 }
613b411c
LP
6808}
6809
b9c04eaf
YW
6810void exec_params_clear(ExecParameters *p) {
6811 if (!p)
6812 return;
6813
c3f8a065
LP
6814 p->environment = strv_free(p->environment);
6815 p->fd_names = strv_free(p->fd_names);
6816 p->fds = mfree(p->fds);
6817 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6818}
6819
bb0c0d6f
LP
6820ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6821 if (!sc)
6822 return NULL;
6823
6824 free(sc->id);
6825 free(sc->data);
6826 return mfree(sc);
6827}
6828
43144be4
LP
6829ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6830 if (!lc)
6831 return NULL;
6832
6833 free(lc->id);
6834 free(lc->path);
6835 return mfree(lc);
6836}
6837
211a3d87
LB
6838void exec_directory_done(ExecDirectory *d) {
6839 if (!d)
6840 return;
6841
6842 for (size_t i = 0; i < d->n_items; i++) {
6843 free(d->items[i].path);
6844 strv_free(d->items[i].symlinks);
6845 }
6846
6847 d->items = mfree(d->items);
6848 d->n_items = 0;
6849 d->mode = 0755;
6850}
6851
6852int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
6853 _cleanup_strv_free_ char **s = NULL;
6854 _cleanup_free_ char *p = NULL;
6855
6856 assert(d);
6857 assert(n);
6858 assert(path);
6859
6860 p = strdup(path);
6861 if (!p)
6862 return -ENOMEM;
6863
6864 if (symlinks) {
6865 s = strv_copy(symlinks);
6866 if (!s)
6867 return -ENOMEM;
6868 }
6869
6870 if (!GREEDY_REALLOC(*d, *n + 1))
6871 return -ENOMEM;
6872
6873 (*d)[(*n) ++] = (ExecDirectoryItem) {
6874 .path = TAKE_PTR(p),
6875 .symlinks = TAKE_PTR(s),
6876 };
6877
6878 return 0;
6879}
6880
bb0c0d6f 6881DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 6882DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 6883
80876c20
LP
6884static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6885 [EXEC_INPUT_NULL] = "null",
6886 [EXEC_INPUT_TTY] = "tty",
6887 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6888 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6889 [EXEC_INPUT_SOCKET] = "socket",
6890 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6891 [EXEC_INPUT_DATA] = "data",
2038c3f5 6892 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6893};
6894
8a0867d6
LP
6895DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6896
94f04347 6897static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6898 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6899 [EXEC_OUTPUT_NULL] = "null",
80876c20 6900 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6901 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6902 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6903 [EXEC_OUTPUT_JOURNAL] = "journal",
6904 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6905 [EXEC_OUTPUT_SOCKET] = "socket",
6906 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6907 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6908 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6909 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6910};
6911
6912DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6913
6914static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6915 [EXEC_UTMP_INIT] = "init",
6916 [EXEC_UTMP_LOGIN] = "login",
6917 [EXEC_UTMP_USER] = "user",
6918};
6919
6920DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6921
6922static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6923 [EXEC_PRESERVE_NO] = "no",
6924 [EXEC_PRESERVE_YES] = "yes",
6925 [EXEC_PRESERVE_RESTART] = "restart",
6926};
6927
6928DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6929
6b7b2ed9 6930/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6931static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6932 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6933 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6934 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6935 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6936 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6937};
6938
6939DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6940
211a3d87
LB
6941/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
6942static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6943 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
6944 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
6945 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
6946 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
6947 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
6948};
6949
6950DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
6951
6b7b2ed9
LP
6952/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6953 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6954 * directories, specifically .timer units with their timestamp touch file. */
6955static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6956 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6957 [EXEC_DIRECTORY_STATE] = "state",
6958 [EXEC_DIRECTORY_CACHE] = "cache",
6959 [EXEC_DIRECTORY_LOGS] = "logs",
6960 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6961};
6962
6963DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6964
6965/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6966 * the service payload in. */
fb2042dd
YW
6967static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6968 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6969 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6970 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6971 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6972 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6973};
6974
6975DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6976
b1edf445
LP
6977static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6978 [EXEC_KEYRING_INHERIT] = "inherit",
6979 [EXEC_KEYRING_PRIVATE] = "private",
6980 [EXEC_KEYRING_SHARED] = "shared",
6981};
6982
6983DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);