]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
basic/escape: add helper for quoting command lines
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
fdb3deca 46#include "cgroup-setup.h"
bb0c0d6f 47#include "chown-recursive.h"
da681e1b 48#include "cpu-set-util.h"
6a818c3c 49#include "data-fd-util.h"
f6a6225e 50#include "def.h"
686d13b9 51#include "env-file.h"
4d1a6904 52#include "env-util.h"
17df7223 53#include "errno-list.h"
3ffd4af2 54#include "execute.h"
8dd4c05b 55#include "exit-status.h"
3ffd4af2 56#include "fd-util.h"
bb0c0d6f 57#include "fileio.h"
f97b34a6 58#include "format-util.h"
f4f15635 59#include "fs-util.h"
7d50b32a 60#include "glob-util.h"
0389f4fa 61#include "hexdecoct.h"
c004493c 62#include "io-util.h"
8dd4c05b 63#include "ioprio.h"
a1164ae3 64#include "label.h"
8dd4c05b
LP
65#include "log.h"
66#include "macro.h"
e8a565cb 67#include "manager.h"
2a341bb9 68#include "manager-dump.h"
0a970718 69#include "memory-util.h"
f5947a5e 70#include "missing_fs.h"
8dd4c05b 71#include "mkdir.h"
21935150 72#include "mount-util.h"
bb0c0d6f 73#include "mountpoint-util.h"
8dd4c05b 74#include "namespace.h"
6bedfcbb 75#include "parse-util.h"
8dd4c05b 76#include "path-util.h"
0b452006 77#include "process-util.h"
d3dcf4e3 78#include "random-util.h"
78f22b97 79#include "rlimit-util.h"
8dd4c05b 80#include "rm-rf.h"
349cc4a5 81#if HAVE_SECCOMP
3ffd4af2
LP
82#include "seccomp-util.h"
83#endif
07d46372 84#include "securebits-util.h"
8dd4c05b 85#include "selinux-util.h"
24882e06 86#include "signal-util.h"
8dd4c05b 87#include "smack-util.h"
57b7a260 88#include "socket-util.h"
fd63e712 89#include "special.h"
949befd3 90#include "stat-util.h"
8b43440b 91#include "string-table.h"
07630cea 92#include "string-util.h"
8dd4c05b 93#include "strv.h"
7ccbd1ae 94#include "syslog-util.h"
8dd4c05b 95#include "terminal-util.h"
bb0c0d6f 96#include "tmpfile-util.h"
566b7d23 97#include "umask-util.h"
2d3b784d 98#include "unit-serialize.h"
b1d4f8e1 99#include "user-util.h"
8dd4c05b 100#include "utmp-wtmp.h"
5cb5a6ff 101
/* Two idle timeouts, expressed as multiples of USEC_PER_SEC (5 and 1 respectively). */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 * 1024 * 1024 bytes) requested for the logger stream socket. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
106
/* Rearranges the passed file descriptors so that fds[i] ends up being descriptor number i+3.
 *
 * Note that the array is modified in place: each entry is replaced by the duplicate that took its
 * position, and the previous descriptor is closed.
 *
 * Returns 0 on success, or a negative errno-style error if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits at its final position. */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free fd >= wanted. If the slot we asked for was
                         * still occupied, remember the first such index so we can retry from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
146
/* Adjusts the file descriptor flags of the fds that are passed on to the child.
 *
 * The first n_socket_fds entries get O_NONBLOCK set or cleared according to 'nonblock'; the
 * remaining n_storage_fds entries are left alone in that respect. FD_CLOEXEC is dropped from every
 * entry, since the fds are supposed to survive into the child.
 *
 * Returns 0 on success, or the negative error reported by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK is only relevant for the socket activation fds. */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always turned off, we want the child to inherit these. */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
179
1e22b5cd 180static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
181 assert(context);
182
1e22b5cd
LP
183 if (context->stdio_as_fds)
184 return NULL;
185
80876c20
LP
186 if (context->tty_path)
187 return context->tty_path;
188
189 return "/dev/console";
190}
191
1e22b5cd
LP
192static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
193 const char *path;
194
6ea832a2
LP
195 assert(context);
196
1e22b5cd 197 path = exec_context_tty_path(context);
6ea832a2 198
1e22b5cd
LP
199 if (context->tty_vhangup) {
200 if (p && p->stdin_fd >= 0)
201 (void) terminal_vhangup_fd(p->stdin_fd);
202 else if (path)
203 (void) terminal_vhangup(path);
204 }
6ea832a2 205
1e22b5cd
LP
206 if (context->tty_reset) {
207 if (p && p->stdin_fd >= 0)
208 (void) reset_terminal_fd(p->stdin_fd, true);
209 else if (path)
210 (void) reset_terminal(path);
211 }
212
213 if (context->tty_vt_disallocate && path)
214 (void) vt_disallocate(path);
6ea832a2
LP
215}
216
6af760f3
LP
217static bool is_terminal_input(ExecInput i) {
218 return IN_SET(i,
219 EXEC_INPUT_TTY,
220 EXEC_INPUT_TTY_FORCE,
221 EXEC_INPUT_TTY_FAIL);
222}
223
3a1286b6 224static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
225 return IN_SET(o,
226 EXEC_OUTPUT_TTY,
6af760f3
LP
227 EXEC_OUTPUT_KMSG_AND_CONSOLE,
228 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
229}
230
aac8c0c3
LP
231static bool is_kmsg_output(ExecOutput o) {
232 return IN_SET(o,
233 EXEC_OUTPUT_KMSG,
234 EXEC_OUTPUT_KMSG_AND_CONSOLE);
235}
236
6af760f3
LP
237static bool exec_context_needs_term(const ExecContext *c) {
238 assert(c);
239
240 /* Return true if the execution context suggests we should set $TERM to something useful. */
241
242 if (is_terminal_input(c->std_input))
243 return true;
244
245 if (is_terminal_output(c->std_output))
246 return true;
247
248 if (is_terminal_output(c->std_error))
249 return true;
250
251 return !!c->tty_path;
3a1286b6
MS
252}
253
80876c20 254static int open_null_as(int flags, int nfd) {
046a82c1 255 int fd;
071830ff 256
80876c20 257 assert(nfd >= 0);
071830ff 258
613b411c
LP
259 fd = open("/dev/null", flags|O_NOCTTY);
260 if (fd < 0)
071830ff
LP
261 return -errno;
262
046a82c1 263 return move_fd(fd, nfd, false);
071830ff
LP
264}
265
91dd5f7c
LP
266static int connect_journal_socket(
267 int fd,
268 const char *log_namespace,
269 uid_t uid,
270 gid_t gid) {
271
f36a9d59
ZJS
272 union sockaddr_union sa;
273 socklen_t sa_len;
524daa8c
ZJS
274 uid_t olduid = UID_INVALID;
275 gid_t oldgid = GID_INVALID;
91dd5f7c 276 const char *j;
524daa8c
ZJS
277 int r;
278
91dd5f7c
LP
279 j = log_namespace ?
280 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
281 "/run/systemd/journal/stdout";
282 r = sockaddr_un_set_path(&sa.un, j);
283 if (r < 0)
284 return r;
f36a9d59 285 sa_len = r;
91dd5f7c 286
cad93f29 287 if (gid_is_valid(gid)) {
524daa8c
ZJS
288 oldgid = getgid();
289
92a17af9 290 if (setegid(gid) < 0)
524daa8c
ZJS
291 return -errno;
292 }
293
cad93f29 294 if (uid_is_valid(uid)) {
524daa8c
ZJS
295 olduid = getuid();
296
92a17af9 297 if (seteuid(uid) < 0) {
524daa8c
ZJS
298 r = -errno;
299 goto restore_gid;
300 }
301 }
302
f36a9d59 303 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
524daa8c
ZJS
304
305 /* If we fail to restore the uid or gid, things will likely
306 fail later on. This should only happen if an LSM interferes. */
307
cad93f29 308 if (uid_is_valid(uid))
524daa8c
ZJS
309 (void) seteuid(olduid);
310
311 restore_gid:
cad93f29 312 if (gid_is_valid(gid))
524daa8c
ZJS
313 (void) setegid(oldgid);
314
315 return r;
316}
317
fd1f9c89 318static int connect_logger_as(
34cf6c43 319 const Unit *unit,
fd1f9c89 320 const ExecContext *context,
af635cf3 321 const ExecParameters *params,
fd1f9c89
LP
322 ExecOutput output,
323 const char *ident,
fd1f9c89
LP
324 int nfd,
325 uid_t uid,
326 gid_t gid) {
327
2ac1ff68
EV
328 _cleanup_close_ int fd = -1;
329 int r;
071830ff
LP
330
331 assert(context);
af635cf3 332 assert(params);
80876c20
LP
333 assert(output < _EXEC_OUTPUT_MAX);
334 assert(ident);
335 assert(nfd >= 0);
071830ff 336
54fe0cdb
LP
337 fd = socket(AF_UNIX, SOCK_STREAM, 0);
338 if (fd < 0)
80876c20 339 return -errno;
071830ff 340
91dd5f7c 341 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
342 if (r < 0)
343 return r;
071830ff 344
2ac1ff68 345 if (shutdown(fd, SHUT_RD) < 0)
80876c20 346 return -errno;
071830ff 347
fd1f9c89 348 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 349
2ac1ff68 350 if (dprintf(fd,
62bca2c6 351 "%s\n"
80876c20
LP
352 "%s\n"
353 "%i\n"
54fe0cdb
LP
354 "%i\n"
355 "%i\n"
356 "%i\n"
4f4a1dbf 357 "%i\n",
c867611e 358 context->syslog_identifier ?: ident,
af635cf3 359 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
360 context->syslog_priority,
361 !!context->syslog_level_prefix,
f3dc6af2 362 false,
aac8c0c3 363 is_kmsg_output(output),
2ac1ff68
EV
364 is_terminal_output(output)) < 0)
365 return -errno;
80876c20 366
2ac1ff68 367 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 368}
2ac1ff68 369
3a274a21 370static int open_terminal_as(const char *path, int flags, int nfd) {
046a82c1 371 int fd;
071830ff 372
80876c20
LP
373 assert(path);
374 assert(nfd >= 0);
fd1f9c89 375
3a274a21 376 fd = open_terminal(path, flags | O_NOCTTY);
3cc2aff1 377 if (fd < 0)
80876c20 378 return fd;
071830ff 379
046a82c1 380 return move_fd(fd, nfd, false);
80876c20 381}
071830ff 382
2038c3f5 383static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
384 union sockaddr_union sa;
385 socklen_t sa_len;
15a3e96f 386 _cleanup_close_ int fd = -1;
86fca584 387 int r;
071830ff 388
80876c20 389 assert(path);
071830ff 390
2038c3f5
LP
391 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
392 flags |= O_CREAT;
393
394 fd = open(path, flags|O_NOCTTY, mode);
395 if (fd >= 0)
15a3e96f 396 return TAKE_FD(fd);
071830ff 397
2038c3f5
LP
398 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
399 return -errno;
2038c3f5
LP
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
86fca584
ZJS
403 r = sockaddr_un_set_path(&sa.un, path);
404 if (r < 0)
405 return r == -EINVAL ? -ENXIO : r;
406 sa_len = r;
407
2038c3f5
LP
408 fd = socket(AF_UNIX, SOCK_STREAM, 0);
409 if (fd < 0)
410 return -errno;
411
86fca584 412 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 413 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 414 * indication that this wasn't an AF_UNIX socket after all */
071830ff 415
2038c3f5
LP
416 if ((flags & O_ACCMODE) == O_RDONLY)
417 r = shutdown(fd, SHUT_WR);
418 else if ((flags & O_ACCMODE) == O_WRONLY)
419 r = shutdown(fd, SHUT_RD);
420 else
86fca584 421 r = 0;
15a3e96f 422 if (r < 0)
2038c3f5 423 return -errno;
2038c3f5 424
15a3e96f 425 return TAKE_FD(fd);
80876c20 426}
071830ff 427
08f3be7a
LP
428static int fixup_input(
429 const ExecContext *context,
430 int socket_fd,
431 bool apply_tty_stdin) {
432
433 ExecInput std_input;
434
435 assert(context);
436
437 std_input = context->std_input;
1e3ad081
LP
438
439 if (is_terminal_input(std_input) && !apply_tty_stdin)
440 return EXEC_INPUT_NULL;
071830ff 441
03fd9c49 442 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
443 return EXEC_INPUT_NULL;
444
08f3be7a
LP
445 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
446 return EXEC_INPUT_NULL;
447
03fd9c49 448 return std_input;
4f2d528d
LP
449}
450
7966a916 451static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 452
7966a916 453 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
454 return EXEC_OUTPUT_INHERIT;
455
7966a916 456 return output;
4f2d528d
LP
457}
458
a34ceba6
LP
459static int setup_input(
460 const ExecContext *context,
461 const ExecParameters *params,
52c239d7 462 int socket_fd,
2caa38e9 463 const int named_iofds[static 3]) {
a34ceba6 464
4f2d528d
LP
465 ExecInput i;
466
467 assert(context);
a34ceba6 468 assert(params);
2caa38e9 469 assert(named_iofds);
a34ceba6
LP
470
471 if (params->stdin_fd >= 0) {
472 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
473 return -errno;
474
475 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
476 if (isatty(STDIN_FILENO)) {
477 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
478 (void) reset_terminal_fd(STDIN_FILENO, true);
479 }
a34ceba6
LP
480
481 return STDIN_FILENO;
482 }
4f2d528d 483
08f3be7a 484 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
485
486 switch (i) {
071830ff 487
80876c20
LP
488 case EXEC_INPUT_NULL:
489 return open_null_as(O_RDONLY, STDIN_FILENO);
490
491 case EXEC_INPUT_TTY:
492 case EXEC_INPUT_TTY_FORCE:
493 case EXEC_INPUT_TTY_FAIL: {
046a82c1 494 int fd;
071830ff 495
1e22b5cd 496 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
497 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
498 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
499 ACQUIRE_TERMINAL_WAIT,
3a43da28 500 USEC_INFINITY);
970edce6 501 if (fd < 0)
80876c20
LP
502 return fd;
503
046a82c1 504 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
505 }
506
4f2d528d 507 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
508 assert(socket_fd >= 0);
509
4f2d528d
LP
510 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
511
52c239d7 512 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
513 assert(named_iofds[STDIN_FILENO] >= 0);
514
52c239d7
LB
515 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
516 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
517
08f3be7a
LP
518 case EXEC_INPUT_DATA: {
519 int fd;
520
521 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
522 if (fd < 0)
523 return fd;
524
525 return move_fd(fd, STDIN_FILENO, false);
526 }
527
2038c3f5
LP
528 case EXEC_INPUT_FILE: {
529 bool rw;
530 int fd;
531
532 assert(context->stdio_file[STDIN_FILENO]);
533
534 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
535 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
536
537 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
538 if (fd < 0)
539 return fd;
540
541 return move_fd(fd, STDIN_FILENO, false);
542 }
543
80876c20
LP
544 default:
545 assert_not_reached("Unknown input type");
546 }
547}
548
41fc585a
LP
549static bool can_inherit_stderr_from_stdout(
550 const ExecContext *context,
551 ExecOutput o,
552 ExecOutput e) {
553
554 assert(context);
555
556 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
557 * stderr fd */
558
559 if (e == EXEC_OUTPUT_INHERIT)
560 return true;
561 if (e != o)
562 return false;
563
564 if (e == EXEC_OUTPUT_NAMED_FD)
565 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
566
8d7dab1f 567 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
568 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
569
570 return true;
571}
572
a34ceba6 573static int setup_output(
34cf6c43 574 const Unit *unit,
a34ceba6
LP
575 const ExecContext *context,
576 const ExecParameters *params,
577 int fileno,
578 int socket_fd,
2caa38e9 579 const int named_iofds[static 3],
a34ceba6 580 const char *ident,
7bce046b
LP
581 uid_t uid,
582 gid_t gid,
583 dev_t *journal_stream_dev,
584 ino_t *journal_stream_ino) {
a34ceba6 585
4f2d528d
LP
586 ExecOutput o;
587 ExecInput i;
47c1d80d 588 int r;
4f2d528d 589
f2341e0a 590 assert(unit);
80876c20 591 assert(context);
a34ceba6 592 assert(params);
80876c20 593 assert(ident);
7bce046b
LP
594 assert(journal_stream_dev);
595 assert(journal_stream_ino);
80876c20 596
a34ceba6
LP
597 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
598
599 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
600 return -errno;
601
602 return STDOUT_FILENO;
603 }
604
605 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
606 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
607 return -errno;
608
609 return STDERR_FILENO;
610 }
611
08f3be7a 612 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 613 o = fixup_output(context->std_output, socket_fd);
4f2d528d 614
eb17e935
MS
615 if (fileno == STDERR_FILENO) {
616 ExecOutput e;
617 e = fixup_output(context->std_error, socket_fd);
80876c20 618
eb17e935
MS
619 /* This expects the input and output are already set up */
620
621 /* Don't change the stderr file descriptor if we inherit all
622 * the way and are not on a tty */
623 if (e == EXEC_OUTPUT_INHERIT &&
624 o == EXEC_OUTPUT_INHERIT &&
625 i == EXEC_INPUT_NULL &&
626 !is_terminal_input(context->std_input) &&
7966a916 627 getppid() != 1)
eb17e935
MS
628 return fileno;
629
630 /* Duplicate from stdout if possible */
41fc585a 631 if (can_inherit_stderr_from_stdout(context, o, e))
eb17e935 632 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 633
eb17e935 634 o = e;
80876c20 635
eb17e935 636 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
637 /* If input got downgraded, inherit the original value */
638 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 639 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 640
08f3be7a
LP
641 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
642 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 643 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 644
acb591e4
LP
645 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
646 if (getppid() != 1)
eb17e935 647 return fileno;
94f04347 648
eb17e935
MS
649 /* We need to open /dev/null here anew, to get the right access mode. */
650 return open_null_as(O_WRONLY, fileno);
071830ff 651 }
94f04347 652
eb17e935 653 switch (o) {
80876c20
LP
654
655 case EXEC_OUTPUT_NULL:
eb17e935 656 return open_null_as(O_WRONLY, fileno);
80876c20
LP
657
658 case EXEC_OUTPUT_TTY:
4f2d528d 659 if (is_terminal_input(i))
eb17e935 660 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
661
662 /* We don't reset the terminal if this is just about output */
1e22b5cd 663 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 664
9a6bca7a 665 case EXEC_OUTPUT_KMSG:
28dbc1e8 666 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
667 case EXEC_OUTPUT_JOURNAL:
668 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 669 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 670 if (r < 0) {
7966a916
ZJS
671 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
672 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 673 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
674 } else {
675 struct stat st;
676
677 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
678 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
679 * services to detect whether they are connected to the journal or not.
680 *
681 * If both stdout and stderr are connected to a stream then let's make sure to store the data
682 * about STDERR as that's usually the best way to do logging. */
7bce046b 683
ab2116b1
LP
684 if (fstat(fileno, &st) >= 0 &&
685 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
686 *journal_stream_dev = st.st_dev;
687 *journal_stream_ino = st.st_ino;
688 }
47c1d80d
MS
689 }
690 return r;
4f2d528d
LP
691
692 case EXEC_OUTPUT_SOCKET:
693 assert(socket_fd >= 0);
e75a9ed1 694
eb17e935 695 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 696
52c239d7 697 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
698 assert(named_iofds[fileno] >= 0);
699
52c239d7
LB
700 (void) fd_nonblock(named_iofds[fileno], false);
701 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
702
566b7d23 703 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
704 case EXEC_OUTPUT_FILE_APPEND:
705 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 706 bool rw;
566b7d23 707 int fd, flags;
2038c3f5
LP
708
709 assert(context->stdio_file[fileno]);
710
711 rw = context->std_input == EXEC_INPUT_FILE &&
712 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
713
714 if (rw)
715 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
716
566b7d23
ZD
717 flags = O_WRONLY;
718 if (o == EXEC_OUTPUT_FILE_APPEND)
719 flags |= O_APPEND;
8d7dab1f
LW
720 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
721 flags |= O_TRUNC;
566b7d23
ZD
722
723 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
724 if (fd < 0)
725 return fd;
726
566b7d23 727 return move_fd(fd, fileno, 0);
2038c3f5
LP
728 }
729
94f04347 730 default:
80876c20 731 assert_not_reached("Unknown error type");
94f04347 732 }
071830ff
LP
733}
734
02a51aba 735static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 736 int r;
02a51aba
LP
737
738 assert(fd >= 0);
02a51aba 739
1ff74fb6 740 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
741 if (isatty(fd) < 1) {
742 if (IN_SET(errno, EINVAL, ENOTTY))
743 return 0; /* not a tty */
1ff74fb6 744
02a51aba 745 return -errno;
4b3b5bc7 746 }
02a51aba 747
4b3b5bc7 748 /* This might fail. What matters are the results. */
f2df231f 749 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
750 if (r < 0)
751 return r;
02a51aba 752
4b3b5bc7 753 return 1;
02a51aba
LP
754}
755
7d5ceb64 756static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
757 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
758 int r;
80876c20 759
80876c20
LP
760 assert(_saved_stdin);
761 assert(_saved_stdout);
762
af6da548
LP
763 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
764 if (saved_stdin < 0)
765 return -errno;
80876c20 766
af6da548 767 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
768 if (saved_stdout < 0)
769 return -errno;
80876c20 770
8854d795 771 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
772 if (fd < 0)
773 return fd;
80876c20 774
af6da548
LP
775 r = chown_terminal(fd, getuid());
776 if (r < 0)
3d18b167 777 return r;
02a51aba 778
3d18b167
LP
779 r = reset_terminal_fd(fd, true);
780 if (r < 0)
781 return r;
80876c20 782
2b33ab09 783 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 784 fd = -1;
2b33ab09
LP
785 if (r < 0)
786 return r;
80876c20
LP
787
788 *_saved_stdin = saved_stdin;
789 *_saved_stdout = saved_stdout;
790
3d18b167 791 saved_stdin = saved_stdout = -1;
80876c20 792
3d18b167 793 return 0;
80876c20
LP
794}
795
63d77c92 796static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
797 assert(err < 0);
798
799 if (err == -ETIMEDOUT)
63d77c92 800 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
801 else {
802 errno = -err;
63d77c92 803 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
804 }
805}
806
63d77c92 807static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 808 _cleanup_close_ int fd = -1;
80876c20 809
3b20f877 810 assert(vc);
80876c20 811
7d5ceb64 812 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 813 if (fd < 0)
3b20f877 814 return;
80876c20 815
63d77c92 816 write_confirm_error_fd(err, fd, u);
af6da548 817}
80876c20 818
/* Releases the terminal and puts the saved stdin/stdout fds back in place (where set), then closes
 * the saved copies and resets *saved_stdin and *saved_stdout to what safe_close() returns.
 *
 * Returns 0 on success, or the negative errno of the last failed dup2(). */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
840
3b20f877
FB
/* Possible outcomes of asking the user for confirmation before executing a command. */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
846
eedf223a 847static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 848 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 849 _cleanup_free_ char *e = NULL;
3b20f877 850 char c;
af6da548 851
3b20f877 852 /* For any internal errors, assume a positive response. */
7d5ceb64 853 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 854 if (r < 0) {
63d77c92 855 write_confirm_error(r, vc, u);
3b20f877
FB
856 return CONFIRM_EXECUTE;
857 }
af6da548 858
b0eb2944
FB
859 /* confirm_spawn might have been disabled while we were sleeping. */
860 if (manager_is_confirm_spawn_disabled(u->manager)) {
861 r = 1;
862 goto restore_stdio;
863 }
af6da548 864
2bcd3c26
FB
865 e = ellipsize(cmdline, 60, 100);
866 if (!e) {
867 log_oom();
868 r = CONFIRM_EXECUTE;
869 goto restore_stdio;
870 }
af6da548 871
d172b175 872 for (;;) {
539622bd 873 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 874 if (r < 0) {
63d77c92 875 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
876 r = CONFIRM_EXECUTE;
877 goto restore_stdio;
878 }
af6da548 879
d172b175 880 switch (c) {
b0eb2944
FB
881 case 'c':
882 printf("Resuming normal execution.\n");
883 manager_disable_confirm_spawn();
884 r = 1;
885 break;
dd6f9ac0
FB
886 case 'D':
887 unit_dump(u, stdout, " ");
888 continue; /* ask again */
d172b175
FB
889 case 'f':
890 printf("Failing execution.\n");
891 r = CONFIRM_PRETEND_FAILURE;
892 break;
893 case 'h':
b0eb2944
FB
894 printf(" c - continue, proceed without asking anymore\n"
895 " D - dump, show the state of the unit\n"
dd6f9ac0 896 " f - fail, don't execute the command and pretend it failed\n"
d172b175 897 " h - help\n"
eedf223a 898 " i - info, show a short summary of the unit\n"
56fde33a 899 " j - jobs, show jobs that are in progress\n"
d172b175
FB
900 " s - skip, don't execute the command and pretend it succeeded\n"
901 " y - yes, execute the command\n");
dd6f9ac0 902 continue; /* ask again */
eedf223a
FB
903 case 'i':
904 printf(" Description: %s\n"
905 " Unit: %s\n"
906 " Command: %s\n",
907 u->id, u->description, cmdline);
908 continue; /* ask again */
56fde33a
FB
909 case 'j':
910 manager_dump_jobs(u->manager, stdout, " ");
911 continue; /* ask again */
539622bd
FB
912 case 'n':
913 /* 'n' was removed in favor of 'f'. */
914 printf("Didn't understand 'n', did you mean 'f'?\n");
915 continue; /* ask again */
d172b175
FB
916 case 's':
917 printf("Skipping execution.\n");
918 r = CONFIRM_PRETEND_SUCCESS;
919 break;
920 case 'y':
921 r = CONFIRM_EXECUTE;
922 break;
923 default:
924 assert_not_reached("Unhandled choice");
925 }
3b20f877 926 break;
3b20f877 927 }
af6da548 928
3b20f877 929restore_stdio:
af6da548 930 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 931 return r;
80876c20
LP
932}
933
4d885bd3
DH
934static int get_fixed_user(const ExecContext *c, const char **user,
935 uid_t *uid, gid_t *gid,
936 const char **home, const char **shell) {
81a2b7ce 937 int r;
4d885bd3 938 const char *name;
81a2b7ce 939
4d885bd3 940 assert(c);
81a2b7ce 941
23deef88
LP
942 if (!c->user)
943 return 0;
944
4d885bd3
DH
945 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
946 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 947
23deef88 948 name = c->user;
fafff8f1 949 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
950 if (r < 0)
951 return r;
81a2b7ce 952
4d885bd3
DH
953 *user = name;
954 return 0;
955}
956
957static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
958 int r;
959 const char *name;
960
961 assert(c);
962
963 if (!c->group)
964 return 0;
965
966 name = c->group;
fafff8f1 967 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
968 if (r < 0)
969 return r;
970
971 *group = name;
972 return 0;
973}
974
cdc5d5c5
DH
975static int get_supplementary_groups(const ExecContext *c, const char *user,
976 const char *group, gid_t gid,
977 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
978 char **i;
979 int r, k = 0;
980 int ngroups_max;
981 bool keep_groups = false;
982 gid_t *groups = NULL;
983 _cleanup_free_ gid_t *l_gids = NULL;
984
985 assert(c);
986
bbeea271
DH
987 /*
988 * If user is given, then lookup GID and supplementary groups list.
989 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
990 * here and as early as possible so we keep the list of supplementary
991 * groups of the caller.
bbeea271
DH
992 */
993 if (user && gid_is_valid(gid) && gid != 0) {
994 /* First step, initialize groups from /etc/groups */
995 if (initgroups(user, gid) < 0)
996 return -errno;
997
998 keep_groups = true;
999 }
1000
ac6e8be6 1001 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1002 return 0;
1003
366ddd25
DH
1004 /*
1005 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1006 * be positive, otherwise fail.
1007 */
1008 errno = 0;
1009 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1010 if (ngroups_max <= 0)
1011 return errno_or_else(EOPNOTSUPP);
366ddd25 1012
4d885bd3
DH
1013 l_gids = new(gid_t, ngroups_max);
1014 if (!l_gids)
1015 return -ENOMEM;
81a2b7ce 1016
4d885bd3
DH
1017 if (keep_groups) {
1018 /*
1019 * Lookup the list of groups that the user belongs to, we
1020 * avoid NSS lookups here too for gid=0.
1021 */
1022 k = ngroups_max;
1023 if (getgrouplist(user, gid, l_gids, &k) < 0)
1024 return -EINVAL;
1025 } else
1026 k = 0;
81a2b7ce 1027
4d885bd3
DH
1028 STRV_FOREACH(i, c->supplementary_groups) {
1029 const char *g;
81a2b7ce 1030
4d885bd3
DH
1031 if (k >= ngroups_max)
1032 return -E2BIG;
81a2b7ce 1033
4d885bd3 1034 g = *i;
fafff8f1 1035 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1036 if (r < 0)
1037 return r;
81a2b7ce 1038
4d885bd3
DH
1039 k++;
1040 }
81a2b7ce 1041
4d885bd3
DH
1042 /*
1043 * Sets ngids to zero to drop all supplementary groups, happens
1044 * when we are under root and SupplementaryGroups= is empty.
1045 */
1046 if (k == 0) {
1047 *ngids = 0;
1048 return 0;
1049 }
81a2b7ce 1050
4d885bd3
DH
1051 /* Otherwise get the final list of supplementary groups */
1052 groups = memdup(l_gids, sizeof(gid_t) * k);
1053 if (!groups)
1054 return -ENOMEM;
1055
1056 *supplementary_gids = groups;
1057 *ngids = k;
1058
1059 groups = NULL;
1060
1061 return 0;
1062}
1063
/* Applies group credentials to the calling process: first the supplementary group list (only
 * if ngids > 0), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, or a negative errno-style value if either step fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r;

                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Without a valid primary group there is nothing left to apply */
        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1082
dbdc4098
TK
/* Updates the securebits of the calling process: every bit in 'mask' is cleared and every bit
 * in 'bits' is set.
 *
 * Returns 0 if the securebits already had the requested value, 1 if a new value was applied,
 * and -errno if either prctl() call failed. */
static int set_securebits(int bits, int mask) {
        int before, wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = (before & ~mask) | bits;

        if (wanted != before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1096
81a2b7ce 1097static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1098 assert(context);
dbdc4098 1099 int r;
81a2b7ce 1100
4d885bd3
DH
1101 if (!uid_is_valid(uid))
1102 return 0;
1103
479050b3 1104 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1105 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1106 * required, so we also need keep-caps in this case.
1107 */
81a2b7ce 1108
dbdc4098 1109 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1110
1111 /* First step: If we need to keep capabilities but
1112 * drop privileges we need to make sure we keep our
cbb21cca 1113 * caps, while we drop privileges. */
693ced48 1114 if (uid != 0) {
dbdc4098
TK
1115 /* Add KEEP_CAPS to the securebits */
1116 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1117 if (r < 0)
1118 return r;
693ced48 1119 }
81a2b7ce
LP
1120 }
1121
479050b3 1122 /* Second step: actually set the uids */
81a2b7ce
LP
1123 if (setresuid(uid, uid, uid) < 0)
1124 return -errno;
1125
1126 /* At this point we should have all necessary capabilities but
1127 are otherwise a normal user. However, the caps might got
1128 corrupted due to the setresuid() so we need clean them up
1129 later. This is done outside of this call. */
1130
1131 return 0;
1132}
1133
349cc4a5 1134#if HAVE_PAM
5b6319dc
LP
1135
1136static int null_conv(
1137 int num_msg,
1138 const struct pam_message **msg,
1139 struct pam_response **resp,
1140 void *appdata_ptr) {
1141
1142 /* We don't support conversations */
1143
1144 return PAM_CONV_ERR;
1145}
1146
cefc33ae
LP
1147#endif
1148
5b6319dc
LP
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process
 * "(sd-pam)" that stays around to delete the credentials and close the session again.
 *
 * 'tty' is passed to PAM as PAM_TTY; if NULL, the TTY name of stdin is tried instead. The
 * strings in *env are fed into the PAM environment, and on success *env is replaced by the
 * environment list PAM hands back. 'fds'/'n_fds' are file descriptors the helper closes right
 * after the fork. 'uid'/'gid' are the credentials the helper drops to.
 *
 * Returns a negative errno-style value on failure (-EPERM for PAM-level errors, which do not map
 * to errno), otherwise the result of strv_free_and_replace(). When built without PAM this is a
 * no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller-provided environment into the PAM environment */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* sigwait() reports failure through its return value (a positive
                                 * error number) and does not set errno, hence check the return
                                 * value directly. */
                                k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment PAM gave us back to the caller */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1372
5d6b1584
LP
/* Renames the calling process to "(<name>)", where <name> is the last path component of 'path',
 * shortened to its final 8 characters if longer. If the last component is empty the name
 * "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        const char *name;
        size_t len;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look
         * pretty in /bin/ps */

        name = basename(path);
        if (isempty(name)) {
                rename_process("(...)");
                return;
        }

        len = strlen(name);
        if (len > 8) {
                /* The end of the process name is usually more interesting, since the first bit
                 * might just be "systemd-" */
                name += len - 8;
                len = 8;
        }

        /* '(' + up to 8 chars + ')' + NUL */
        char buf[11];
        char *w = buf;

        *w++ = '(';
        memcpy(w, name, len);
        w += len;
        *w++ = ')';
        *w = 0;

        rename_process(buf);
}
1403
469830d1
LP
1404static bool context_has_address_families(const ExecContext *c) {
1405 assert(c);
1406
6b000af4 1407 return c->address_families_allow_list ||
469830d1
LP
1408 !set_isempty(c->address_families);
1409}
1410
1411static bool context_has_syscall_filters(const ExecContext *c) {
1412 assert(c);
1413
6b000af4 1414 return c->syscall_allow_list ||
8cfa775f 1415 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1416}
1417
9df2cdd8
TM
1418static bool context_has_syscall_logs(const ExecContext *c) {
1419 assert(c);
1420
1421 return c->syscall_log_allow_list ||
1422 !hashmap_isempty(c->syscall_log);
1423}
1424
469830d1
LP
1425static bool context_has_no_new_privileges(const ExecContext *c) {
1426 assert(c);
1427
1428 if (c->no_new_privileges)
1429 return true;
1430
1431 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1432 return false;
1433
1434 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1435 return c->lock_personality ||
469830d1 1436 c->memory_deny_write_execute ||
0538d2a8 1437 c->private_devices ||
fc64760d 1438 c->protect_clock ||
0538d2a8 1439 c->protect_hostname ||
469830d1
LP
1440 c->protect_kernel_tunables ||
1441 c->protect_kernel_modules ||
84703040 1442 c->protect_kernel_logs ||
0538d2a8
YW
1443 context_has_address_families(c) ||
1444 exec_context_restrict_namespaces_set(c) ||
1445 c->restrict_realtime ||
1446 c->restrict_suid_sgid ||
78e864e5 1447 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1448 context_has_syscall_filters(c) ||
1449 context_has_syscall_logs(c);
469830d1
LP
1450}
1451
bb0c0d6f
LP
1452static bool exec_context_has_credentials(const ExecContext *context) {
1453
1454 assert(context);
1455
1456 return !hashmap_isempty(context->set_credentials) ||
1457 context->load_credentials;
1458}
1459
349cc4a5 1460#if HAVE_SECCOMP
17df7223 1461
83f12b27 1462static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1463
1464 if (is_seccomp_available())
1465 return false;
1466
f673b62d 1467 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1468 return true;
83f12b27
FS
1469}
1470
165a31c0 1471static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1472 uint32_t negative_action, default_action, action;
165a31c0 1473 int r;
8351ceae 1474
469830d1 1475 assert(u);
c0467cf3 1476 assert(c);
8351ceae 1477
469830d1 1478 if (!context_has_syscall_filters(c))
83f12b27
FS
1479 return 0;
1480
469830d1
LP
1481 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1482 return 0;
e9642be2 1483
005bfaf1 1484 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1485
6b000af4 1486 if (c->syscall_allow_list) {
469830d1
LP
1487 default_action = negative_action;
1488 action = SCMP_ACT_ALLOW;
7c66bae2 1489 } else {
469830d1
LP
1490 default_action = SCMP_ACT_ALLOW;
1491 action = negative_action;
57183d11 1492 }
8351ceae 1493
165a31c0 1494 if (needs_ambient_hack) {
6b000af4 1495 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1496 if (r < 0)
1497 return r;
1498 }
1499
b54f36c6 1500 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1501}
1502
9df2cdd8
TM
1503static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1504#ifdef SCMP_ACT_LOG
1505 uint32_t default_action, action;
1506#endif
1507
1508 assert(u);
1509 assert(c);
1510
1511 if (!context_has_syscall_logs(c))
1512 return 0;
1513
1514#ifdef SCMP_ACT_LOG
1515 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1516 return 0;
1517
1518 if (c->syscall_log_allow_list) {
1519 /* Log nothing but the ones listed */
1520 default_action = SCMP_ACT_ALLOW;
1521 action = SCMP_ACT_LOG;
1522 } else {
1523 /* Log everything but the ones listed */
1524 default_action = SCMP_ACT_LOG;
1525 action = SCMP_ACT_ALLOW;
1526 }
1527
1528 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1529#else
1530 /* old libseccomp */
1531 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1532 return 0;
1533#endif
1534}
1535
469830d1
LP
1536static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1537 assert(u);
4298d0b5
LP
1538 assert(c);
1539
469830d1 1540 if (set_isempty(c->syscall_archs))
83f12b27
FS
1541 return 0;
1542
469830d1
LP
1543 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1544 return 0;
4298d0b5 1545
469830d1
LP
1546 return seccomp_restrict_archs(c->syscall_archs);
1547}
4298d0b5 1548
469830d1
LP
1549static int apply_address_families(const Unit* u, const ExecContext *c) {
1550 assert(u);
1551 assert(c);
4298d0b5 1552
469830d1
LP
1553 if (!context_has_address_families(c))
1554 return 0;
4298d0b5 1555
469830d1
LP
1556 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1557 return 0;
4298d0b5 1558
6b000af4 1559 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1560}
4298d0b5 1561
83f12b27 1562static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1563 assert(u);
f3e43635
TM
1564 assert(c);
1565
469830d1 1566 if (!c->memory_deny_write_execute)
83f12b27
FS
1567 return 0;
1568
469830d1
LP
1569 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1570 return 0;
f3e43635 1571
469830d1 1572 return seccomp_memory_deny_write_execute();
f3e43635
TM
1573}
1574
83f12b27 1575static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1576 assert(u);
f4170c67
LP
1577 assert(c);
1578
469830d1 1579 if (!c->restrict_realtime)
83f12b27
FS
1580 return 0;
1581
469830d1
LP
1582 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1583 return 0;
f4170c67 1584
469830d1 1585 return seccomp_restrict_realtime();
f4170c67
LP
1586}
1587
f69567cb
LP
1588static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1589 assert(u);
1590 assert(c);
1591
1592 if (!c->restrict_suid_sgid)
1593 return 0;
1594
1595 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1596 return 0;
1597
1598 return seccomp_restrict_suid_sgid();
1599}
1600
59e856c7 1601static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1602 assert(u);
59eeb84b
LP
1603 assert(c);
1604
1605 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1606 * let's protect even those systems where this is left on in the kernel. */
1607
469830d1 1608 if (!c->protect_kernel_tunables)
59eeb84b
LP
1609 return 0;
1610
469830d1
LP
1611 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1612 return 0;
59eeb84b 1613
469830d1 1614 return seccomp_protect_sysctl();
59eeb84b
LP
1615}
1616
59e856c7 1617static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1618 assert(u);
502d704e
DH
1619 assert(c);
1620
25a8d8a0 1621 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1622
469830d1
LP
1623 if (!c->protect_kernel_modules)
1624 return 0;
1625
502d704e
DH
1626 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1627 return 0;
1628
b54f36c6 1629 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1630}
1631
84703040
KK
1632static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1633 assert(u);
1634 assert(c);
1635
1636 if (!c->protect_kernel_logs)
1637 return 0;
1638
1639 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1640 return 0;
1641
1642 return seccomp_protect_syslog();
1643}
1644
daf8f72b 1645static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1646 assert(u);
1647 assert(c);
1648
1649 if (!c->protect_clock)
1650 return 0;
1651
1652 if (skip_seccomp_unavailable(u, "ProtectClock="))
1653 return 0;
1654
1655 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1656}
1657
59e856c7 1658static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1659 assert(u);
ba128bb8
LP
1660 assert(c);
1661
8f81a5f6 1662 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1663
469830d1
LP
1664 if (!c->private_devices)
1665 return 0;
1666
ba128bb8
LP
1667 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1668 return 0;
1669
b54f36c6 1670 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1671}
1672
34cf6c43 1673static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1674 assert(u);
add00535
LP
1675 assert(c);
1676
1677 if (!exec_context_restrict_namespaces_set(c))
1678 return 0;
1679
1680 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1681 return 0;
1682
1683 return seccomp_restrict_namespaces(c->restrict_namespaces);
1684}
1685
78e864e5 1686static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1687 unsigned long personality;
1688 int r;
78e864e5
TM
1689
1690 assert(u);
1691 assert(c);
1692
1693 if (!c->lock_personality)
1694 return 0;
1695
1696 if (skip_seccomp_unavailable(u, "LockPersonality="))
1697 return 0;
1698
e8132d63
LP
1699 personality = c->personality;
1700
1701 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1702 if (personality == PERSONALITY_INVALID) {
1703
1704 r = opinionated_personality(&personality);
1705 if (r < 0)
1706 return r;
1707 }
78e864e5
TM
1708
1709 return seccomp_lock_personality(personality);
1710}
1711
c0467cf3 1712#endif
8351ceae 1713
daf8f72b 1714static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1715 assert(u);
1716 assert(c);
1717
1718 if (!c->protect_hostname)
1719 return 0;
1720
1721 if (ns_type_supported(NAMESPACE_UTS)) {
1722 if (unshare(CLONE_NEWUTS) < 0) {
1723 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1724 *ret_exit_status = EXIT_NAMESPACE;
1725 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1726 }
1727
1728 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1729 }
1730 } else
1731 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1732
1733#if HAVE_SECCOMP
8f3e342f
ZJS
1734 int r;
1735
daf8f72b
LP
1736 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1737 return 0;
1738
1739 r = seccomp_protect_hostname();
1740 if (r < 0) {
1741 *ret_exit_status = EXIT_SECCOMP;
1742 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1743 }
1744#endif
1745
1746 return 0;
1747}
1748
3042bbeb 1749static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1750 assert(idle_pipe);
1751
54eb2300
LP
1752 idle_pipe[1] = safe_close(idle_pipe[1]);
1753 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1754
1755 if (idle_pipe[0] >= 0) {
1756 int r;
1757
1758 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1759
1760 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1761 ssize_t n;
1762
31a7eb86 1763 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1764 n = write(idle_pipe[3], "x", 1);
1765 if (n > 0)
cd972d69 1766 /* Wait for systemd to react to the signal above. */
54756dce 1767 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1768 }
1769
54eb2300 1770 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1771
1772 }
1773
54eb2300 1774 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1775}
1776
fb2042dd
YW
1777static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1778
7cae38c4 1779static int build_environment(
34cf6c43 1780 const Unit *u,
9fa95f85 1781 const ExecContext *c,
1e22b5cd 1782 const ExecParameters *p,
da6053d0 1783 size_t n_fds,
7cae38c4
LP
1784 const char *home,
1785 const char *username,
1786 const char *shell,
7bce046b
LP
1787 dev_t journal_stream_dev,
1788 ino_t journal_stream_ino,
7cae38c4
LP
1789 char ***ret) {
1790
1791 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1792 size_t n_env = 0;
7cae38c4
LP
1793 char *x;
1794
4b58153d 1795 assert(u);
7cae38c4 1796 assert(c);
7c1cb6f1 1797 assert(p);
7cae38c4
LP
1798 assert(ret);
1799
dc4e2940 1800#define N_ENV_VARS 17
8d5bb13d 1801 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1802 if (!our_env)
1803 return -ENOMEM;
1804
1805 if (n_fds > 0) {
8dd4c05b
LP
1806 _cleanup_free_ char *joined = NULL;
1807
df0ff127 1808 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1809 return -ENOMEM;
1810 our_env[n_env++] = x;
1811
da6053d0 1812 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1813 return -ENOMEM;
1814 our_env[n_env++] = x;
8dd4c05b 1815
1e22b5cd 1816 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1817 if (!joined)
1818 return -ENOMEM;
1819
605405c6 1820 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1821 if (!x)
1822 return -ENOMEM;
1823 our_env[n_env++] = x;
7cae38c4
LP
1824 }
1825
b08af3b1 1826 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1827 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1828 return -ENOMEM;
1829 our_env[n_env++] = x;
1830
1e22b5cd 1831 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1832 return -ENOMEM;
1833 our_env[n_env++] = x;
1834 }
1835
fd63e712
LP
1836 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1837 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1838 * check the database directly. */
ac647978 1839 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1840 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1841 if (!x)
1842 return -ENOMEM;
1843 our_env[n_env++] = x;
1844 }
1845
7cae38c4 1846 if (home) {
b910cc72 1847 x = strjoin("HOME=", home);
7cae38c4
LP
1848 if (!x)
1849 return -ENOMEM;
7bbead1d 1850
4ff361cc 1851 path_simplify(x + 5);
7cae38c4
LP
1852 our_env[n_env++] = x;
1853 }
1854
1855 if (username) {
b910cc72 1856 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1857 if (!x)
1858 return -ENOMEM;
1859 our_env[n_env++] = x;
1860
b910cc72 1861 x = strjoin("USER=", username);
7cae38c4
LP
1862 if (!x)
1863 return -ENOMEM;
1864 our_env[n_env++] = x;
1865 }
1866
1867 if (shell) {
b910cc72 1868 x = strjoin("SHELL=", shell);
7cae38c4
LP
1869 if (!x)
1870 return -ENOMEM;
7bbead1d 1871
4ff361cc 1872 path_simplify(x + 6);
7cae38c4
LP
1873 our_env[n_env++] = x;
1874 }
1875
4b58153d
LP
1876 if (!sd_id128_is_null(u->invocation_id)) {
1877 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1878 return -ENOMEM;
1879
1880 our_env[n_env++] = x;
1881 }
1882
6af760f3
LP
1883 if (exec_context_needs_term(c)) {
1884 const char *tty_path, *term = NULL;
1885
1886 tty_path = exec_context_tty_path(c);
1887
e8cf09b2
LP
1888 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1889 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1890 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1891
e8cf09b2 1892 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1893 term = getenv("TERM");
e8cf09b2 1894
6af760f3
LP
1895 if (!term)
1896 term = default_term_for_tty(tty_path);
7cae38c4 1897
b910cc72 1898 x = strjoin("TERM=", term);
7cae38c4
LP
1899 if (!x)
1900 return -ENOMEM;
1901 our_env[n_env++] = x;
1902 }
1903
7bce046b
LP
1904 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1905 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1906 return -ENOMEM;
1907
1908 our_env[n_env++] = x;
1909 }
1910
91dd5f7c
LP
1911 if (c->log_namespace) {
1912 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1913 if (!x)
1914 return -ENOMEM;
1915
1916 our_env[n_env++] = x;
1917 }
1918
5b10116e 1919 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
fb2042dd
YW
1920 _cleanup_free_ char *pre = NULL, *joined = NULL;
1921 const char *n;
1922
1923 if (!p->prefix[t])
1924 continue;
1925
1926 if (strv_isempty(c->directories[t].paths))
1927 continue;
1928
1929 n = exec_directory_env_name_to_string(t);
1930 if (!n)
1931 continue;
1932
1933 pre = strjoin(p->prefix[t], "/");
1934 if (!pre)
1935 return -ENOMEM;
1936
48904c8b 1937 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
fb2042dd
YW
1938 if (!joined)
1939 return -ENOMEM;
1940
1941 x = strjoin(n, "=", joined);
1942 if (!x)
1943 return -ENOMEM;
1944
1945 our_env[n_env++] = x;
1946 }
1947
bb0c0d6f
LP
1948 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1949 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1950 if (!x)
1951 return -ENOMEM;
1952
1953 our_env[n_env++] = x;
1954 }
1955
dc4e2940
YW
1956 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1957 return -ENOMEM;
1958
1959 our_env[n_env++] = x;
1960
7cae38c4 1961 our_env[n_env++] = NULL;
8d5bb13d
LP
1962 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1963#undef N_ENV_VARS
7cae38c4 1964
ae2a15bc 1965 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1966
1967 return 0;
1968}
1969
b4c14404
FB
1970static int build_pass_environment(const ExecContext *c, char ***ret) {
1971 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1972 size_t n_env = 0;
b4c14404
FB
1973 char **i;
1974
1975 STRV_FOREACH(i, c->pass_environment) {
1976 _cleanup_free_ char *x = NULL;
1977 char *v;
1978
1979 v = getenv(*i);
1980 if (!v)
1981 continue;
605405c6 1982 x = strjoin(*i, "=", v);
b4c14404
FB
1983 if (!x)
1984 return -ENOMEM;
00819cc1 1985
319a4f4b 1986 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 1987 return -ENOMEM;
00819cc1 1988
1cc6c93a 1989 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 1990 pass_env[n_env] = NULL;
b4c14404
FB
1991 }
1992
ae2a15bc 1993 *ret = TAKE_PTR(pass_env);
b4c14404
FB
1994
1995 return 0;
1996}
1997
5e8deb94 1998bool exec_needs_mount_namespace(
8b44a3d2
LP
1999 const ExecContext *context,
2000 const ExecParameters *params,
4657abb5 2001 const ExecRuntime *runtime) {
8b44a3d2
LP
2002
2003 assert(context);
8b44a3d2 2004
915e6d16
LP
2005 if (context->root_image)
2006 return true;
2007
2a624c36
AP
2008 if (!strv_isempty(context->read_write_paths) ||
2009 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2010 !strv_isempty(context->inaccessible_paths) ||
2011 !strv_isempty(context->exec_paths) ||
2012 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2013 return true;
2014
42b1d8e0 2015 if (context->n_bind_mounts > 0)
d2d6c096
LP
2016 return true;
2017
2abd4e38
YW
2018 if (context->n_temporary_filesystems > 0)
2019 return true;
2020
b3d13314
LB
2021 if (context->n_mount_images > 0)
2022 return true;
2023
93f59701
LB
2024 if (context->n_extension_images > 0)
2025 return true;
2026
37ed15d7 2027 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2028 return true;
2029
2030 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2031 return true;
2032
8b44a3d2 2033 if (context->private_devices ||
228af36f 2034 context->private_mounts ||
8b44a3d2 2035 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2036 context->protect_home != PROTECT_HOME_NO ||
2037 context->protect_kernel_tunables ||
c575770b 2038 context->protect_kernel_modules ||
94a7b275 2039 context->protect_kernel_logs ||
4e399953
LP
2040 context->protect_control_groups ||
2041 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2042 context->proc_subset != PROC_SUBSET_ALL ||
2043 context->private_ipc ||
2044 context->ipc_namespace_path)
8b44a3d2
LP
2045 return true;
2046
37c56f89 2047 if (context->root_directory) {
5e98086d 2048 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2049 return true;
2050
5b10116e 2051 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2052 if (params && !params->prefix[t])
37c56f89
YW
2053 continue;
2054
2055 if (!strv_isempty(context->directories[t].paths))
2056 return true;
2057 }
2058 }
5d997827 2059
42b1d8e0 2060 if (context->dynamic_user &&
b43ee82f 2061 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
42b1d8e0
YW
2062 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2063 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2064 return true;
2065
91dd5f7c
LP
2066 if (context->log_namespace)
2067 return true;
2068
8b44a3d2
LP
2069 return false;
2070}
2071
5749f855 2072static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2073 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2074 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2075 _cleanup_close_ int unshare_ready_fd = -1;
2076 _cleanup_(sigkill_waitp) pid_t pid = 0;
2077 uint64_t c = 1;
d251207d
LP
2078 ssize_t n;
2079 int r;
2080
5749f855
AZ
2081 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2082 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2083 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2084 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2085 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2086 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2087 * continues execution normally.
2088 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2089 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2090
5749f855
AZ
2091 /* Can only set up multiple mappings with CAP_SETUID. */
2092 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2093 r = asprintf(&uid_map,
5749f855 2094 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2095 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2096 ouid, ouid, uid, uid);
2097 else
2098 r = asprintf(&uid_map,
2099 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2100 ouid, ouid);
d251207d 2101
5749f855
AZ
2102 if (r < 0)
2103 return -ENOMEM;
2104
2105 /* Can only set up multiple mappings with CAP_SETGID. */
2106 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2107 r = asprintf(&gid_map,
5749f855 2108 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2109 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2110 ogid, ogid, gid, gid);
2111 else
2112 r = asprintf(&gid_map,
2113 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2114 ogid, ogid);
2115
2116 if (r < 0)
2117 return -ENOMEM;
d251207d
LP
2118
2119 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2120 * namespace. */
2121 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2122 if (unshare_ready_fd < 0)
2123 return -errno;
2124
2125 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2126 * failed. */
2127 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2128 return -errno;
2129
4c253ed1
LP
2130 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2131 if (r < 0)
2132 return r;
2133 if (r == 0) {
d251207d
LP
2134 _cleanup_close_ int fd = -1;
2135 const char *a;
2136 pid_t ppid;
2137
2138 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2139 * here, after the parent opened its own user namespace. */
2140
2141 ppid = getppid();
2142 errno_pipe[0] = safe_close(errno_pipe[0]);
2143
2144 /* Wait until the parent unshared the user namespace */
2145 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2146 r = -errno;
2147 goto child_fail;
2148 }
2149
2150 /* Disable the setgroups() system call in the child user namespace, for good. */
2151 a = procfs_file_alloca(ppid, "setgroups");
2152 fd = open(a, O_WRONLY|O_CLOEXEC);
2153 if (fd < 0) {
2154 if (errno != ENOENT) {
2155 r = -errno;
2156 goto child_fail;
2157 }
2158
2159 /* If the file is missing the kernel is too old, let's continue anyway. */
2160 } else {
2161 if (write(fd, "deny\n", 5) < 0) {
2162 r = -errno;
2163 goto child_fail;
2164 }
2165
2166 fd = safe_close(fd);
2167 }
2168
2169 /* First write the GID map */
2170 a = procfs_file_alloca(ppid, "gid_map");
2171 fd = open(a, O_WRONLY|O_CLOEXEC);
2172 if (fd < 0) {
2173 r = -errno;
2174 goto child_fail;
2175 }
2176 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2177 r = -errno;
2178 goto child_fail;
2179 }
2180 fd = safe_close(fd);
2181
2182 /* The write the UID map */
2183 a = procfs_file_alloca(ppid, "uid_map");
2184 fd = open(a, O_WRONLY|O_CLOEXEC);
2185 if (fd < 0) {
2186 r = -errno;
2187 goto child_fail;
2188 }
2189 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2190 r = -errno;
2191 goto child_fail;
2192 }
2193
2194 _exit(EXIT_SUCCESS);
2195
2196 child_fail:
2197 (void) write(errno_pipe[1], &r, sizeof(r));
2198 _exit(EXIT_FAILURE);
2199 }
2200
2201 errno_pipe[1] = safe_close(errno_pipe[1]);
2202
2203 if (unshare(CLONE_NEWUSER) < 0)
2204 return -errno;
2205
2206 /* Let the child know that the namespace is ready now */
2207 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2208 return -errno;
2209
2210 /* Try to read an error code from the child */
2211 n = read(errno_pipe[0], &r, sizeof(r));
2212 if (n < 0)
2213 return -errno;
2214 if (n == sizeof(r)) { /* an error code was sent to us */
2215 if (r < 0)
2216 return r;
2217 return -EIO;
2218 }
2219 if (n != 0) /* on success we should have read 0 bytes */
2220 return -EIO;
2221
2e87a1fd
LP
2222 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2223 pid = 0;
d251207d
LP
2224 if (r < 0)
2225 return r;
2e87a1fd 2226 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2227 return -EIO;
2228
2229 return 0;
2230}
2231
494d0247
YW
2232static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2233 if (!context->dynamic_user)
2234 return false;
2235
2236 if (type == EXEC_DIRECTORY_CONFIGURATION)
2237 return false;
2238
2239 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2240 return false;
2241
2242 return true;
2243}
2244
3536f49e 2245static int setup_exec_directory(
07689d5d
LP
2246 const ExecContext *context,
2247 const ExecParameters *params,
2248 uid_t uid,
3536f49e 2249 gid_t gid,
3536f49e
YW
2250 ExecDirectoryType type,
2251 int *exit_status) {
07689d5d 2252
72fd1768 2253 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2254 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2255 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2256 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2257 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2258 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2259 };
07689d5d
LP
2260 char **rt;
2261 int r;
2262
2263 assert(context);
2264 assert(params);
72fd1768 2265 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2266 assert(exit_status);
07689d5d 2267
3536f49e
YW
2268 if (!params->prefix[type])
2269 return 0;
2270
8679efde 2271 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2272 if (!uid_is_valid(uid))
2273 uid = 0;
2274 if (!gid_is_valid(gid))
2275 gid = 0;
2276 }
2277
2278 STRV_FOREACH(rt, context->directories[type].paths) {
6c47cd7d 2279 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2280
edbfeb12 2281 p = path_join(params->prefix[type], *rt);
3536f49e
YW
2282 if (!p) {
2283 r = -ENOMEM;
2284 goto fail;
2285 }
07689d5d 2286
23a7448e
YW
2287 r = mkdir_parents_label(p, 0755);
2288 if (r < 0)
3536f49e 2289 goto fail;
23a7448e 2290
494d0247 2291 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2292 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2293 * case we want to avoid leaving a directory around fully accessible that is owned by
2294 * a dynamic user whose UID is later on reused. To lock this down we use the same
2295 * trick used by container managers to prohibit host users to get access to files of
2296 * the same UID in containers: we place everything inside a directory that has an
2297 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2298 * for unprivileged host code. We then use fs namespacing to make this directory
2299 * permeable for the service itself.
6c47cd7d 2300 *
3f5b1508
LP
2301 * Specifically: for a service which wants a special directory "foo/" we first create
2302 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2303 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2304 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2305 * unprivileged host users can't look into it. Inside of the namespace of the unit
2306 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2307 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2308 * for the service and making sure it only gets access to the dirs it needs but no
2309 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2310 *
3f5b1508
LP
2311 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2312 * to be owned by the service itself.
2313 *
2314 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2315 * for sharing files or sockets with other services. */
6c47cd7d 2316
4ede9802
LP
2317 pp = path_join(params->prefix[type], "private");
2318 if (!pp) {
6c47cd7d
LP
2319 r = -ENOMEM;
2320 goto fail;
2321 }
2322
2323 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2324 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2325 if (r < 0)
2326 goto fail;
2327
4ede9802 2328 if (!path_extend(&pp, *rt)) {
6c47cd7d
LP
2329 r = -ENOMEM;
2330 goto fail;
2331 }
2332
2333 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2334 r = mkdir_parents_label(pp, 0755);
2335 if (r < 0)
2336 goto fail;
2337
949befd3
LP
2338 if (is_dir(p, false) > 0 &&
2339 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2340
2341 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2342 * it over. Most likely the service has been upgraded from one that didn't use
2343 * DynamicUser=1, to one that does. */
2344
cf52c45d
LP
2345 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2346 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2347 exec_directory_type_to_string(type), p, pp);
2348
949befd3
LP
2349 if (rename(p, pp) < 0) {
2350 r = -errno;
2351 goto fail;
2352 }
2353 } else {
2354 /* Otherwise, create the actual directory for the service */
2355
2356 r = mkdir_label(pp, context->directories[type].mode);
2357 if (r < 0 && r != -EEXIST)
2358 goto fail;
2359 }
6c47cd7d 2360
6c47cd7d 2361 /* And link it up from the original place */
6c9c51e5 2362 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2363 if (r < 0)
2364 goto fail;
2365
6c47cd7d 2366 } else {
5c6d40d1
LP
2367 _cleanup_free_ char *target = NULL;
2368
2369 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2370 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2371 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2372
2373 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2374 * by DynamicUser=1 (see above)?
2375 *
2376 * We do this for all directory types except for ConfigurationDirectory=,
2377 * since they all support the private/ symlink logic at least in some
2378 * configurations, see above. */
5c6d40d1 2379
578dc69f
YW
2380 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2381 if (r < 0)
2382 goto fail;
2383
5c6d40d1
LP
2384 q = path_join(params->prefix[type], "private", *rt);
2385 if (!q) {
2386 r = -ENOMEM;
2387 goto fail;
2388 }
2389
578dc69f
YW
2390 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2391 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2392 if (r < 0)
2393 goto fail;
2394
2395 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2396
2397 /* Hmm, apparently DynamicUser= was once turned on for this service,
2398 * but is no longer. Let's move the directory back up. */
2399
cf52c45d
LP
2400 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2401 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2402 exec_directory_type_to_string(type), q, p);
2403
5c6d40d1
LP
2404 if (unlink(p) < 0) {
2405 r = -errno;
2406 goto fail;
2407 }
2408
2409 if (rename(q, p) < 0) {
2410 r = -errno;
2411 goto fail;
2412 }
2413 }
2414 }
2415
6c47cd7d 2416 r = mkdir_label(p, context->directories[type].mode);
d484580c 2417 if (r < 0) {
d484580c
LP
2418 if (r != -EEXIST)
2419 goto fail;
2420
206e9864
LP
2421 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2422 struct stat st;
2423
2424 /* Don't change the owner/access mode of the configuration directory,
2425 * as in the common case it is not written to by a service, and shall
2426 * not be writable. */
2427
2428 if (stat(p, &st) < 0) {
2429 r = -errno;
2430 goto fail;
2431 }
2432
2433 /* Still complain if the access mode doesn't match */
2434 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2435 log_warning("%s \'%s\' already exists but the mode is different. "
2436 "(File system: %o %sMode: %o)",
2437 exec_directory_type_to_string(type), *rt,
2438 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2439
6cff72eb 2440 continue;
206e9864 2441 }
6cff72eb 2442 }
a1164ae3 2443 }
07689d5d 2444
206e9864 2445 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2446 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2447 * current UID/GID ownership.) */
2448 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2449 if (r < 0)
2450 goto fail;
c71b2eb7 2451
607b358e
LP
2452 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2453 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2454 * assignments to exist. */
607b358e 2455 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2456 if (r < 0)
3536f49e 2457 goto fail;
07689d5d
LP
2458 }
2459
2460 return 0;
3536f49e
YW
2461
2462fail:
2463 *exit_status = exit_status_table[type];
3536f49e 2464 return r;
07689d5d
LP
2465}
2466
bb0c0d6f
LP
2467static int write_credential(
2468 int dfd,
2469 const char *id,
2470 const void *data,
2471 size_t size,
2472 uid_t uid,
2473 bool ownership_ok) {
2474
2475 _cleanup_(unlink_and_freep) char *tmp = NULL;
2476 _cleanup_close_ int fd = -1;
2477 int r;
2478
2479 r = tempfn_random_child("", "cred", &tmp);
2480 if (r < 0)
2481 return r;
2482
2483 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2484 if (fd < 0) {
2485 tmp = mfree(tmp);
2486 return -errno;
2487 }
2488
2489 r = loop_write(fd, data, size, /* do_pool = */ false);
2490 if (r < 0)
2491 return r;
2492
2493 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2494 return -errno;
2495
2496 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2497 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2498 if (r < 0) {
2499 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2500 return r;
2501
2502 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2503 * to express: that the user gets read access and nothing
2504 * else. But if the backing fs can't support that (e.g. ramfs)
2505 * then we can use file ownership instead. But that's only safe if
2506 * we can then re-mount the whole thing read-only, so that the
2507 * user can no longer chmod() the file to gain write access. */
2508 return r;
2509
f5fbe71d 2510 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2511 return -errno;
2512 }
2513 }
2514
2515 if (renameat(dfd, tmp, dfd, id) < 0)
2516 return -errno;
2517
2518 tmp = mfree(tmp);
2519 return 0;
2520}
2521
2522#define CREDENTIALS_BYTES_MAX (1024LU * 1024LU) /* Refuse to pass more than 1M, after all this is unswappable memory */
2523
2524static int acquire_credentials(
2525 const ExecContext *context,
2526 const ExecParameters *params,
d3dcf4e3 2527 const char *unit,
bb0c0d6f
LP
2528 const char *p,
2529 uid_t uid,
2530 bool ownership_ok) {
2531
2532 uint64_t left = CREDENTIALS_BYTES_MAX;
2533 _cleanup_close_ int dfd = -1;
2534 ExecSetCredential *sc;
2535 char **id, **fn;
bb0c0d6f
LP
2536 int r;
2537
2538 assert(context);
2539 assert(p);
2540
2541 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2542 if (dfd < 0)
2543 return -errno;
2544
69e3234d 2545 /* First we use the literally specified credentials. Note that they might be overridden again below,
bb0c0d6f 2546 * and thus act as a "default" if the same credential is specified multiple times */
90e74a66 2547 HASHMAP_FOREACH(sc, context->set_credentials) {
bb0c0d6f
LP
2548 size_t add;
2549
2550 add = strlen(sc->id) + sc->size;
2551 if (add > left)
2552 return -E2BIG;
2553
2554 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2555 if (r < 0)
2556 return r;
2557
2558 left -= add;
2559 }
2560
2561 /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2562 STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2563 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2564 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2565 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2566 bool missing_ok = true;
bb0c0d6f
LP
2567 const char *source;
2568 size_t size, add;
2569
2570 if (path_is_absolute(*fn)) {
2571 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2572 source = *fn;
2573 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2574
2575 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2576 * via the source socket address in case we read off an AF_UNIX socket. */
2577 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2578 return -ENOMEM;
2579
fc682be2
LP
2580 missing_ok = false;
2581
bb0c0d6f
LP
2582 } else if (params->received_credentials) {
2583 /* If this is a relative path, take it relative to the credentials we received
2584 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2585 * on a credential store, i.e. this is guaranteed to be regular files. */
2586 j = path_join(params->received_credentials, *fn);
2587 if (!j)
2588 return -ENOMEM;
2589
2590 source = j;
2591 } else
2592 source = NULL;
2593
2594 if (source)
986311c2 2595 r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
bb0c0d6f
LP
2596 else
2597 r = -ENOENT;
fc682be2
LP
2598 if (r == -ENOENT && (missing_ok || faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)) {
2599 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2600 * will get clear errors if we don't pass such a missing credential on as they
2601 * themselves will get ENOENT when trying to read them, which should not be much
2602 * worse than when we handle the error here and make it fatal.
2603 *
2604 * Also, if the source file doesn't exist, but we already acquired the key otherwise,
2605 * then don't fail either. */
2606 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", *fn);
bb0c0d6f 2607 continue;
fc682be2 2608 }
bb0c0d6f 2609 if (r < 0)
fc682be2 2610 return log_debug_errno(r, "Failed to read credential '%s': %m", *fn);
bb0c0d6f
LP
2611
2612 add = strlen(*id) + size;
2613 if (add > left)
2614 return -E2BIG;
2615
2616 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2617 if (r < 0)
2618 return r;
2619
2620 left -= add;
2621 }
2622
2623 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2624 return -errno;
2625
2626 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2627 * accessible */
2628
2629 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2630 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2631 if (r < 0) {
2632 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2633 return r;
2634
2635 if (!ownership_ok)
2636 return r;
2637
f5fbe71d 2638 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2639 return -errno;
2640 }
2641 }
2642
2643 return 0;
2644}
2645
2646static int setup_credentials_internal(
2647 const ExecContext *context,
2648 const ExecParameters *params,
d3dcf4e3 2649 const char *unit,
bb0c0d6f
LP
2650 const char *final, /* This is where the credential store shall eventually end up at */
2651 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2652 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2653 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2654 uid_t uid) {
2655
2656 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2657 * if we mounted something; false if we definitely can't mount anything */
2658 bool final_mounted;
2659 const char *where;
2660
2661 assert(context);
2662 assert(final);
2663 assert(workspace);
2664
2665 if (reuse_workspace) {
2666 r = path_is_mount_point(workspace, NULL, 0);
2667 if (r < 0)
2668 return r;
2669 if (r > 0)
2670 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2671 else
2672 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2673 } else
2674 workspace_mounted = -1; /* ditto */
2675
2676 r = path_is_mount_point(final, NULL, 0);
2677 if (r < 0)
2678 return r;
2679 if (r > 0) {
2680 /* If the final place already has something mounted, we use that. If the workspace also has
2681 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2682 * different). */
2683 final_mounted = true;
2684
2685 if (workspace_mounted < 0) {
2686 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2687 * the final version to the workspace, and make it writable, so that we can make
2688 * changes */
2689
21935150
LP
2690 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2691 if (r < 0)
2692 return r;
bb0c0d6f 2693
21935150
LP
2694 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2695 if (r < 0)
2696 return r;
bb0c0d6f
LP
2697
2698 workspace_mounted = true;
2699 }
2700 } else
2701 final_mounted = false;
2702
2703 if (workspace_mounted < 0) {
2704 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2705 for (int try = 0;; try++) {
2706
2707 if (try == 0) {
2708 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
2709 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2710 if (r >= 0) {
bb0c0d6f
LP
2711 workspace_mounted = true;
2712 break;
2713 }
2714
2715 } else if (try == 1) {
2716 _cleanup_free_ char *opts = NULL;
2717
2718 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
2719 return -ENOMEM;
2720
2721 /* Fall back to "tmpfs" otherwise */
21935150
LP
2722 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2723 if (r >= 0) {
bb0c0d6f
LP
2724 workspace_mounted = true;
2725 break;
2726 }
2727
2728 } else {
2729 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
2730 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2731 if (r < 0) {
2732 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2733 return r;
bb0c0d6f
LP
2734
2735 if (must_mount) /* If we it's not OK to use the plain directory
2736 * fallback, propagate all errors too */
21935150 2737 return r;
bb0c0d6f
LP
2738
2739 /* If we lack privileges to bind mount stuff, then let's gracefully
2740 * proceed for compat with container envs, and just use the final dir
2741 * as is. */
2742
2743 workspace_mounted = false;
2744 break;
2745 }
2746
2747 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
2748 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2749 if (r < 0)
2750 return r;
bb0c0d6f
LP
2751
2752 workspace_mounted = true;
2753 break;
2754 }
2755 }
2756 }
2757
2758 assert(!must_mount || workspace_mounted > 0);
2759 where = workspace_mounted ? workspace : final;
2760
d3dcf4e3 2761 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
2762 if (r < 0)
2763 return r;
2764
2765 if (workspace_mounted) {
2766 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
2767 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2768 if (r < 0)
2769 return r;
bb0c0d6f
LP
2770
2771 /* And mount it to the final place, read-only */
21935150
LP
2772 if (final_mounted)
2773 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2774 else
2775 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2776 if (r < 0)
2777 return r;
bb0c0d6f
LP
2778 } else {
2779 _cleanup_free_ char *parent = NULL;
2780
2781 /* If we do not have our own mount put used the plain directory fallback, then we need to
2782 * open access to the top-level credential directory and the per-service directory now */
2783
2784 parent = dirname_malloc(final);
2785 if (!parent)
2786 return -ENOMEM;
2787 if (chmod(parent, 0755) < 0)
2788 return -errno;
2789 }
2790
2791 return 0;
2792}
2793
2794static int setup_credentials(
2795 const ExecContext *context,
2796 const ExecParameters *params,
2797 const char *unit,
2798 uid_t uid) {
2799
2800 _cleanup_free_ char *p = NULL, *q = NULL;
2801 const char *i;
2802 int r;
2803
2804 assert(context);
2805 assert(params);
2806
2807 if (!exec_context_has_credentials(context))
2808 return 0;
2809
2810 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2811 return -EINVAL;
2812
2813 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2814 * and the subdir we mount over with a read-only file system readable by the service's user */
2815 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2816 if (!q)
2817 return -ENOMEM;
2818
2819 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2820 if (r < 0 && r != -EEXIST)
2821 return r;
2822
2823 p = path_join(q, unit);
2824 if (!p)
2825 return -ENOMEM;
2826
2827 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2828 if (r < 0 && r != -EEXIST)
2829 return r;
2830
2831 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2832 if (r < 0) {
2833 _cleanup_free_ char *t = NULL, *u = NULL;
2834
2835 /* If this is not a privilege or support issue then propagate the error */
2836 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2837 return r;
2838
2839 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2840 * it into place, so that users can't access half-initialized credential stores. */
2841 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2842 if (!t)
2843 return -ENOMEM;
2844
2845 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2846 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2847 * after it is fully set up */
2848 u = path_join(t, unit);
2849 if (!u)
2850 return -ENOMEM;
2851
2852 FOREACH_STRING(i, t, u) {
2853 r = mkdir_label(i, 0700);
2854 if (r < 0 && r != -EEXIST)
2855 return r;
2856 }
2857
2858 r = setup_credentials_internal(
2859 context,
2860 params,
d3dcf4e3 2861 unit,
bb0c0d6f
LP
2862 p, /* final mount point */
2863 u, /* temporary workspace to overmount */
2864 true, /* reuse the workspace if it is already a mount */
2865 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2866 uid);
2867
2868 (void) rmdir(u); /* remove the workspace again if we can. */
2869
2870 if (r < 0)
2871 return r;
2872
2873 } else if (r == 0) {
2874
2875 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2876 * we can use the same directory for all cases, after turning off propagation. Question
2877 * though is: where do we turn off propagation exactly, and where do we place the workspace
2878 * directory? We need some place that is guaranteed to be a mount point in the host, and
2879 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2880 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2881 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2882 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2883 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2884 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2885 * propagation on the former, and then overmount the latter.
2886 *
2887 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2888 * for this purpose, but there are few other candidates that work equally well for us, and
2889 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 2890 * that no one else sees this should be OK to do. */
bb0c0d6f 2891
21935150
LP
2892 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2893 if (r < 0)
bb0c0d6f
LP
2894 goto child_fail;
2895
2896 r = setup_credentials_internal(
2897 context,
2898 params,
d3dcf4e3 2899 unit,
bb0c0d6f
LP
2900 p, /* final mount point */
2901 "/dev/shm", /* temporary workspace to overmount */
2902 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2903 true, /* insist that something is mounted, do not allow fallback to plain directory */
2904 uid);
2905 if (r < 0)
2906 goto child_fail;
2907
2908 _exit(EXIT_SUCCESS);
2909
2910 child_fail:
2911 _exit(EXIT_FAILURE);
2912 }
2913
2914 return 0;
2915}
2916
#if ENABLE_SMACK
/* Applies the SMACK process label to the calling process: the one configured in the context if there is one;
 * otherwise, if a build-time default label is defined, the exec label read off the executable, or that
 * default if none could be read. Returns 0 on success, or a negative errno-style value on failure. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                /* An explicitly configured label always takes precedence */
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *exec_label = NULL;

                /* Missing label data and lack of support are tolerated, in that case exec_label simply
                 * stays unset and the default label below is used. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
cefc33ae 2948
6c47cd7d
LP
2949static int compile_bind_mounts(
2950 const ExecContext *context,
2951 const ExecParameters *params,
2952 BindMount **ret_bind_mounts,
da6053d0 2953 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2954 char ***ret_empty_directories) {
2955
2956 _cleanup_strv_free_ char **empty_directories = NULL;
2957 BindMount *bind_mounts;
5b10116e 2958 size_t n, h = 0;
6c47cd7d
LP
2959 int r;
2960
2961 assert(context);
2962 assert(params);
2963 assert(ret_bind_mounts);
2964 assert(ret_n_bind_mounts);
2965 assert(ret_empty_directories);
2966
2967 n = context->n_bind_mounts;
5b10116e 2968 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2969 if (!params->prefix[t])
2970 continue;
2971
2972 n += strv_length(context->directories[t].paths);
2973 }
2974
2975 if (n <= 0) {
2976 *ret_bind_mounts = NULL;
2977 *ret_n_bind_mounts = 0;
2978 *ret_empty_directories = NULL;
2979 return 0;
2980 }
2981
2982 bind_mounts = new(BindMount, n);
2983 if (!bind_mounts)
2984 return -ENOMEM;
2985
5b10116e 2986 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
2987 BindMount *item = context->bind_mounts + i;
2988 char *s, *d;
2989
2990 s = strdup(item->source);
2991 if (!s) {
2992 r = -ENOMEM;
2993 goto finish;
2994 }
2995
2996 d = strdup(item->destination);
2997 if (!d) {
2998 free(s);
2999 r = -ENOMEM;
3000 goto finish;
3001 }
3002
3003 bind_mounts[h++] = (BindMount) {
3004 .source = s,
3005 .destination = d,
3006 .read_only = item->read_only,
3007 .recursive = item->recursive,
3008 .ignore_enoent = item->ignore_enoent,
3009 };
3010 }
3011
5b10116e 3012 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3013 char **suffix;
3014
3015 if (!params->prefix[t])
3016 continue;
3017
3018 if (strv_isempty(context->directories[t].paths))
3019 continue;
3020
494d0247 3021 if (exec_directory_is_private(context, t) &&
74e12520 3022 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3023 char *private_root;
3024
3025 /* So this is for a dynamic user, and we need to make sure the process can access its own
3026 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3027 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3028
657ee2d8 3029 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3030 if (!private_root) {
3031 r = -ENOMEM;
3032 goto finish;
3033 }
3034
3035 r = strv_consume(&empty_directories, private_root);
a635a7ae 3036 if (r < 0)
6c47cd7d 3037 goto finish;
6c47cd7d
LP
3038 }
3039
3040 STRV_FOREACH(suffix, context->directories[t].paths) {
3041 char *s, *d;
3042
494d0247 3043 if (exec_directory_is_private(context, t))
657ee2d8 3044 s = path_join(params->prefix[t], "private", *suffix);
6c47cd7d 3045 else
657ee2d8 3046 s = path_join(params->prefix[t], *suffix);
6c47cd7d
LP
3047 if (!s) {
3048 r = -ENOMEM;
3049 goto finish;
3050 }
3051
494d0247 3052 if (exec_directory_is_private(context, t) &&
74e12520 3053 exec_context_with_rootfs(context))
5609f688
YW
3054 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3055 * directory is not created on the root directory. So, let's bind-mount the directory
3056 * on the 'non-private' place. */
657ee2d8 3057 d = path_join(params->prefix[t], *suffix);
5609f688
YW
3058 else
3059 d = strdup(s);
6c47cd7d
LP
3060 if (!d) {
3061 free(s);
3062 r = -ENOMEM;
3063 goto finish;
3064 }
3065
3066 bind_mounts[h++] = (BindMount) {
3067 .source = s,
3068 .destination = d,
3069 .read_only = false,
9ce4e4b0 3070 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3071 .recursive = true,
3072 .ignore_enoent = false,
3073 };
3074 }
3075 }
3076
3077 assert(h == n);
3078
3079 *ret_bind_mounts = bind_mounts;
3080 *ret_n_bind_mounts = n;
ae2a15bc 3081 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3082
3083 return (int) n;
3084
3085finish:
3086 bind_mount_free_many(bind_mounts, h);
3087 return r;
3088}
3089
4e677599
LP
3090static bool insist_on_sandboxing(
3091 const ExecContext *context,
3092 const char *root_dir,
3093 const char *root_image,
3094 const BindMount *bind_mounts,
3095 size_t n_bind_mounts) {
3096
4e677599
LP
3097 assert(context);
3098 assert(n_bind_mounts == 0 || bind_mounts);
3099
3100 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3101 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3102 * rearrange stuff in a way we cannot ignore gracefully. */
3103
3104 if (context->n_temporary_filesystems > 0)
3105 return true;
3106
3107 if (root_dir || root_image)
3108 return true;
3109
b3d13314
LB
3110 if (context->n_mount_images > 0)
3111 return true;
3112
4e677599
LP
3113 if (context->dynamic_user)
3114 return true;
3115
3116 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3117 * essential. */
5b10116e 3118 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3119 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3120 return true;
3121
91dd5f7c
LP
3122 if (context->log_namespace)
3123 return true;
3124
4e677599
LP
3125 return false;
3126}
3127
6818c54c 3128static int apply_mount_namespace(
34cf6c43 3129 const Unit *u,
9f71ba8d 3130 ExecCommandFlags command_flags,
6818c54c
LP
3131 const ExecContext *context,
3132 const ExecParameters *params,
7cc5ef5f
ZJS
3133 const ExecRuntime *runtime,
3134 char **error_path) {
6818c54c 3135
7bcef4ef 3136 _cleanup_strv_free_ char **empty_directories = NULL;
56a13a49 3137 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3138 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3139 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3140 NamespaceInfo ns_info;
165a31c0 3141 bool needs_sandboxing;
6c47cd7d 3142 BindMount *bind_mounts = NULL;
da6053d0 3143 size_t n_bind_mounts = 0;
6818c54c 3144 int r;
93c6bb51 3145
2b3c1b9e
DH
3146 assert(context);
3147
915e6d16
LP
3148 if (params->flags & EXEC_APPLY_CHROOT) {
3149 root_image = context->root_image;
3150
3151 if (!root_image)
3152 root_dir = context->root_directory;
3153 }
93c6bb51 3154
6c47cd7d
LP
3155 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3156 if (r < 0)
3157 return r;
3158
9f71ba8d 3159 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3160 if (needs_sandboxing) {
3161 /* The runtime struct only contains the parent of the private /tmp,
3162 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3163 * that is sticky, and that's the one we want to use here.
3164 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3165
3166 if (context->private_tmp && runtime) {
56a13a49
ZJS
3167 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3168 tmp_dir = runtime->tmp_dir;
3169 else if (runtime->tmp_dir)
3170 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3171
3172 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3173 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3174 else if (runtime->var_tmp_dir)
56a13a49 3175 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3176 }
3177
b5a33299
YW
3178 ns_info = (NamespaceInfo) {
3179 .ignore_protect_paths = false,
3180 .private_dev = context->private_devices,
3181 .protect_control_groups = context->protect_control_groups,
3182 .protect_kernel_tunables = context->protect_kernel_tunables,
3183 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3184 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3185 .protect_hostname = context->protect_hostname,
5e98086d 3186 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3187 .private_mounts = context->private_mounts,
52b3d652
LP
3188 .protect_home = context->protect_home,
3189 .protect_system = context->protect_system,
4e399953
LP
3190 .protect_proc = context->protect_proc,
3191 .proc_subset = context->proc_subset,
80271a44 3192 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3193 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3194 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3195 };
ecf63c91 3196 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3197 /*
3198 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3199 * sandbox info, otherwise enforce it, don't ignore protected paths and
3200 * fail if we are enable to apply the sandbox inside the mount namespace.
3201 */
3202 ns_info = (NamespaceInfo) {
3203 .ignore_protect_paths = true,
3204 };
3205 else
3206 ns_info = (NamespaceInfo) {};
b5a33299 3207
37ed15d7
FB
3208 if (context->mount_flags == MS_SHARED)
3209 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3210
a631cbfa
LP
3211 if (exec_context_has_credentials(context) &&
3212 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3213 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3214 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3215 if (!creds_path) {
3216 r = -ENOMEM;
3217 goto finalize;
3218 }
bbb4e7f3
LP
3219 }
3220
5e8deb94
LB
3221 if (MANAGER_IS_SYSTEM(u->manager)) {
3222 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3223 if (!propagate_dir) {
3224 r = -ENOMEM;
3225 goto finalize;
3226 }
3227
5e8deb94 3228 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3229 if (!incoming_dir) {
3230 r = -ENOMEM;
3231 goto finalize;
3232 }
5e8deb94
LB
3233 }
3234
18d73705 3235 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3236 &ns_info, context->read_write_paths,
165a31c0
LP
3237 needs_sandboxing ? context->read_only_paths : NULL,
3238 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3239 needs_sandboxing ? context->exec_paths : NULL,
3240 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d
LP
3241 empty_directories,
3242 bind_mounts,
3243 n_bind_mounts,
2abd4e38
YW
3244 context->temporary_filesystems,
3245 context->n_temporary_filesystems,
b3d13314
LB
3246 context->mount_images,
3247 context->n_mount_images,
56a13a49
ZJS
3248 tmp_dir,
3249 var_tmp_dir,
bbb4e7f3 3250 creds_path,
91dd5f7c 3251 context->log_namespace,
915e6d16 3252 context->mount_flags,
d4d55b0d
LB
3253 context->root_hash, context->root_hash_size, context->root_hash_path,
3254 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3255 context->root_verity,
93f59701
LB
3256 context->extension_images,
3257 context->n_extension_images,
5e8deb94
LB
3258 propagate_dir,
3259 incoming_dir,
3bdc25a4 3260 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3261 error_path);
93c6bb51 3262
1beab8b0 3263 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3264 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3265 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3266 * completely different execution environment. */
aca835ed 3267 if (r == -ENOANO) {
4e677599
LP
3268 if (insist_on_sandboxing(
3269 context,
3270 root_dir, root_image,
3271 bind_mounts,
3272 n_bind_mounts)) {
3273 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3274 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3275 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3276
3277 r = -EOPNOTSUPP;
3278 } else {
aca835ed 3279 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3280 r = 0;
aca835ed 3281 }
93c6bb51
DH
3282 }
3283
8062e643 3284finalize:
4e677599 3285 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3286 return r;
3287}
3288
915e6d16
LP
3289static int apply_working_directory(
3290 const ExecContext *context,
3291 const ExecParameters *params,
3292 const char *home,
376fecf6 3293 int *exit_status) {
915e6d16 3294
6732edab 3295 const char *d, *wd;
2b3c1b9e
DH
3296
3297 assert(context);
376fecf6 3298 assert(exit_status);
2b3c1b9e 3299
6732edab
LP
3300 if (context->working_directory_home) {
3301
376fecf6
LP
3302 if (!home) {
3303 *exit_status = EXIT_CHDIR;
6732edab 3304 return -ENXIO;
376fecf6 3305 }
6732edab 3306
2b3c1b9e 3307 wd = home;
6732edab 3308
14eb3285
LP
3309 } else
3310 wd = empty_to_root(context->working_directory);
e7f1e7c6 3311
fa97f630 3312 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3313 d = wd;
fa97f630 3314 else
3b0e5bb5 3315 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3316
376fecf6
LP
3317 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3318 *exit_status = EXIT_CHDIR;
2b3c1b9e 3319 return -errno;
376fecf6 3320 }
e7f1e7c6
DH
3321
3322 return 0;
3323}
3324
fa97f630
JB
3325static int apply_root_directory(
3326 const ExecContext *context,
3327 const ExecParameters *params,
3328 const bool needs_mount_ns,
3329 int *exit_status) {
3330
3331 assert(context);
3332 assert(exit_status);
3333
5b10116e 3334 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3335 if (!needs_mount_ns && context->root_directory)
3336 if (chroot(context->root_directory) < 0) {
3337 *exit_status = EXIT_CHROOT;
3338 return -errno;
3339 }
fa97f630
JB
3340
3341 return 0;
3342}
3343
b1edf445 3344static int setup_keyring(
34cf6c43 3345 const Unit *u,
b1edf445
LP
3346 const ExecContext *context,
3347 const ExecParameters *p,
3348 uid_t uid, gid_t gid) {
3349
74dd6b51 3350 key_serial_t keyring;
e64c2d0b
DJL
3351 int r = 0;
3352 uid_t saved_uid;
3353 gid_t saved_gid;
74dd6b51
LP
3354
3355 assert(u);
b1edf445 3356 assert(context);
74dd6b51
LP
3357 assert(p);
3358
3359 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3360 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3361 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3362 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3363 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3364 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3365
b1edf445
LP
3366 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3367 return 0;
3368
e64c2d0b
DJL
3369 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3370 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3371 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3372 * & group is just as nasty as acquiring a reference to the user keyring. */
3373
3374 saved_uid = getuid();
3375 saved_gid = getgid();
3376
3377 if (gid_is_valid(gid) && gid != saved_gid) {
3378 if (setregid(gid, -1) < 0)
3379 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3380 }
3381
3382 if (uid_is_valid(uid) && uid != saved_uid) {
3383 if (setreuid(uid, -1) < 0) {
3384 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3385 goto out;
3386 }
3387 }
3388
74dd6b51
LP
3389 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3390 if (keyring == -1) {
3391 if (errno == ENOSYS)
8002fb97 3392 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3393 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3394 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3395 else if (errno == EDQUOT)
8002fb97 3396 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3397 else
e64c2d0b 3398 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3399
e64c2d0b 3400 goto out;
74dd6b51
LP
3401 }
3402
e64c2d0b
DJL
3403 /* When requested link the user keyring into the session keyring. */
3404 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3405
3406 if (keyctl(KEYCTL_LINK,
3407 KEY_SPEC_USER_KEYRING,
3408 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3409 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3410 goto out;
3411 }
3412 }
3413
3414 /* Restore uid/gid back */
3415 if (uid_is_valid(uid) && uid != saved_uid) {
3416 if (setreuid(saved_uid, -1) < 0) {
3417 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3418 goto out;
3419 }
3420 }
3421
3422 if (gid_is_valid(gid) && gid != saved_gid) {
3423 if (setregid(saved_gid, -1) < 0)
3424 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3425 }
3426
3427 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3428 if (!sd_id128_is_null(u->invocation_id)) {
3429 key_serial_t key;
3430
3431 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3432 if (key == -1)
8002fb97 3433 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3434 else {
3435 if (keyctl(KEYCTL_SETPERM, key,
3436 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3437 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3438 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3439 }
3440 }
3441
e64c2d0b 3442out:
37b22b3b 3443 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3444 /* no extra logging, as only the first already reported error matters */
3445 if (getuid() != saved_uid)
3446 (void) setreuid(saved_uid, -1);
b1edf445 3447
e64c2d0b
DJL
3448 if (getgid() != saved_gid)
3449 (void) setregid(saved_gid, -1);
b1edf445 3450
e64c2d0b 3451 return r;
74dd6b51
LP
3452}
3453
/* Appends the valid (i.e. non-negative) file descriptors of the given pair to 'array', in order,
 * advancing the fill counter *n for each one added. The caller has to make sure the array has room for
 * up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++)
                if (pair[i] >= 0)
                        array[(*n)++] = pair[i];
}
3464
a34ceba6
LP
3465static int close_remaining_fds(
3466 const ExecParameters *params,
34cf6c43
YW
3467 const ExecRuntime *runtime,
3468 const DynamicCreds *dcreds,
00d9ef85 3469 int user_lookup_fd,
a34ceba6 3470 int socket_fd,
5b8d1f6b 3471 const int *fds, size_t n_fds) {
a34ceba6 3472
da6053d0 3473 size_t n_dont_close = 0;
00d9ef85 3474 int dont_close[n_fds + 12];
a34ceba6
LP
3475
3476 assert(params);
3477
3478 if (params->stdin_fd >= 0)
3479 dont_close[n_dont_close++] = params->stdin_fd;
3480 if (params->stdout_fd >= 0)
3481 dont_close[n_dont_close++] = params->stdout_fd;
3482 if (params->stderr_fd >= 0)
3483 dont_close[n_dont_close++] = params->stderr_fd;
3484
3485 if (socket_fd >= 0)
3486 dont_close[n_dont_close++] = socket_fd;
3487 if (n_fds > 0) {
3488 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3489 n_dont_close += n_fds;
3490 }
3491
a70581ff 3492 if (runtime) {
29206d46 3493 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3494 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3495 }
29206d46
LP
3496
3497 if (dcreds) {
3498 if (dcreds->user)
3499 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3500 if (dcreds->group)
3501 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3502 }
3503
00d9ef85
LP
3504 if (user_lookup_fd >= 0)
3505 dont_close[n_dont_close++] = user_lookup_fd;
3506
a34ceba6
LP
3507 return close_all_fds(dont_close, n_dont_close);
3508}
3509
00d9ef85
LP
3510static int send_user_lookup(
3511 Unit *unit,
3512 int user_lookup_fd,
3513 uid_t uid,
3514 gid_t gid) {
3515
3516 assert(unit);
3517
3518 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3519 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3520 * specified. */
3521
3522 if (user_lookup_fd < 0)
3523 return 0;
3524
3525 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3526 return 0;
3527
3528 if (writev(user_lookup_fd,
3529 (struct iovec[]) {
e6a7ec4b
LP
3530 IOVEC_INIT(&uid, sizeof(uid)),
3531 IOVEC_INIT(&gid, sizeof(gid)),
3532 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3533 return -errno;
3534
3535 return 0;
3536}
3537
6732edab
LP
3538static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3539 int r;
3540
3541 assert(c);
3542 assert(home);
3543 assert(buf);
3544
3545 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3546
3547 if (*home)
3548 return 0;
3549
3550 if (!c->working_directory_home)
3551 return 0;
3552
6732edab
LP
3553 r = get_home_dir(buf);
3554 if (r < 0)
3555 return r;
3556
3557 *home = *buf;
3558 return 1;
3559}
3560
da50b85a
LP
3561static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3562 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3563 int r;
3564
3565 assert(c);
3566 assert(p);
3567 assert(ret);
3568
3569 assert(c->dynamic_user);
3570
3571 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3572 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3573 * directories. */
3574
5b10116e 3575 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3576 char **i;
3577
3578 if (t == EXEC_DIRECTORY_CONFIGURATION)
3579 continue;
3580
3581 if (!p->prefix[t])
3582 continue;
3583
3584 STRV_FOREACH(i, c->directories[t].paths) {
3585 char *e;
3586
494d0247 3587 if (exec_directory_is_private(c, t))
657ee2d8 3588 e = path_join(p->prefix[t], "private", *i);
494d0247
YW
3589 else
3590 e = path_join(p->prefix[t], *i);
da50b85a
LP
3591 if (!e)
3592 return -ENOMEM;
3593
3594 r = strv_consume(&list, e);
3595 if (r < 0)
3596 return r;
3597 }
3598 }
3599
ae2a15bc 3600 *ret = TAKE_PTR(list);
da50b85a
LP
3601
3602 return 0;
3603}
3604
34cf6c43
YW
3605static char *exec_command_line(char **argv);
3606
78f93209
LP
3607static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3608 bool using_subcgroup;
3609 char *p;
3610
3611 assert(params);
3612 assert(ret);
3613
3614 if (!params->cgroup_path)
3615 return -EINVAL;
3616
3617 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3618 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3619 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3620 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3621 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3622 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3623 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3624 * flag, which is only passed for the former statements, not for the latter. */
3625
3626 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3627 if (using_subcgroup)
657ee2d8 3628 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3629 else
3630 p = strdup(params->cgroup_path);
3631 if (!p)
3632 return -ENOMEM;
3633
3634 *ret = p;
3635 return using_subcgroup;
3636}
3637
e2b2fb7f
MS
3638static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3639 _cleanup_(cpu_set_reset) CPUSet s = {};
3640 int r;
3641
3642 assert(c);
3643 assert(ret);
3644
3645 if (!c->numa_policy.nodes.set) {
3646 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3647 return 0;
3648 }
3649
3650 r = numa_to_cpu_set(&c->numa_policy, &s);
3651 if (r < 0)
3652 return r;
3653
3654 cpu_set_reset(ret);
3655
3656 return cpu_set_add_all(ret, &s);
3657}
3658
3659bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3660 assert(c);
3661
3662 return c->cpu_affinity_from_numa;
3663}
3664
1da37e58
ZJS
3665static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3666 int r;
3667
3668 assert(fds);
3669 assert(n_fds);
3670 assert(*n_fds < fds_size);
3671 assert(ret_fd);
3672
3673 if (fd < 0) {
3674 *ret_fd = -1;
3675 return 0;
3676 }
3677
3678 if (fd < 3 + (int) *n_fds) {
3679 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3680 * the fds we pass to the process (or which are closed only during execve). */
3681
3682 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3683 if (r < 0)
3684 return -errno;
3685
3686 CLOSE_AND_REPLACE(fd, r);
3687 }
3688
3689 *ret_fd = fds[*n_fds] = fd;
3690 (*n_fds) ++;
3691 return 1;
3692}
3693
ff0af2a1 3694static int exec_child(
f2341e0a 3695 Unit *unit,
34cf6c43 3696 const ExecCommand *command,
ff0af2a1
LP
3697 const ExecContext *context,
3698 const ExecParameters *params,
3699 ExecRuntime *runtime,
29206d46 3700 DynamicCreds *dcreds,
ff0af2a1 3701 int socket_fd,
2caa38e9 3702 const int named_iofds[static 3],
4c47affc 3703 int *fds,
da6053d0 3704 size_t n_socket_fds,
25b583d7 3705 size_t n_storage_fds,
ff0af2a1 3706 char **files_env,
00d9ef85 3707 int user_lookup_fd,
12145637 3708 int *exit_status) {
d35fbf6b 3709
7ca69792 3710 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3711 int r, ngids = 0, exec_fd;
4d885bd3
DH
3712 _cleanup_free_ gid_t *supplementary_gids = NULL;
3713 const char *username = NULL, *groupname = NULL;
5686391b 3714 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3715 const char *home = NULL, *shell = NULL;
7ca69792 3716 char **final_argv = NULL;
7bce046b
LP
3717 dev_t journal_stream_dev = 0;
3718 ino_t journal_stream_ino = 0;
5749f855 3719 bool userns_set_up = false;
165a31c0
LP
3720 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3721 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3722 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3723 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3724#if HAVE_SELINUX
7f59dd35 3725 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3726 bool use_selinux = false;
ecfbc84f 3727#endif
f9fa32f0 3728#if ENABLE_SMACK
43b1f709 3729 bool use_smack = false;
ecfbc84f 3730#endif
349cc4a5 3731#if HAVE_APPARMOR
43b1f709 3732 bool use_apparmor = false;
ecfbc84f 3733#endif
5749f855
AZ
3734 uid_t saved_uid = getuid();
3735 gid_t saved_gid = getgid();
fed1e721
LP
3736 uid_t uid = UID_INVALID;
3737 gid_t gid = GID_INVALID;
1da37e58
ZJS
3738 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3739 n_keep_fds; /* total number of fds not to close */
165a31c0 3740 int secure_bits;
afb11bf1
DG
3741 _cleanup_free_ gid_t *gids_after_pam = NULL;
3742 int ngids_after_pam = 0;
034c6ed7 3743
f2341e0a 3744 assert(unit);
5cb5a6ff
LP
3745 assert(command);
3746 assert(context);
d35fbf6b 3747 assert(params);
ff0af2a1 3748 assert(exit_status);
d35fbf6b
DM
3749
3750 rename_process_from_path(command->path);
3751
9c274488
LP
3752 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3753 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3754 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3755 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3756 SIGNALS_IGNORE);
d35fbf6b
DM
3757
3758 if (context->ignore_sigpipe)
9c274488 3759 (void) ignore_signals(SIGPIPE);
d35fbf6b 3760
ff0af2a1
LP
3761 r = reset_signal_mask();
3762 if (r < 0) {
3763 *exit_status = EXIT_SIGNAL_MASK;
12145637 3764 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3765 }
034c6ed7 3766
d35fbf6b
DM
3767 if (params->idle_pipe)
3768 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3769
2c027c62
LP
3770 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3771 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3772 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3773 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3774
d35fbf6b 3775 log_forget_fds();
2c027c62 3776 log_set_open_when_needed(true);
4f2d528d 3777
40a80078
LP
3778 /* In case anything used libc syslog(), close this here, too */
3779 closelog();
3780
b83d5050 3781 int keep_fds[n_fds + 2];
1da37e58
ZJS
3782 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3783 n_keep_fds = n_fds;
3784
3785 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3786 if (r < 0) {
3787 *exit_status = EXIT_FDS;
3788 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3789 }
3790
3791 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3792 if (r < 0) {
3793 *exit_status = EXIT_FDS;
12145637 3794 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3795 }
3796
0af07108
ZJS
3797 if (!context->same_pgrp &&
3798 setsid() < 0) {
3799 *exit_status = EXIT_SETSID;
3800 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3801 }
9e2f7c11 3802
1e22b5cd 3803 exec_context_tty_reset(context, params);
d35fbf6b 3804
c891efaf 3805 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 3806 const char *vc = params->confirm_spawn;
3b20f877
FB
3807 _cleanup_free_ char *cmdline = NULL;
3808
ee39ca20 3809 cmdline = exec_command_line(command->argv);
3b20f877 3810 if (!cmdline) {
0460aa5c 3811 *exit_status = EXIT_MEMORY;
12145637 3812 return log_oom();
3b20f877 3813 }
d35fbf6b 3814
eedf223a 3815 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
3816 if (r != CONFIRM_EXECUTE) {
3817 if (r == CONFIRM_PRETEND_SUCCESS) {
3818 *exit_status = EXIT_SUCCESS;
3819 return 0;
3820 }
ff0af2a1 3821 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
3822 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3823 "Execution cancelled by the user");
d35fbf6b
DM
3824 }
3825 }
1a63a750 3826
d521916d
LP
3827 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3828 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3829 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3830 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3831 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3832 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3833 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3834 *exit_status = EXIT_MEMORY;
3835 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3836 }
3837
29206d46 3838 if (context->dynamic_user && dcreds) {
da50b85a 3839 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 3840
d521916d 3841 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 3842 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
3843 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3844 *exit_status = EXIT_USER;
12145637 3845 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
3846 }
3847
da50b85a
LP
3848 r = compile_suggested_paths(context, params, &suggested_paths);
3849 if (r < 0) {
3850 *exit_status = EXIT_MEMORY;
3851 return log_oom();
3852 }
3853
3854 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
3855 if (r < 0) {
3856 *exit_status = EXIT_USER;
d85ff944
YW
3857 if (r == -EILSEQ)
3858 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3859 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 3860 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 3861 }
524daa8c 3862
70dd455c 3863 if (!uid_is_valid(uid)) {
29206d46 3864 *exit_status = EXIT_USER;
d85ff944 3865 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
3866 }
3867
3868 if (!gid_is_valid(gid)) {
3869 *exit_status = EXIT_USER;
d85ff944 3870 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 3871 }
5bc7452b 3872
29206d46
LP
3873 if (dcreds->user)
3874 username = dcreds->user->name;
3875
3876 } else {
4d885bd3
DH
3877 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3878 if (r < 0) {
3879 *exit_status = EXIT_USER;
12145637 3880 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 3881 }
5bc7452b 3882
4d885bd3
DH
3883 r = get_fixed_group(context, &groupname, &gid);
3884 if (r < 0) {
3885 *exit_status = EXIT_GROUP;
12145637 3886 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 3887 }
cdc5d5c5 3888 }
29206d46 3889
cdc5d5c5
DH
3890 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3891 r = get_supplementary_groups(context, username, groupname, gid,
3892 &supplementary_gids, &ngids);
3893 if (r < 0) {
3894 *exit_status = EXIT_GROUP;
12145637 3895 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 3896 }
5bc7452b 3897
00d9ef85
LP
3898 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3899 if (r < 0) {
3900 *exit_status = EXIT_USER;
12145637 3901 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
3902 }
3903
3904 user_lookup_fd = safe_close(user_lookup_fd);
3905
6732edab
LP
3906 r = acquire_home(context, uid, &home, &home_buffer);
3907 if (r < 0) {
3908 *exit_status = EXIT_CHDIR;
12145637 3909 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
3910 }
3911
d35fbf6b
DM
3912 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3913 * must make sure to drop O_NONBLOCK */
3914 if (socket_fd >= 0)
a34ceba6 3915 (void) fd_nonblock(socket_fd, false);
acbb0225 3916
4c70a4a7
MS
3917 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3918 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3919 if (params->cgroup_path) {
3920 _cleanup_free_ char *p = NULL;
3921
3922 r = exec_parameters_get_cgroup_path(params, &p);
3923 if (r < 0) {
3924 *exit_status = EXIT_CGROUP;
3925 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3926 }
3927
3928 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3929 if (r < 0) {
3930 *exit_status = EXIT_CGROUP;
3931 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3932 }
3933 }
3934
a8d08f39 3935 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 3936 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
3937 if (r < 0) {
3938 *exit_status = EXIT_NETWORK;
3939 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3940 }
3941 }
3942
a70581ff
XR
3943 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
3944 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
3945 if (r < 0) {
3946 *exit_status = EXIT_NAMESPACE;
3947 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
3948 }
3949 }
3950
52c239d7 3951 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
3952 if (r < 0) {
3953 *exit_status = EXIT_STDIN;
12145637 3954 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 3955 }
034c6ed7 3956
52c239d7 3957 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3958 if (r < 0) {
3959 *exit_status = EXIT_STDOUT;
12145637 3960 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
3961 }
3962
52c239d7 3963 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3964 if (r < 0) {
3965 *exit_status = EXIT_STDERR;
12145637 3966 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
3967 }
3968
d35fbf6b 3969 if (context->oom_score_adjust_set) {
9f8168eb
LP
3970 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3971 * prohibit write access to this file, and we shouldn't trip up over that. */
3972 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 3973 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 3974 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 3975 else if (r < 0) {
ff0af2a1 3976 *exit_status = EXIT_OOM_ADJUST;
12145637 3977 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 3978 }
d35fbf6b
DM
3979 }
3980
ad21e542
ZJS
3981 if (context->coredump_filter_set) {
3982 r = set_coredump_filter(context->coredump_filter);
3983 if (ERRNO_IS_PRIVILEGE(r))
3984 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3985 else if (r < 0)
3986 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3987 }
3988
39090201
DJL
3989 if (context->nice_set) {
3990 r = setpriority_closest(context->nice);
3991 if (r < 0)
3992 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3993 }
613b411c 3994
d35fbf6b
DM
3995 if (context->cpu_sched_set) {
3996 struct sched_param param = {
3997 .sched_priority = context->cpu_sched_priority,
3998 };
3999
ff0af2a1
LP
4000 r = sched_setscheduler(0,
4001 context->cpu_sched_policy |
4002 (context->cpu_sched_reset_on_fork ?
4003 SCHED_RESET_ON_FORK : 0),
4004 &param);
4005 if (r < 0) {
4006 *exit_status = EXIT_SETSCHEDULER;
12145637 4007 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4008 }
d35fbf6b 4009 }
fc9b2a84 4010
e2b2fb7f
MS
4011 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4012 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4013 const CPUSet *cpu_set;
4014
4015 if (context->cpu_affinity_from_numa) {
4016 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4017 if (r < 0) {
4018 *exit_status = EXIT_CPUAFFINITY;
4019 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4020 }
4021
4022 cpu_set = &converted_cpu_set;
4023 } else
4024 cpu_set = &context->cpu_set;
4025
4026 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4027 *exit_status = EXIT_CPUAFFINITY;
12145637 4028 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4029 }
e2b2fb7f 4030 }
034c6ed7 4031
b070c7c0
MS
4032 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4033 r = apply_numa_policy(&context->numa_policy);
4034 if (r == -EOPNOTSUPP)
33fe9e3f 4035 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4036 else if (r < 0) {
4037 *exit_status = EXIT_NUMA_POLICY;
4038 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4039 }
4040 }
4041
d35fbf6b
DM
4042 if (context->ioprio_set)
4043 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4044 *exit_status = EXIT_IOPRIO;
12145637 4045 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4046 }
da726a4d 4047
d35fbf6b
DM
4048 if (context->timer_slack_nsec != NSEC_INFINITY)
4049 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4050 *exit_status = EXIT_TIMERSLACK;
12145637 4051 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4052 }
9eba9da4 4053
21022b9d
LP
4054 if (context->personality != PERSONALITY_INVALID) {
4055 r = safe_personality(context->personality);
4056 if (r < 0) {
ff0af2a1 4057 *exit_status = EXIT_PERSONALITY;
12145637 4058 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4059 }
21022b9d 4060 }
94f04347 4061
d35fbf6b 4062 if (context->utmp_id)
df0ff127 4063 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
6a93917d 4064 context->tty_path,
023a4f67
LP
4065 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4066 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4067 USER_PROCESS,
6a93917d 4068 username);
d35fbf6b 4069
08f67696 4070 if (uid_is_valid(uid)) {
ff0af2a1
LP
4071 r = chown_terminal(STDIN_FILENO, uid);
4072 if (r < 0) {
4073 *exit_status = EXIT_STDIN;
12145637 4074 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4075 }
d35fbf6b 4076 }
8e274523 4077
4e1dfa45 4078 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4079 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4080 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4081 * touch a single hierarchy too. */
584b8688 4082 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4083 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4084 if (r < 0) {
4085 *exit_status = EXIT_CGROUP;
12145637 4086 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4087 }
d35fbf6b 4088 }
034c6ed7 4089
5b10116e 4090 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
8679efde 4091 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
12145637
LP
4092 if (r < 0)
4093 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4094 }
94f04347 4095
bb0c0d6f
LP
4096 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4097 r = setup_credentials(context, params, unit->id, uid);
4098 if (r < 0) {
4099 *exit_status = EXIT_CREDENTIALS;
4100 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4101 }
4102 }
4103
7bce046b 4104 r = build_environment(
fd63e712 4105 unit,
7bce046b
LP
4106 context,
4107 params,
4108 n_fds,
4109 home,
4110 username,
4111 shell,
4112 journal_stream_dev,
4113 journal_stream_ino,
4114 &our_env);
2065ca69
JW
4115 if (r < 0) {
4116 *exit_status = EXIT_MEMORY;
12145637 4117 return log_oom();
2065ca69
JW
4118 }
4119
4120 r = build_pass_environment(context, &pass_env);
4121 if (r < 0) {
4122 *exit_status = EXIT_MEMORY;
12145637 4123 return log_oom();
2065ca69
JW
4124 }
4125
4126 accum_env = strv_env_merge(5,
4127 params->environment,
4128 our_env,
4129 pass_env,
4130 context->environment,
44e5d006 4131 files_env);
2065ca69
JW
4132 if (!accum_env) {
4133 *exit_status = EXIT_MEMORY;
12145637 4134 return log_oom();
2065ca69 4135 }
1280503b 4136 accum_env = strv_env_clean(accum_env);
2065ca69 4137
096424d1 4138 (void) umask(context->umask);
b213e1c1 4139
b1edf445 4140 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4141 if (r < 0) {
4142 *exit_status = EXIT_KEYRING;
12145637 4143 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4144 }
4145
165a31c0 4146 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4147 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4148
165a31c0
LP
4149 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4150 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4151
165a31c0
LP
4152 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4153 if (needs_ambient_hack)
4154 needs_setuid = false;
4155 else
4156 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4157
4158 if (needs_sandboxing) {
7f18ef0a
FK
4159 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4160 * present. The actual MAC context application will happen later, as late as possible, to avoid
4161 * impacting our own code paths. */
4162
349cc4a5 4163#if HAVE_SELINUX
43b1f709 4164 use_selinux = mac_selinux_use();
7f18ef0a 4165#endif
f9fa32f0 4166#if ENABLE_SMACK
43b1f709 4167 use_smack = mac_smack_use();
7f18ef0a 4168#endif
349cc4a5 4169#if HAVE_APPARMOR
43b1f709 4170 use_apparmor = mac_apparmor_use();
7f18ef0a 4171#endif
165a31c0 4172 }
7f18ef0a 4173
ce932d2d
LP
4174 if (needs_sandboxing) {
4175 int which_failed;
4176
4177 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4178 * is set here. (See below.) */
4179
4180 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4181 if (r < 0) {
4182 *exit_status = EXIT_LIMITS;
4183 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4184 }
4185 }
4186
0af07108 4187 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4188 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4189 * wins here. (See above.) */
4190
1da37e58 4191 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4192 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4193 if (r < 0) {
4194 *exit_status = EXIT_PAM;
4195 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4196 }
ac45f971 4197
0af07108
ZJS
4198 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4199 if (ngids_after_pam < 0) {
4200 *exit_status = EXIT_MEMORY;
4201 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4202 }
b213e1c1 4203 }
5749f855 4204
0af07108 4205 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4206 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4207 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4208 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4209
4210 userns_set_up = true;
4211 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4212 if (r < 0) {
4213 *exit_status = EXIT_USER;
4214 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4215 }
4216 }
4217
a8d08f39
LP
4218 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4219
6e2d7c4f 4220 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4221 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4222 if (r == -EPERM)
4223 log_unit_warning_errno(unit, r,
4224 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4225 else if (r < 0) {
6e2d7c4f
MS
4226 *exit_status = EXIT_NETWORK;
4227 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4228 }
a8d08f39
LP
4229 } else if (context->network_namespace_path) {
4230 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4231 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4232 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4233 } else
4234 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4235 }
169c1bda 4236
a70581ff
XR
4237 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4238
4239 if (ns_type_supported(NAMESPACE_IPC)) {
4240 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4241 if (r == -EPERM)
4242 log_unit_warning_errno(unit, r,
4243 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4244 else if (r < 0) {
4245 *exit_status = EXIT_NAMESPACE;
4246 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4247 }
4248 } else if (context->ipc_namespace_path) {
4249 *exit_status = EXIT_NAMESPACE;
4250 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4251 "IPCNamespacePath= is not supported, refusing.");
4252 } else
4253 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4254 }
4255
ee818b89 4256 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
ee818b89 4257 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4258 _cleanup_free_ char *error_path = NULL;
4259
9f71ba8d 4260 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4261 if (r < 0) {
4262 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4263 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4264 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4265 }
d35fbf6b 4266 }
81a2b7ce 4267
daf8f72b
LP
4268 if (needs_sandboxing) {
4269 r = apply_protect_hostname(unit, context, exit_status);
4270 if (r < 0)
4271 return r;
aecd5ac6
TM
4272 }
4273
5749f855
AZ
4274 /* Drop groups as early as possible.
4275 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4276 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4277 if (needs_setuid) {
afb11bf1
DG
4278 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4279 int ngids_to_enforce = 0;
4280
4281 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4282 ngids,
4283 gids_after_pam,
4284 ngids_after_pam,
4285 &gids_to_enforce);
4286 if (ngids_to_enforce < 0) {
4287 *exit_status = EXIT_MEMORY;
4288 return log_unit_error_errno(unit,
4289 ngids_to_enforce,
4290 "Failed to merge group lists. Group membership might be incorrect: %m");
4291 }
4292
4293 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4294 if (r < 0) {
4295 *exit_status = EXIT_GROUP;
12145637 4296 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4297 }
165a31c0 4298 }
096424d1 4299
5749f855
AZ
4300 /* If the user namespace was not set up above, try to do it now.
4301 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4302 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4303 * case of mount namespaces being less privileged when the mount point list is copied from a
4304 * different user namespace). */
9008e1ac 4305
5749f855
AZ
4306 if (needs_sandboxing && context->private_users && !userns_set_up) {
4307 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4308 if (r < 0) {
4309 *exit_status = EXIT_USER;
4310 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4311 }
4312 }
4313
9f71ba8d
ZJS
4314 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4315 * shall execute. */
4316
4317 _cleanup_free_ char *executable = NULL;
b83d5050
ZJS
4318 _cleanup_close_ int executable_fd = -1;
4319 r = find_executable_full(command->path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4320 if (r < 0) {
4321 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4322 log_unit_struct_errno(unit, LOG_INFO, r,
4323 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4324 LOG_UNIT_INVOCATION_ID(unit),
4325 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4326 command->path),
4327 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4328 return 0;
4329 }
4330
4331 *exit_status = EXIT_EXEC;
c2503e35
RH
4332
4333 return log_unit_struct_errno(unit, LOG_INFO, r,
4334 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4335 LOG_UNIT_INVOCATION_ID(unit),
4336 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4337 command->path),
4338 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4339 }
4340
b83d5050
ZJS
4341 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4342 if (r < 0) {
4343 *exit_status = EXIT_FDS;
4344 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4345 }
4346
9f71ba8d 4347#if HAVE_SELINUX
49590d67
MS
4348 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4349 int fd = -1;
4350
4351 if (socket_fd >= 0)
4352 fd = socket_fd;
4353 else if (params->n_socket_fds == 1)
4354 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4355 * use context from that fd to compute the label. */
4356 fd = params->fds[0];
4357
4358 if (fd >= 0) {
4359 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4360 if (r < 0) {
4361 *exit_status = EXIT_SELINUX_CONTEXT;
4362 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4363 }
9f71ba8d
ZJS
4364 }
4365 }
4366#endif
4367
165a31c0 4368 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4369 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4370 * however if we have it as we want to keep it open until the final execve(). */
4371
1da37e58 4372 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4373 if (r >= 0)
4374 r = shift_fds(fds, n_fds);
4375 if (r >= 0)
25b583d7 4376 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4377 if (r < 0) {
4378 *exit_status = EXIT_FDS;
12145637 4379 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4380 }
e66cf1a3 4381
5686391b
LP
4382 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4383 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4384 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4385 * came this far. */
4386
165a31c0 4387 secure_bits = context->secure_bits;
e66cf1a3 4388
165a31c0
LP
4389 if (needs_sandboxing) {
4390 uint64_t bset;
e66cf1a3 4391
ce932d2d
LP
4392 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4393 * requested. (Note this is placed after the general resource limit initialization, see
4394 * above, in order to take precedence.) */
f4170c67
LP
4395 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4396 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4397 *exit_status = EXIT_LIMITS;
12145637 4398 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4399 }
4400 }
4401
37ac2744
JB
4402#if ENABLE_SMACK
4403 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4404 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4405 if (use_smack) {
b83d5050 4406 r = setup_smack(context, executable_fd);
37ac2744
JB
4407 if (r < 0) {
4408 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4409 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4410 }
4411 }
4412#endif
4413
165a31c0
LP
4414 bset = context->capability_bounding_set;
4415 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4416 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4417 * instead of us doing that */
4418 if (needs_ambient_hack)
4419 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4420 (UINT64_C(1) << CAP_SETUID) |
4421 (UINT64_C(1) << CAP_SETGID);
4422
4423 if (!cap_test_all(bset)) {
4424 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4425 if (r < 0) {
4426 *exit_status = EXIT_CAPABILITIES;
12145637 4427 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4428 }
4c2630eb 4429 }
3b8bddde 4430
16fcb191
TK
4431 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4432 * keep-caps set.
4433 * To be able to raise the ambient capabilities after setresuid() they have to be
4434 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4435 * After setresuid() the ambient capabilities can be raised as they are present in
4436 * the permitted and inheritable set. However it is possible that someone wants to
4437 * set ambient capabilities without changing the user, so we also set the ambient
4438 * capabilities here.
4439 * The requested ambient capabilities are raised in the inheritable set if the
4440 * second argument is true. */
943800f4 4441 if (!needs_ambient_hack) {
755d4b67
IP
4442 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4443 if (r < 0) {
4444 *exit_status = EXIT_CAPABILITIES;
12145637 4445 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4446 }
755d4b67 4447 }
165a31c0 4448 }
755d4b67 4449
fa97f630
JB
4450 /* chroot to root directory first, before we lose the ability to chroot */
4451 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4452 if (r < 0)
4453 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4454
165a31c0 4455 if (needs_setuid) {
08f67696 4456 if (uid_is_valid(uid)) {
ff0af2a1
LP
4457 r = enforce_user(context, uid);
4458 if (r < 0) {
4459 *exit_status = EXIT_USER;
12145637 4460 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4461 }
165a31c0
LP
4462
4463 if (!needs_ambient_hack &&
4464 context->capability_ambient_set != 0) {
755d4b67 4465
16fcb191 4466 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4467 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4468 if (r < 0) {
4469 *exit_status = EXIT_CAPABILITIES;
12145637 4470 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4471 }
755d4b67 4472 }
5b6319dc 4473 }
165a31c0 4474 }
d35fbf6b 4475
56ef8db9
JB
4476 /* Apply working directory here, because the working directory might be on NFS and only the user running
4477 * this service might have the correct privilege to change to the working directory */
fa97f630 4478 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4479 if (r < 0)
4480 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4481
165a31c0 4482 if (needs_sandboxing) {
37ac2744 4483 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4484 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4485 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4486 * are restricted. */
4487
349cc4a5 4488#if HAVE_SELINUX
43b1f709 4489 if (use_selinux) {
5cd9cd35
LP
4490 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4491
4492 if (exec_context) {
4493 r = setexeccon(exec_context);
4494 if (r < 0) {
4495 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 4496 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
4497 }
4498 }
4499 }
4500#endif
4501
349cc4a5 4502#if HAVE_APPARMOR
43b1f709 4503 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4504 r = aa_change_onexec(context->apparmor_profile);
4505 if (r < 0 && !context->apparmor_profile_ignore) {
4506 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4507 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4508 }
4509 }
4510#endif
4511
165a31c0 4512 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4513 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4514 * CAP_SETPCAP. */
4515 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4516 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4517 * effective set here.
4518 * The effective set is overwritten during execve with the following values:
4519 * - ambient set (for non-root processes)
4520 * - (inheritable | bounding) set (for root processes)
4521 *
4522 * Hence there is no security impact to raise it in the effective set before execve
4523 */
4524 r = capability_gain_cap_setpcap(NULL);
4525 if (r < 0) {
4526 *exit_status = EXIT_CAPABILITIES;
4527 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4528 }
755d4b67 4529 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4530 *exit_status = EXIT_SECUREBITS;
12145637 4531 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4532 }
dbdc4098 4533 }
5b6319dc 4534
59eeb84b 4535 if (context_has_no_new_privileges(context))
d35fbf6b 4536 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4537 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4538 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4539 }
4540
349cc4a5 4541#if HAVE_SECCOMP
469830d1
LP
4542 r = apply_address_families(unit, context);
4543 if (r < 0) {
4544 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4545 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4546 }
04aa0cb9 4547
469830d1
LP
4548 r = apply_memory_deny_write_execute(unit, context);
4549 if (r < 0) {
4550 *exit_status = EXIT_SECCOMP;
12145637 4551 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4552 }
f4170c67 4553
469830d1
LP
4554 r = apply_restrict_realtime(unit, context);
4555 if (r < 0) {
4556 *exit_status = EXIT_SECCOMP;
12145637 4557 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4558 }
4559
f69567cb
LP
4560 r = apply_restrict_suid_sgid(unit, context);
4561 if (r < 0) {
4562 *exit_status = EXIT_SECCOMP;
4563 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4564 }
4565
add00535
LP
4566 r = apply_restrict_namespaces(unit, context);
4567 if (r < 0) {
4568 *exit_status = EXIT_SECCOMP;
12145637 4569 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4570 }
4571
469830d1
LP
4572 r = apply_protect_sysctl(unit, context);
4573 if (r < 0) {
4574 *exit_status = EXIT_SECCOMP;
12145637 4575 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4576 }
4577
469830d1
LP
4578 r = apply_protect_kernel_modules(unit, context);
4579 if (r < 0) {
4580 *exit_status = EXIT_SECCOMP;
12145637 4581 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4582 }
4583
84703040
KK
4584 r = apply_protect_kernel_logs(unit, context);
4585 if (r < 0) {
4586 *exit_status = EXIT_SECCOMP;
4587 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4588 }
4589
fc64760d
KK
4590 r = apply_protect_clock(unit, context);
4591 if (r < 0) {
4592 *exit_status = EXIT_SECCOMP;
4593 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4594 }
4595
469830d1
LP
4596 r = apply_private_devices(unit, context);
4597 if (r < 0) {
4598 *exit_status = EXIT_SECCOMP;
12145637 4599 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4600 }
4601
4602 r = apply_syscall_archs(unit, context);
4603 if (r < 0) {
4604 *exit_status = EXIT_SECCOMP;
12145637 4605 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4606 }
4607
78e864e5
TM
4608 r = apply_lock_personality(unit, context);
4609 if (r < 0) {
4610 *exit_status = EXIT_SECCOMP;
12145637 4611 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4612 }
4613
9df2cdd8
TM
4614 r = apply_syscall_log(unit, context);
4615 if (r < 0) {
4616 *exit_status = EXIT_SECCOMP;
4617 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4618 }
4619
5cd9cd35
LP
4620 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4621 * by the filter as little as possible. */
165a31c0 4622 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4623 if (r < 0) {
4624 *exit_status = EXIT_SECCOMP;
12145637 4625 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4626 }
4627#endif
d35fbf6b 4628 }
034c6ed7 4629
00819cc1
LP
4630 if (!strv_isempty(context->unset_environment)) {
4631 char **ee = NULL;
4632
4633 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4634 if (!ee) {
4635 *exit_status = EXIT_MEMORY;
12145637 4636 return log_oom();
00819cc1
LP
4637 }
4638
130d3d22 4639 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4640 }
4641
7ca69792
AZ
4642 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4643 replaced_argv = replace_env_argv(command->argv, accum_env);
4644 if (!replaced_argv) {
4645 *exit_status = EXIT_MEMORY;
4646 return log_oom();
4647 }
4648 final_argv = replaced_argv;
4649 } else
4650 final_argv = command->argv;
034c6ed7 4651
f1d34068 4652 if (DEBUG_LOGGING) {
c2b2df60 4653 _cleanup_free_ char *line = NULL;
81a2b7ce 4654
d35fbf6b 4655 line = exec_command_line(final_argv);
a1230ff9 4656 if (line)
c2503e35
RH
4657 log_unit_struct(unit, LOG_DEBUG,
4658 "EXECUTABLE=%s", executable,
4659 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4660 LOG_UNIT_INVOCATION_ID(unit));
d35fbf6b 4661 }
dd305ec9 4662
5686391b
LP
4663 if (exec_fd >= 0) {
4664 uint8_t hot = 1;
4665
4666 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4667 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4668
4669 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4670 *exit_status = EXIT_EXEC;
4671 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4672 }
4673 }
4674
a6d9111c 4675 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4676
4677 if (exec_fd >= 0) {
4678 uint8_t hot = 0;
4679
4680 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4681 * that POLLHUP on it no longer means execve() succeeded. */
4682
4683 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4684 *exit_status = EXIT_EXEC;
4685 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4686 }
4687 }
12145637 4688
ff0af2a1 4689 *exit_status = EXIT_EXEC;
9f71ba8d 4690 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4691}
81a2b7ce 4692
34cf6c43 4693static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4694static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4695
f2341e0a
LP
4696int exec_spawn(Unit *unit,
4697 ExecCommand *command,
d35fbf6b
DM
4698 const ExecContext *context,
4699 const ExecParameters *params,
4700 ExecRuntime *runtime,
29206d46 4701 DynamicCreds *dcreds,
d35fbf6b 4702 pid_t *ret) {
8351ceae 4703
ee39ca20 4704 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4705 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4706 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4707 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4708 _cleanup_free_ char *line = NULL;
d35fbf6b 4709 pid_t pid;
8351ceae 4710
f2341e0a 4711 assert(unit);
d35fbf6b
DM
4712 assert(command);
4713 assert(context);
4714 assert(ret);
4715 assert(params);
25b583d7 4716 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4717
d35fbf6b
DM
4718 if (context->std_input == EXEC_INPUT_SOCKET ||
4719 context->std_output == EXEC_OUTPUT_SOCKET ||
4720 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4721
d85ff944
YW
4722 if (params->n_socket_fds > 1)
4723 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4724
d85ff944
YW
4725 if (params->n_socket_fds == 0)
4726 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4727
d35fbf6b
DM
4728 socket_fd = params->fds[0];
4729 } else {
4730 socket_fd = -1;
4731 fds = params->fds;
9b141911 4732 n_socket_fds = params->n_socket_fds;
25b583d7 4733 n_storage_fds = params->n_storage_fds;
d35fbf6b 4734 }
94f04347 4735
34cf6c43 4736 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4737 if (r < 0)
4738 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4739
f2341e0a 4740 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4741 if (r < 0)
f2341e0a 4742 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4743
ee39ca20 4744 line = exec_command_line(command->argv);
d35fbf6b
DM
4745 if (!line)
4746 return log_oom();
fab56fc5 4747
9f71ba8d
ZJS
4748 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4749 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4750 mac_selinux_maybe_reload();
4751
c2503e35
RH
4752 log_unit_struct(unit, LOG_DEBUG,
4753 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4754 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4755 the mount namespace in the child, but we want to log
4756 from the parent, so we need to use the (possibly
4757 inaccurate) path here. */
4758 LOG_UNIT_INVOCATION_ID(unit));
12145637 4759
78f93209
LP
4760 if (params->cgroup_path) {
4761 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4762 if (r < 0)
4763 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4764 if (r > 0) { /* We are using a child cgroup */
4765 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4766 if (r < 0)
4767 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
4768
4769 /* Normally we would not propagate the oomd xattrs to children but since we created this
4770 * sub-cgroup internally we should do it. */
4771 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
4772 }
4773 }
4774
d35fbf6b
DM
4775 pid = fork();
4776 if (pid < 0)
74129a12 4777 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
4778
4779 if (pid == 0) {
12145637 4780 int exit_status = EXIT_SUCCESS;
ff0af2a1 4781
f2341e0a
LP
4782 r = exec_child(unit,
4783 command,
ff0af2a1
LP
4784 context,
4785 params,
4786 runtime,
29206d46 4787 dcreds,
ff0af2a1 4788 socket_fd,
52c239d7 4789 named_iofds,
4c47affc 4790 fds,
9b141911 4791 n_socket_fds,
25b583d7 4792 n_storage_fds,
ff0af2a1 4793 files_env,
00d9ef85 4794 unit->manager->user_lookup_fds[1],
12145637
LP
4795 &exit_status);
4796
e1714f02
ZJS
4797 if (r < 0) {
4798 const char *status =
4799 exit_status_to_string(exit_status,
e04ed6db 4800 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 4801
c2503e35
RH
4802 log_unit_struct_errno(unit, LOG_ERR, r,
4803 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4804 LOG_UNIT_INVOCATION_ID(unit),
4805 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4806 status, command->path),
4807 "EXECUTABLE=%s", command->path);
e1714f02 4808 }
4c2630eb 4809
ff0af2a1 4810 _exit(exit_status);
034c6ed7
LP
4811 }
4812
f2341e0a 4813 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 4814
78f93209
LP
4815 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4816 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4817 * process will be killed too). */
4818 if (subcgroup_path)
4819 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 4820
b58b4116 4821 exec_status_start(&command->exec_status, pid);
9fb86720 4822
034c6ed7 4823 *ret = pid;
5cb5a6ff
LP
4824 return 0;
4825}
4826
034c6ed7
LP
4827void exec_context_init(ExecContext *c) {
4828 assert(c);
4829
4c12626c 4830 c->umask = 0022;
9eba9da4 4831 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
94f04347 4832 c->cpu_sched_policy = SCHED_OTHER;
071830ff 4833 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 4834 c->syslog_level_prefix = true;
353e12c2 4835 c->ignore_sigpipe = true;
3a43da28 4836 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 4837 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
4838 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4839 c->directories[t].mode = 0755;
12213aed 4840 c->timeout_clean_usec = USEC_INFINITY;
a103496c 4841 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
4842 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4843 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 4844 c->log_level_max = -1;
005bfaf1
TM
4845#if HAVE_SECCOMP
4846 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4847#endif
b070c7c0 4848 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
4849}
4850
613b411c 4851void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
4852 assert(c);
4853
6796073e
LP
4854 c->environment = strv_free(c->environment);
4855 c->environment_files = strv_free(c->environment_files);
b4c14404 4856 c->pass_environment = strv_free(c->pass_environment);
00819cc1 4857 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 4858
31ce987c 4859 rlimit_free_all(c->rlimit);
034c6ed7 4860
5b10116e 4861 for (size_t l = 0; l < 3; l++) {
52c239d7 4862 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
4863 c->stdio_file[l] = mfree(c->stdio_file[l]);
4864 }
52c239d7 4865
a1e58e8e
LP
4866 c->working_directory = mfree(c->working_directory);
4867 c->root_directory = mfree(c->root_directory);
915e6d16 4868 c->root_image = mfree(c->root_image);
18d73705 4869 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
4870 c->root_hash = mfree(c->root_hash);
4871 c->root_hash_size = 0;
4872 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
4873 c->root_hash_sig = mfree(c->root_hash_sig);
4874 c->root_hash_sig_size = 0;
4875 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 4876 c->root_verity = mfree(c->root_verity);
93f59701 4877 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
4878 c->tty_path = mfree(c->tty_path);
4879 c->syslog_identifier = mfree(c->syslog_identifier);
4880 c->user = mfree(c->user);
4881 c->group = mfree(c->group);
034c6ed7 4882
6796073e 4883 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 4884
a1e58e8e 4885 c->pam_name = mfree(c->pam_name);
5b6319dc 4886
2a624c36
AP
4887 c->read_only_paths = strv_free(c->read_only_paths);
4888 c->read_write_paths = strv_free(c->read_write_paths);
4889 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
4890 c->exec_paths = strv_free(c->exec_paths);
4891 c->no_exec_paths = strv_free(c->no_exec_paths);
82c121a4 4892
d2d6c096 4893 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
4894 c->bind_mounts = NULL;
4895 c->n_bind_mounts = 0;
2abd4e38
YW
4896 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4897 c->temporary_filesystems = NULL;
4898 c->n_temporary_filesystems = 0;
b3d13314 4899 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 4900
0985c7c4 4901 cpu_set_reset(&c->cpu_set);
b070c7c0 4902 numa_policy_reset(&c->numa_policy);
86a3475b 4903
a1e58e8e
LP
4904 c->utmp_id = mfree(c->utmp_id);
4905 c->selinux_context = mfree(c->selinux_context);
4906 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 4907 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 4908
8cfa775f 4909 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
4910 c->syscall_archs = set_free(c->syscall_archs);
4911 c->address_families = set_free(c->address_families);
e66cf1a3 4912
5b10116e
ZJS
4913 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4914 c->directories[t].paths = strv_free(c->directories[t].paths);
d3070fbd
LP
4915
4916 c->log_level_max = -1;
4917
4918 exec_context_free_log_extra_fields(c);
08f3be7a 4919
5ac1530e
ZJS
4920 c->log_ratelimit_interval_usec = 0;
4921 c->log_ratelimit_burst = 0;
90fc172e 4922
08f3be7a
LP
4923 c->stdin_data = mfree(c->stdin_data);
4924 c->stdin_data_size = 0;
a8d08f39
LP
4925
4926 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 4927 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
4928
4929 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f
LP
4930
4931 c->load_credentials = strv_free(c->load_credentials);
4932 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
4933}
4934
34cf6c43 4935int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
4936 char **i;
4937
4938 assert(c);
4939
4940 if (!runtime_prefix)
4941 return 0;
4942
3536f49e 4943 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
c2b2df60 4944 _cleanup_free_ char *p = NULL;
e66cf1a3 4945
494d0247
YW
4946 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4947 p = path_join(runtime_prefix, "private", *i);
4948 else
4949 p = path_join(runtime_prefix, *i);
e66cf1a3
LP
4950 if (!p)
4951 return -ENOMEM;
4952
7bc4bf4a
LP
4953 /* We execute this synchronously, since we need to be sure this is gone when we start the
4954 * service next. */
c6878637 4955 (void) rm_rf(p, REMOVE_ROOT);
e66cf1a3
LP
4956 }
4957
4958 return 0;
5cb5a6ff
LP
4959}
4960
bb0c0d6f
LP
4961int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4962 _cleanup_free_ char *p = NULL;
4963
4964 assert(c);
4965
4966 if (!runtime_prefix || !unit)
4967 return 0;
4968
4969 p = path_join(runtime_prefix, "credentials", unit);
4970 if (!p)
4971 return -ENOMEM;
4972
4973 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4974 * unmount it, and afterwards remove the mount point */
4975 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4976 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4977
4978 return 0;
4979}
4980
34cf6c43 4981static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
4982 assert(c);
4983
a1e58e8e 4984 c->path = mfree(c->path);
6796073e 4985 c->argv = strv_free(c->argv);
43d0fcbd
LP
4986}
4987
da6053d0 4988void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 4989 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
4990 exec_command_done(c+i);
4991}
4992
f1acf85a 4993ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
4994 ExecCommand *i;
4995
4996 while ((i = c)) {
71fda00f 4997 LIST_REMOVE(command, c, i);
43d0fcbd 4998 exec_command_done(i);
5cb5a6ff
LP
4999 free(i);
5000 }
f1acf85a
ZJS
5001
5002 return NULL;
5cb5a6ff
LP
5003}
5004
da6053d0 5005void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5006 for (size_t i = 0; i < n; i++)
f1acf85a 5007 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5008}
5009
6a1d4d9f 5010void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5011 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5012 exec_status_reset(&c[i].exec_status);
5013}
5014
5015void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5016 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5017 ExecCommand *z;
5018
5019 LIST_FOREACH(command, z, c[i])
5020 exec_status_reset(&z->exec_status);
5021 }
5022}
5023
039f0e70 5024typedef struct InvalidEnvInfo {
34cf6c43 5025 const Unit *unit;
039f0e70
LP
5026 const char *path;
5027} InvalidEnvInfo;
5028
5029static void invalid_env(const char *p, void *userdata) {
5030 InvalidEnvInfo *info = userdata;
5031
f2341e0a 5032 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5033}
5034
52c239d7
LB
5035const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5036 assert(c);
5037
5038 switch (fd_index) {
5073ff6b 5039
52c239d7
LB
5040 case STDIN_FILENO:
5041 if (c->std_input != EXEC_INPUT_NAMED_FD)
5042 return NULL;
5073ff6b 5043
52c239d7 5044 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5045
52c239d7
LB
5046 case STDOUT_FILENO:
5047 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5048 return NULL;
5073ff6b 5049
52c239d7 5050 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5051
52c239d7
LB
5052 case STDERR_FILENO:
5053 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5054 return NULL;
5073ff6b 5055
52c239d7 5056 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5057
52c239d7
LB
5058 default:
5059 return NULL;
5060 }
5061}
5062
2caa38e9
LP
5063static int exec_context_named_iofds(
5064 const ExecContext *c,
5065 const ExecParameters *p,
5066 int named_iofds[static 3]) {
5067
5b10116e 5068 size_t targets;
56fbd561 5069 const char* stdio_fdname[3];
da6053d0 5070 size_t n_fds;
52c239d7
LB
5071
5072 assert(c);
5073 assert(p);
2caa38e9 5074 assert(named_iofds);
52c239d7
LB
5075
5076 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5077 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5078 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5079
5b10116e 5080 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5081 stdio_fdname[i] = exec_context_fdname(c, i);
5082
4c47affc
FB
5083 n_fds = p->n_storage_fds + p->n_socket_fds;
5084
5b10116e 5085 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5086 if (named_iofds[STDIN_FILENO] < 0 &&
5087 c->std_input == EXEC_INPUT_NAMED_FD &&
5088 stdio_fdname[STDIN_FILENO] &&
5089 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5090
52c239d7
LB
5091 named_iofds[STDIN_FILENO] = p->fds[i];
5092 targets--;
56fbd561
ZJS
5093
5094 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5095 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5096 stdio_fdname[STDOUT_FILENO] &&
5097 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5098
52c239d7
LB
5099 named_iofds[STDOUT_FILENO] = p->fds[i];
5100 targets--;
56fbd561
ZJS
5101
5102 } else if (named_iofds[STDERR_FILENO] < 0 &&
5103 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5104 stdio_fdname[STDERR_FILENO] &&
5105 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5106
52c239d7
LB
5107 named_iofds[STDERR_FILENO] = p->fds[i];
5108 targets--;
5109 }
5110
56fbd561 5111 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5112}
5113
34cf6c43 5114static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5115 char **i, **r = NULL;
5116
5117 assert(c);
5118 assert(l);
5119
5120 STRV_FOREACH(i, c->environment_files) {
5121 char *fn;
52511fae 5122 int k;
8c7be95e
LP
5123 bool ignore = false;
5124 char **p;
7fd1b19b 5125 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5126
5127 fn = *i;
5128
5129 if (fn[0] == '-') {
5130 ignore = true;
313cefa1 5131 fn++;
8c7be95e
LP
5132 }
5133
5134 if (!path_is_absolute(fn)) {
8c7be95e
LP
5135 if (ignore)
5136 continue;
5137
5138 strv_free(r);
5139 return -EINVAL;
5140 }
5141
2bef10ab 5142 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5143 k = safe_glob(fn, 0, &pglob);
5144 if (k < 0) {
2bef10ab
PL
5145 if (ignore)
5146 continue;
8c7be95e 5147
2bef10ab 5148 strv_free(r);
d8c92e8b 5149 return k;
2bef10ab 5150 }
8c7be95e 5151
d8c92e8b
ZJS
5152 /* When we don't match anything, -ENOENT should be returned */
5153 assert(pglob.gl_pathc > 0);
5154
5b10116e 5155 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5156 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5157 if (k < 0) {
5158 if (ignore)
5159 continue;
8c7be95e 5160
2bef10ab 5161 strv_free(r);
2bef10ab 5162 return k;
e9c1ea9d 5163 }
ebc05a09 5164 /* Log invalid environment variables with filename */
039f0e70
LP
5165 if (p) {
5166 InvalidEnvInfo info = {
f2341e0a 5167 .unit = unit,
039f0e70
LP
5168 .path = pglob.gl_pathv[n]
5169 };
5170
5171 p = strv_env_clean_with_callback(p, invalid_env, &info);
5172 }
8c7be95e 5173
234519ae 5174 if (!r)
2bef10ab
PL
5175 r = p;
5176 else {
5177 char **m;
8c7be95e 5178
2bef10ab
PL
5179 m = strv_env_merge(2, r, p);
5180 strv_free(r);
5181 strv_free(p);
c84a9488 5182 if (!m)
2bef10ab 5183 return -ENOMEM;
2bef10ab
PL
5184
5185 r = m;
5186 }
8c7be95e
LP
5187 }
5188 }
5189
5190 *l = r;
5191
5192 return 0;
5193}
5194
6ac8fdc9 5195static bool tty_may_match_dev_console(const char *tty) {
7b912648 5196 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5197
1e22b5cd
LP
5198 if (!tty)
5199 return true;
5200
a119ec7c 5201 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5202
5203 /* trivial identity? */
5204 if (streq(tty, "console"))
5205 return true;
5206
7b912648
LP
5207 if (resolve_dev_console(&resolved) < 0)
5208 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5209
5210 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5211 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5212}
5213
6c0ae739
LP
5214static bool exec_context_may_touch_tty(const ExecContext *ec) {
5215 assert(ec);
1e22b5cd 5216
6c0ae739 5217 return ec->tty_reset ||
1e22b5cd
LP
5218 ec->tty_vhangup ||
5219 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5220 is_terminal_input(ec->std_input) ||
5221 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5222 is_terminal_output(ec->std_error);
5223}
5224
5225bool exec_context_may_touch_console(const ExecContext *ec) {
5226
5227 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5228 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5229}
5230
15ae422b
LP
5231static void strv_fprintf(FILE *f, char **l) {
5232 char **g;
5233
5234 assert(f);
5235
5236 STRV_FOREACH(g, l)
5237 fprintf(f, " %s", *g);
5238}
5239
ddc155b2
TM
/* Writes a "<prefix><name>: item item ..." line for the string list to 'f'. Nothing is written if
 * the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5251
34cf6c43 5252void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
12213aed 5253 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
add00535 5254 int r;
9eba9da4 5255
5cb5a6ff
LP
5256 assert(c);
5257 assert(f);
5258
4ad49000 5259 prefix = strempty(prefix);
5cb5a6ff
LP
5260
5261 fprintf(f,
94f04347
LP
5262 "%sUMask: %04o\n"
5263 "%sWorkingDirectory: %s\n"
451a074f 5264 "%sRootDirectory: %s\n"
15ae422b 5265 "%sNonBlocking: %s\n"
64747e2d 5266 "%sPrivateTmp: %s\n"
7f112f50 5267 "%sPrivateDevices: %s\n"
59eeb84b 5268 "%sProtectKernelTunables: %s\n"
e66a2f65 5269 "%sProtectKernelModules: %s\n"
84703040 5270 "%sProtectKernelLogs: %s\n"
fc64760d 5271 "%sProtectClock: %s\n"
59eeb84b 5272 "%sProtectControlGroups: %s\n"
d251207d
LP
5273 "%sPrivateNetwork: %s\n"
5274 "%sPrivateUsers: %s\n"
1b8689f9
LP
5275 "%sProtectHome: %s\n"
5276 "%sProtectSystem: %s\n"
5d997827 5277 "%sMountAPIVFS: %s\n"
f3e43635 5278 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5279 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5280 "%sRestrictRealtime: %s\n"
f69567cb 5281 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5282 "%sKeyringMode: %s\n"
4e399953
LP
5283 "%sProtectHostname: %s\n"
5284 "%sProtectProc: %s\n"
5285 "%sProcSubset: %s\n",
5cb5a6ff 5286 prefix, c->umask,
14eb3285
LP
5287 prefix, empty_to_root(c->working_directory),
5288 prefix, empty_to_root(c->root_directory),
15ae422b 5289 prefix, yes_no(c->non_blocking),
64747e2d 5290 prefix, yes_no(c->private_tmp),
7f112f50 5291 prefix, yes_no(c->private_devices),
59eeb84b 5292 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5293 prefix, yes_no(c->protect_kernel_modules),
84703040 5294 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5295 prefix, yes_no(c->protect_clock),
59eeb84b 5296 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5297 prefix, yes_no(c->private_network),
5298 prefix, yes_no(c->private_users),
1b8689f9
LP
5299 prefix, protect_home_to_string(c->protect_home),
5300 prefix, protect_system_to_string(c->protect_system),
5e98086d 5301 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5302 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5303 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5304 prefix, yes_no(c->restrict_realtime),
f69567cb 5305 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5306 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5307 prefix, yes_no(c->protect_hostname),
5308 prefix, protect_proc_to_string(c->protect_proc),
5309 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5310
915e6d16
LP
5311 if (c->root_image)
5312 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5313
18d73705
LB
5314 if (c->root_image_options) {
5315 MountOptions *o;
5316
5317 fprintf(f, "%sRootImageOptions:", prefix);
5318 LIST_FOREACH(mount_options, o, c->root_image_options)
5319 if (!isempty(o->options))
9ece6444
LB
5320 fprintf(f, " %s:%s",
5321 partition_designator_to_string(o->partition_designator),
5322 o->options);
18d73705
LB
5323 fprintf(f, "\n");
5324 }
5325
0389f4fa
LB
5326 if (c->root_hash) {
5327 _cleanup_free_ char *encoded = NULL;
5328 encoded = hexmem(c->root_hash, c->root_hash_size);
5329 if (encoded)
5330 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5331 }
5332
5333 if (c->root_hash_path)
5334 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5335
d4d55b0d
LB
5336 if (c->root_hash_sig) {
5337 _cleanup_free_ char *encoded = NULL;
5338 ssize_t len;
5339 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5340 if (len)
5341 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5342 }
5343
5344 if (c->root_hash_sig_path)
5345 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5346
0389f4fa
LB
5347 if (c->root_verity)
5348 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5349
8c7be95e
LP
5350 STRV_FOREACH(e, c->environment)
5351 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5352
5353 STRV_FOREACH(e, c->environment_files)
5354 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5355
b4c14404
FB
5356 STRV_FOREACH(e, c->pass_environment)
5357 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5358
00819cc1
LP
5359 STRV_FOREACH(e, c->unset_environment)
5360 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5361
53f47dfc
YW
5362 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5363
5b10116e 5364 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5365 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5366
5367 STRV_FOREACH(d, c->directories[dt].paths)
5368 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5369 }
c2bbd90b 5370
12213aed
YW
5371 fprintf(f,
5372 "%sTimeoutCleanSec: %s\n",
5373 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5374
fb33a393
LP
5375 if (c->nice_set)
5376 fprintf(f,
5377 "%sNice: %i\n",
5378 prefix, c->nice);
5379
dd6c17b1 5380 if (c->oom_score_adjust_set)
fb33a393 5381 fprintf(f,
dd6c17b1
LP
5382 "%sOOMScoreAdjust: %i\n",
5383 prefix, c->oom_score_adjust);
9eba9da4 5384
ad21e542
ZJS
5385 if (c->coredump_filter_set)
5386 fprintf(f,
5387 "%sCoredumpFilter: 0x%"PRIx64"\n",
5388 prefix, c->coredump_filter);
5389
5b10116e 5390 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5391 if (c->rlimit[i]) {
4c3a2b84 5392 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5393 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5394 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5395 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5396 }
94f04347 5397
f8b69d1d 5398 if (c->ioprio_set) {
1756a011 5399 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5400
837df140
YW
5401 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5402 if (r >= 0)
5403 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5404
5405 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
f8b69d1d 5406 }
94f04347 5407
f8b69d1d 5408 if (c->cpu_sched_set) {
1756a011 5409 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5410
837df140
YW
5411 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5412 if (r >= 0)
5413 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5414
94f04347 5415 fprintf(f,
38b48754
LP
5416 "%sCPUSchedulingPriority: %i\n"
5417 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5418 prefix, c->cpu_sched_priority,
5419 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5420 }
94f04347 5421
0985c7c4 5422 if (c->cpu_set.set) {
e7fca352
MS
5423 _cleanup_free_ char *affinity = NULL;
5424
5425 affinity = cpu_set_to_range_string(&c->cpu_set);
5426 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5427 }
5428
b070c7c0
MS
5429 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5430 _cleanup_free_ char *nodes = NULL;
5431
5432 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5433 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5434 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5435 }
5436
3a43da28 5437 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5438 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5439
5440 fprintf(f,
80876c20
LP
5441 "%sStandardInput: %s\n"
5442 "%sStandardOutput: %s\n"
5443 "%sStandardError: %s\n",
5444 prefix, exec_input_to_string(c->std_input),
5445 prefix, exec_output_to_string(c->std_output),
5446 prefix, exec_output_to_string(c->std_error));
5447
befc4a80
LP
5448 if (c->std_input == EXEC_INPUT_NAMED_FD)
5449 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5450 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5451 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5452 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5453 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5454
5455 if (c->std_input == EXEC_INPUT_FILE)
5456 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5457 if (c->std_output == EXEC_OUTPUT_FILE)
5458 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5459 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5460 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5461 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5462 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5463 if (c->std_error == EXEC_OUTPUT_FILE)
5464 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5465 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5466 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5467 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5468 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5469
80876c20
LP
5470 if (c->tty_path)
5471 fprintf(f,
6ea832a2
LP
5472 "%sTTYPath: %s\n"
5473 "%sTTYReset: %s\n"
5474 "%sTTYVHangup: %s\n"
5475 "%sTTYVTDisallocate: %s\n",
5476 prefix, c->tty_path,
5477 prefix, yes_no(c->tty_reset),
5478 prefix, yes_no(c->tty_vhangup),
5479 prefix, yes_no(c->tty_vt_disallocate));
94f04347 5480
9f6444eb 5481 if (IN_SET(c->std_output,
9f6444eb
LP
5482 EXEC_OUTPUT_KMSG,
5483 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5484 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5485 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5486 IN_SET(c->std_error,
9f6444eb
LP
5487 EXEC_OUTPUT_KMSG,
5488 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5489 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5490 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5491
5ce70e5b 5492 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5493
837df140
YW
5494 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5495 if (r >= 0)
5496 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5497
837df140
YW
5498 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5499 if (r >= 0)
5500 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5501 }
94f04347 5502
d3070fbd
LP
5503 if (c->log_level_max >= 0) {
5504 _cleanup_free_ char *t = NULL;
5505
5506 (void) log_level_to_string_alloc(c->log_level_max, &t);
5507
5508 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5509 }
5510
5ac1530e 5511 if (c->log_ratelimit_interval_usec > 0) {
90fc172e
AZ
5512 char buf_timespan[FORMAT_TIMESPAN_MAX];
5513
5514 fprintf(f,
5515 "%sLogRateLimitIntervalSec: %s\n",
5ac1530e 5516 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e
AZ
5517 }
5518
5ac1530e
ZJS
5519 if (c->log_ratelimit_burst > 0)
5520 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5521
5b10116e
ZJS
5522 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5523 fprintf(f, "%sLogExtraFields: ", prefix);
5524 fwrite(c->log_extra_fields[j].iov_base,
5525 1, c->log_extra_fields[j].iov_len,
5526 f);
5527 fputc('\n', f);
d3070fbd
LP
5528 }
5529
91dd5f7c
LP
5530 if (c->log_namespace)
5531 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5532
07d46372
YW
5533 if (c->secure_bits) {
5534 _cleanup_free_ char *str = NULL;
5535
5536 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5537 if (r >= 0)
5538 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5539 }
94f04347 5540
a103496c 5541 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5542 _cleanup_free_ char *str = NULL;
94f04347 5543
dd1f5bd0
YW
5544 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5545 if (r >= 0)
5546 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5547 }
5548
5549 if (c->capability_ambient_set != 0) {
dd1f5bd0 5550 _cleanup_free_ char *str = NULL;
755d4b67 5551
dd1f5bd0
YW
5552 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5553 if (r >= 0)
5554 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5555 }
5556
5557 if (c->user)
f2d3769a 5558 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5559 if (c->group)
f2d3769a 5560 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5561
29206d46
LP
5562 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5563
ddc155b2 5564 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5565
5b6319dc 5566 if (c->pam_name)
f2d3769a 5567 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5568
ddc155b2
TM
5569 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5570 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5571 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5572 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5573 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
2e22afe9 5574
5b10116e
ZJS
5575 for (size_t i = 0; i < c->n_bind_mounts; i++)
5576 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5577 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5578 c->bind_mounts[i].ignore_enoent ? "-": "",
5579 c->bind_mounts[i].source,
5580 c->bind_mounts[i].destination,
5581 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5582
5b10116e
ZJS
5583 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5584 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5585
5b10116e
ZJS
5586 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5587 t->path,
5588 isempty(t->options) ? "" : ":",
5589 strempty(t->options));
5590 }
2abd4e38 5591
169c1bda
LP
5592 if (c->utmp_id)
5593 fprintf(f,
5594 "%sUtmpIdentifier: %s\n",
5595 prefix, c->utmp_id);
7b52a628
MS
5596
5597 if (c->selinux_context)
5598 fprintf(f,
5f8640fb
LP
5599 "%sSELinuxContext: %s%s\n",
5600 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5601
80c21aea
WC
5602 if (c->apparmor_profile)
5603 fprintf(f,
5604 "%sAppArmorProfile: %s%s\n",
5605 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5606
5607 if (c->smack_process_label)
5608 fprintf(f,
5609 "%sSmackProcessLabel: %s%s\n",
5610 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5611
050f7277 5612 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5613 fprintf(f,
5614 "%sPersonality: %s\n",
5615 prefix, strna(personality_to_string(c->personality)));
5616
78e864e5
TM
5617 fprintf(f,
5618 "%sLockPersonality: %s\n",
5619 prefix, yes_no(c->lock_personality));
5620
17df7223 5621 if (c->syscall_filter) {
349cc4a5 5622#if HAVE_SECCOMP
8cfa775f 5623 void *id, *val;
17df7223 5624 bool first = true;
351a19b1 5625#endif
17df7223
LP
5626
5627 fprintf(f,
57183d11 5628 "%sSystemCallFilter: ",
17df7223
LP
5629 prefix);
5630
6b000af4 5631 if (!c->syscall_allow_list)
17df7223
LP
5632 fputc('~', f);
5633
349cc4a5 5634#if HAVE_SECCOMP
90e74a66 5635 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5636 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5637 const char *errno_name = NULL;
5638 int num = PTR_TO_INT(val);
17df7223
LP
5639
5640 if (first)
5641 first = false;
5642 else
5643 fputc(' ', f);
5644
57183d11 5645 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5646 fputs(strna(name), f);
8cfa775f
YW
5647
5648 if (num >= 0) {
005bfaf1 5649 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5650 if (errno_name)
5651 fprintf(f, ":%s", errno_name);
5652 else
5653 fprintf(f, ":%d", num);
5654 }
17df7223 5655 }
351a19b1 5656#endif
17df7223
LP
5657
5658 fputc('\n', f);
5659 }
5660
57183d11 5661 if (c->syscall_archs) {
349cc4a5 5662#if HAVE_SECCOMP
57183d11
LP
5663 void *id;
5664#endif
5665
5666 fprintf(f,
5667 "%sSystemCallArchitectures:",
5668 prefix);
5669
349cc4a5 5670#if HAVE_SECCOMP
90e74a66 5671 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5672 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5673#endif
5674 fputc('\n', f);
5675 }
5676
add00535
LP
5677 if (exec_context_restrict_namespaces_set(c)) {
5678 _cleanup_free_ char *s = NULL;
5679
86c2a9f1 5680 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5681 if (r >= 0)
5682 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5683 prefix, strna(s));
add00535
LP
5684 }
5685
a8d08f39
LP
5686 if (c->network_namespace_path)
5687 fprintf(f,
5688 "%sNetworkNamespacePath: %s\n",
5689 prefix, c->network_namespace_path);
5690
3df90f24 5691 if (c->syscall_errno > 0) {
005bfaf1 5692#if HAVE_SECCOMP
3df90f24 5693 const char *errno_name;
005bfaf1 5694#endif
3df90f24
YW
5695
5696 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5697
005bfaf1
TM
5698#if HAVE_SECCOMP
5699 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5700 if (errno_name)
005bfaf1 5701 fputs(errno_name, f);
3df90f24 5702 else
005bfaf1
TM
5703 fprintf(f, "%d", c->syscall_errno);
5704#endif
5705 fputc('\n', f);
3df90f24 5706 }
b3d13314 5707
5b10116e 5708 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5709 MountOptions *o;
5710
79e20ceb 5711 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5712 c->mount_images[i].ignore_enoent ? "-": "",
5713 c->mount_images[i].source,
79e20ceb 5714 c->mount_images[i].destination);
427353f6 5715 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5716 fprintf(f, ":%s:%s",
427353f6 5717 partition_designator_to_string(o->partition_designator),
79e20ceb 5718 strempty(o->options));
427353f6
LB
5719 fprintf(f, "\n");
5720 }
93f59701
LB
5721
5722 for (size_t i = 0; i < c->n_extension_images; i++) {
5723 MountOptions *o;
5724
5725 fprintf(f, "%sExtensionImages: %s%s", prefix,
5726 c->extension_images[i].ignore_enoent ? "-": "",
5727 c->extension_images[i].source);
5728 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5729 fprintf(f, ":%s:%s",
5730 partition_designator_to_string(o->partition_designator),
5731 strempty(o->options));
5732 fprintf(f, "\n");
5733 }
5cb5a6ff
LP
5734}
5735
34cf6c43 5736bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
5737 assert(c);
5738
61233823 5739 /* Returns true if the process forked off would run under
a931ad47
LP
5740 * an unchanged UID or as root. */
5741
5742 if (!c->user)
5743 return true;
5744
5745 if (streq(c->user, "root") || streq(c->user, "0"))
5746 return true;
5747
5748 return false;
5749}
5750
34cf6c43 5751int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
5752 int p;
5753
5754 assert(c);
5755
5756 if (c->ioprio_set)
5757 return c->ioprio;
5758
5759 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5760 if (p < 0)
5761 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5762
5763 return p;
5764}
5765
5e98086d
ZJS
5766bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5767 assert(c);
5768
61198784 5769 /* Explicit setting wins */
5e98086d
ZJS
5770 if (c->mount_apivfs_set)
5771 return c->mount_apivfs;
5772
61198784 5773 /* Default to "yes" if root directory or image are specified */
74e12520 5774 if (exec_context_with_rootfs(c))
61198784
ZJS
5775 return true;
5776
5e98086d
ZJS
5777 return false;
5778}
5779
d3070fbd 5780void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
5781 assert(c);
5782
5b10116e 5783 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
5784 free(c->log_extra_fields[l].iov_base);
5785 c->log_extra_fields = mfree(c->log_extra_fields);
5786 c->n_log_extra_fields = 0;
5787}
5788
6f765baf 5789void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
5790 _cleanup_close_ int fd = -1;
5791 const char *path;
5792 struct stat st;
6f765baf
LP
5793 int r;
5794
5795 assert(c);
5796
5797 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5798 exec_context_tty_reset(c, NULL);
5799
5800 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5801 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5802 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
5803 if (!exec_context_may_touch_tty(c))
5804 return;
6f765baf 5805
0ba976e8
LP
5806 path = exec_context_tty_path(c);
5807 if (!path)
5808 return;
6f765baf 5809
0ba976e8
LP
5810 fd = open(path, O_PATH|O_CLOEXEC);
5811 if (fd < 0)
5812 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
5813 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
5814 path);
5815
5816 if (fstat(fd, &st) < 0)
5817 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
5818
5819 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
5820 * if things are a character device, since a proper check either means we'd have to open the TTY and
5821 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
5822 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
5823 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
5824 if (!S_ISCHR(st.st_mode))
5825 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
5826
5827 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
5828 if (r < 0)
5829 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
5830}
5831
4c2f5842
LP
5832int exec_context_get_clean_directories(
5833 ExecContext *c,
5834 char **prefix,
5835 ExecCleanMask mask,
5836 char ***ret) {
5837
5838 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
5839 int r;
5840
5841 assert(c);
5842 assert(prefix);
5843 assert(ret);
5844
5b10116e 5845 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
5846 char **i;
5847
5848 if (!FLAGS_SET(mask, 1U << t))
5849 continue;
5850
5851 if (!prefix[t])
5852 continue;
5853
5854 STRV_FOREACH(i, c->directories[t].paths) {
5855 char *j;
5856
5857 j = path_join(prefix[t], *i);
5858 if (!j)
5859 return -ENOMEM;
5860
5861 r = strv_consume(&l, j);
5862 if (r < 0)
5863 return r;
7f622a19
YW
5864
5865 /* Also remove private directories unconditionally. */
5866 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5867 j = path_join(prefix[t], "private", *i);
5868 if (!j)
5869 return -ENOMEM;
5870
5871 r = strv_consume(&l, j);
5872 if (r < 0)
5873 return r;
5874 }
4c2f5842
LP
5875 }
5876 }
5877
5878 *ret = TAKE_PTR(l);
5879 return 0;
5880}
5881
5882int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5883 ExecCleanMask mask = 0;
5884
5885 assert(c);
5886 assert(ret);
5887
5888 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5889 if (!strv_isempty(c->directories[t].paths))
5890 mask |= 1U << t;
5891
5892 *ret = mask;
5893 return 0;
5894}
5895
b58b4116 5896void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 5897 assert(s);
5cb5a6ff 5898
2ed26ed0
LP
5899 *s = (ExecStatus) {
5900 .pid = pid,
5901 };
5902
b58b4116
LP
5903 dual_timestamp_get(&s->start_timestamp);
5904}
5905
34cf6c43 5906void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
5907 assert(s);
5908
d46b79bb 5909 if (s->pid != pid)
2ed26ed0
LP
5910 *s = (ExecStatus) {
5911 .pid = pid,
5912 };
b58b4116 5913
63983207 5914 dual_timestamp_get(&s->exit_timestamp);
9fb86720 5915
034c6ed7
LP
5916 s->code = code;
5917 s->status = status;
169c1bda 5918
6f765baf
LP
5919 if (context && context->utmp_id)
5920 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
5921}
5922
6a1d4d9f
LP
5923void exec_status_reset(ExecStatus *s) {
5924 assert(s);
5925
5926 *s = (ExecStatus) {};
5927}
5928
34cf6c43 5929void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
5930 char buf[FORMAT_TIMESTAMP_MAX];
5931
5932 assert(s);
5933 assert(f);
5934
9fb86720
LP
5935 if (s->pid <= 0)
5936 return;
5937
4c940960
LP
5938 prefix = strempty(prefix);
5939
9fb86720 5940 fprintf(f,
ccd06097
ZJS
5941 "%sPID: "PID_FMT"\n",
5942 prefix, s->pid);
9fb86720 5943
af9d16e1 5944 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
5945 fprintf(f,
5946 "%sStart Timestamp: %s\n",
63983207 5947 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
9fb86720 5948
af9d16e1 5949 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
5950 fprintf(f,
5951 "%sExit Timestamp: %s\n"
5952 "%sExit Code: %s\n"
5953 "%sExit Status: %i\n",
63983207 5954 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
9fb86720
LP
5955 prefix, sigchld_code_to_string(s->code),
5956 prefix, s->status);
5cb5a6ff 5957}
44d8db9e 5958
34cf6c43 5959static char *exec_command_line(char **argv) {
44d8db9e
LP
5960 size_t k;
5961 char *n, *p, **a;
5962 bool first = true;
5963
9e2f7c11 5964 assert(argv);
44d8db9e 5965
9164977d 5966 k = 1;
9e2f7c11 5967 STRV_FOREACH(a, argv)
44d8db9e
LP
5968 k += strlen(*a)+3;
5969
5cd9cd35
LP
5970 n = new(char, k);
5971 if (!n)
44d8db9e
LP
5972 return NULL;
5973
5974 p = n;
9e2f7c11 5975 STRV_FOREACH(a, argv) {
44d8db9e
LP
5976
5977 if (!first)
5978 *(p++) = ' ';
5979 else
5980 first = false;
5981
5982 if (strpbrk(*a, WHITESPACE)) {
5983 *(p++) = '\'';
5984 p = stpcpy(p, *a);
5985 *(p++) = '\'';
5986 } else
5987 p = stpcpy(p, *a);
5988
5989 }
5990
9164977d
LP
5991 *p = 0;
5992
44d8db9e
LP
5993 /* FIXME: this doesn't really handle arguments that have
5994 * spaces and ticks in them */
5995
5996 return n;
5997}
5998
34cf6c43 5999static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6000 _cleanup_free_ char *cmd = NULL;
4c940960 6001 const char *prefix2;
44d8db9e
LP
6002
6003 assert(c);
6004 assert(f);
6005
4c940960 6006 prefix = strempty(prefix);
63c372cb 6007 prefix2 = strjoina(prefix, "\t");
44d8db9e 6008
9e2f7c11 6009 cmd = exec_command_line(c->argv);
44d8db9e
LP
6010 fprintf(f,
6011 "%sCommand Line: %s\n",
4bbccb02 6012 prefix, cmd ? cmd : strerror_safe(ENOMEM));
44d8db9e 6013
9fb86720 6014 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6015}
6016
6017void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6018 assert(f);
6019
4c940960 6020 prefix = strempty(prefix);
44d8db9e
LP
6021
6022 LIST_FOREACH(command, c, c)
6023 exec_command_dump(c, f, prefix);
6024}
94f04347 6025
a6a80b4f
LP
6026void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6027 ExecCommand *end;
6028
6029 assert(l);
6030 assert(e);
6031
6032 if (*l) {
35b8ca3a 6033 /* It's kind of important, that we keep the order here */
71fda00f
LP
6034 LIST_FIND_TAIL(command, *l, end);
6035 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6036 } else
6037 *l = e;
6038}
6039
26fd040d
LP
6040int exec_command_set(ExecCommand *c, const char *path, ...) {
6041 va_list ap;
6042 char **l, *p;
6043
6044 assert(c);
6045 assert(path);
6046
6047 va_start(ap, path);
6048 l = strv_new_ap(path, ap);
6049 va_end(ap);
6050
6051 if (!l)
6052 return -ENOMEM;
6053
250a918d
LP
6054 p = strdup(path);
6055 if (!p) {
26fd040d
LP
6056 strv_free(l);
6057 return -ENOMEM;
6058 }
6059
6897dfe8 6060 free_and_replace(c->path, p);
26fd040d 6061
130d3d22 6062 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6063}
6064
86b23b07 6065int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6066 _cleanup_strv_free_ char **l = NULL;
86b23b07 6067 va_list ap;
86b23b07
JS
6068 int r;
6069
6070 assert(c);
6071 assert(path);
6072
6073 va_start(ap, path);
6074 l = strv_new_ap(path, ap);
6075 va_end(ap);
6076
6077 if (!l)
6078 return -ENOMEM;
6079
e287086b 6080 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6081 if (r < 0)
86b23b07 6082 return r;
86b23b07
JS
6083
6084 return 0;
6085}
6086
e8a565cb
YW
6087static void *remove_tmpdir_thread(void *p) {
6088 _cleanup_free_ char *path = p;
86b23b07 6089
e8a565cb
YW
6090 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6091 return NULL;
6092}
6093
6094static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6095 int r;
6096
6097 if (!rt)
6098 return NULL;
6099
6100 if (rt->manager)
6101 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6102
6103 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6104
6105 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6106 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6107
6108 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6109 if (r < 0)
e8a565cb 6110 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6111 else
6112 rt->tmp_dir = NULL;
e8a565cb 6113 }
613b411c 6114
56a13a49 6115 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6116 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6117
6118 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6119 if (r < 0)
e8a565cb 6120 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6121 else
6122 rt->var_tmp_dir = NULL;
e8a565cb
YW
6123 }
6124
6125 rt->id = mfree(rt->id);
6126 rt->tmp_dir = mfree(rt->tmp_dir);
6127 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6128 safe_close_pair(rt->netns_storage_socket);
a70581ff 6129 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6130 return mfree(rt);
6131}
6132
6133static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6134 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6135}
6136
56a13a49
ZJS
6137static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6138 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6139 ExecRuntime *n;
613b411c 6140
8e8009dc 6141 assert(ret);
613b411c 6142
56a13a49
ZJS
6143 id_copy = strdup(id);
6144 if (!id_copy)
6145 return -ENOMEM;
6146
8e8009dc
LP
6147 n = new(ExecRuntime, 1);
6148 if (!n)
613b411c
LP
6149 return -ENOMEM;
6150
8e8009dc 6151 *n = (ExecRuntime) {
56a13a49 6152 .id = TAKE_PTR(id_copy),
8e8009dc 6153 .netns_storage_socket = { -1, -1 },
a70581ff 6154 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6155 };
6156
6157 *ret = n;
613b411c
LP
6158 return 0;
6159}
6160
e8a565cb
YW
6161static int exec_runtime_add(
6162 Manager *m,
6163 const char *id,
56a13a49
ZJS
6164 char **tmp_dir,
6165 char **var_tmp_dir,
6166 int netns_storage_socket[2],
a70581ff 6167 int ipcns_storage_socket[2],
e8a565cb
YW
6168 ExecRuntime **ret) {
6169
6170 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6171 int r;
6172
e8a565cb 6173 assert(m);
613b411c
LP
6174 assert(id);
6175
a70581ff 6176 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6177
56a13a49 6178 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6179 if (r < 0)
6180 return r;
6181
63083706 6182 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6183 if (r < 0)
6184 return r;
e8a565cb 6185
56a13a49
ZJS
6186 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6187 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6188 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6189
6190 if (netns_storage_socket) {
56a13a49
ZJS
6191 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6192 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6193 }
6194
a70581ff
XR
6195 if (ipcns_storage_socket) {
6196 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6197 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6198 }
6199
e8a565cb
YW
6200 rt->manager = m;
6201
6202 if (ret)
6203 *ret = rt;
e8a565cb 6204 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6205 TAKE_PTR(rt);
e8a565cb
YW
6206 return 0;
6207}
6208
74aaf59b
LP
6209static int exec_runtime_make(
6210 Manager *m,
6211 const ExecContext *c,
6212 const char *id,
6213 ExecRuntime **ret) {
6214
56a13a49 6215 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6216 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6217 int r;
6218
6219 assert(m);
6220 assert(c);
6221 assert(id);
6222
6223 /* It is not necessary to create ExecRuntime object. */
a70581ff 6224 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6225 *ret = NULL;
e8a565cb 6226 return 0;
74aaf59b 6227 }
e8a565cb 6228
efa2f3a1
TM
6229 if (c->private_tmp &&
6230 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6231 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6232 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6233 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6234 if (r < 0)
6235 return r;
6236 }
6237
a8d08f39 6238 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6239 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6240 return -errno;
6241 }
6242
a70581ff
XR
6243 if (c->private_ipc || c->ipc_namespace_path) {
6244 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6245 return -errno;
6246 }
6247
6248 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6249 if (r < 0)
6250 return r;
6251
613b411c
LP
6252 return 1;
6253}
6254
e8a565cb
YW
6255int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6256 ExecRuntime *rt;
6257 int r;
613b411c 6258
e8a565cb
YW
6259 assert(m);
6260 assert(id);
6261 assert(ret);
6262
6263 rt = hashmap_get(m->exec_runtime_by_id, id);
6264 if (rt)
387f6955 6265 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6266 goto ref;
6267
74aaf59b
LP
6268 if (!create) {
6269 *ret = NULL;
e8a565cb 6270 return 0;
74aaf59b 6271 }
e8a565cb
YW
6272
6273 /* If not found, then create a new object. */
6274 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6275 if (r < 0)
e8a565cb 6276 return r;
74aaf59b
LP
6277 if (r == 0) {
6278 /* When r == 0, it is not necessary to create ExecRuntime object. */
6279 *ret = NULL;
6280 return 0;
6281 }
613b411c 6282
e8a565cb
YW
6283ref:
6284 /* increment reference counter. */
6285 rt->n_ref++;
6286 *ret = rt;
6287 return 1;
6288}
613b411c 6289
e8a565cb
YW
6290ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6291 if (!rt)
613b411c
LP
6292 return NULL;
6293
e8a565cb 6294 assert(rt->n_ref > 0);
613b411c 6295
e8a565cb
YW
6296 rt->n_ref--;
6297 if (rt->n_ref > 0)
f2341e0a
LP
6298 return NULL;
6299
e8a565cb 6300 return exec_runtime_free(rt, destroy);
613b411c
LP
6301}
6302
e8a565cb
YW
6303int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6304 ExecRuntime *rt;
e8a565cb
YW
6305
6306 assert(m);
613b411c
LP
6307 assert(f);
6308 assert(fds);
6309
90e74a66 6310 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6311 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6312
e8a565cb
YW
6313 if (rt->tmp_dir)
6314 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6315
e8a565cb
YW
6316 if (rt->var_tmp_dir)
6317 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6318
e8a565cb
YW
6319 if (rt->netns_storage_socket[0] >= 0) {
6320 int copy;
613b411c 6321
e8a565cb
YW
6322 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6323 if (copy < 0)
6324 return copy;
613b411c 6325
e8a565cb
YW
6326 fprintf(f, " netns-socket-0=%i", copy);
6327 }
613b411c 6328
e8a565cb
YW
6329 if (rt->netns_storage_socket[1] >= 0) {
6330 int copy;
613b411c 6331
e8a565cb
YW
6332 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6333 if (copy < 0)
6334 return copy;
613b411c 6335
e8a565cb
YW
6336 fprintf(f, " netns-socket-1=%i", copy);
6337 }
6338
a70581ff
XR
6339 if (rt->ipcns_storage_socket[0] >= 0) {
6340 int copy;
6341
6342 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6343 if (copy < 0)
6344 return copy;
6345
6346 fprintf(f, " ipcns-socket-0=%i", copy);
6347 }
6348
6349 if (rt->ipcns_storage_socket[1] >= 0) {
6350 int copy;
6351
6352 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6353 if (copy < 0)
6354 return copy;
6355
6356 fprintf(f, " ipcns-socket-1=%i", copy);
6357 }
6358
e8a565cb 6359 fputc('\n', f);
613b411c
LP
6360 }
6361
6362 return 0;
6363}
6364
e8a565cb
YW
6365int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6366 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6367 ExecRuntime *rt;
613b411c
LP
6368 int r;
6369
e8a565cb
YW
6370 /* This is for the migration from old (v237 or earlier) deserialization text.
6371 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6372 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6373 * so or not from the serialized text, then we always creates a new object owned by this. */
6374
6375 assert(u);
613b411c
LP
6376 assert(key);
6377 assert(value);
6378
e8a565cb
YW
6379 /* Manager manages ExecRuntime objects by the unit id.
6380 * So, we omit the serialized text when the unit does not have id (yet?)... */
6381 if (isempty(u->id)) {
6382 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6383 return 0;
6384 }
613b411c 6385
cbc165d1
ZJS
6386 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6387 return log_oom();
e8a565cb
YW
6388
6389 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6390 if (!rt) {
cbc165d1 6391 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6392 return log_oom();
613b411c 6393
e8a565cb
YW
6394 rt = rt_create;
6395 }
6396
6397 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6398 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6399 return -ENOMEM;
613b411c
LP
6400
6401 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6402 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6403 return -ENOMEM;
613b411c
LP
6404
6405 } else if (streq(key, "netns-socket-0")) {
6406 int fd;
6407
e8a565cb 6408 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6409 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6410 return 0;
613b411c 6411 }
e8a565cb
YW
6412
6413 safe_close(rt->netns_storage_socket[0]);
6414 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6415
613b411c
LP
6416 } else if (streq(key, "netns-socket-1")) {
6417 int fd;
6418
e8a565cb 6419 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6420 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6421 return 0;
613b411c 6422 }
e8a565cb
YW
6423
6424 safe_close(rt->netns_storage_socket[1]);
6425 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6426
613b411c
LP
6427 } else
6428 return 0;
6429
e8a565cb
YW
6430 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6431 if (rt_create) {
6432 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6433 if (r < 0) {
3fe91079 6434 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6435 return 0;
6436 }
613b411c 6437
e8a565cb 6438 rt_create->manager = u->manager;
613b411c 6439
e8a565cb 6440 /* Avoid cleanup */
56a13a49 6441 TAKE_PTR(rt_create);
e8a565cb 6442 }
98b47d54 6443
e8a565cb
YW
6444 return 1;
6445}
613b411c 6446
56a13a49
ZJS
6447int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6448 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6449 char *id = NULL;
a70581ff 6450 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6451 const char *p, *v = value;
6452 size_t n;
613b411c 6453
e8a565cb
YW
6454 assert(m);
6455 assert(value);
6456 assert(fds);
98b47d54 6457
e8a565cb
YW
6458 n = strcspn(v, " ");
6459 id = strndupa(v, n);
6460 if (v[n] != ' ')
6461 goto finalize;
6462 p = v + n + 1;
6463
6464 v = startswith(p, "tmp-dir=");
6465 if (v) {
6466 n = strcspn(v, " ");
56a13a49
ZJS
6467 tmp_dir = strndup(v, n);
6468 if (!tmp_dir)
6469 return log_oom();
e8a565cb
YW
6470 if (v[n] != ' ')
6471 goto finalize;
6472 p = v + n + 1;
6473 }
6474
6475 v = startswith(p, "var-tmp-dir=");
6476 if (v) {
6477 n = strcspn(v, " ");
56a13a49
ZJS
6478 var_tmp_dir = strndup(v, n);
6479 if (!var_tmp_dir)
6480 return log_oom();
e8a565cb
YW
6481 if (v[n] != ' ')
6482 goto finalize;
6483 p = v + n + 1;
6484 }
6485
6486 v = startswith(p, "netns-socket-0=");
6487 if (v) {
6488 char *buf;
6489
6490 n = strcspn(v, " ");
6491 buf = strndupa(v, n);
c413bb28 6492
a70581ff 6493 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6494 if (r < 0)
6495 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6496 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6497 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6498 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6499 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6500 if (v[n] != ' ')
6501 goto finalize;
6502 p = v + n + 1;
613b411c
LP
6503 }
6504
e8a565cb
YW
6505 v = startswith(p, "netns-socket-1=");
6506 if (v) {
6507 char *buf;
98b47d54 6508
e8a565cb
YW
6509 n = strcspn(v, " ");
6510 buf = strndupa(v, n);
a70581ff
XR
6511
6512 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6513 if (r < 0)
6514 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6515 if (!fdset_contains(fds, netns_fdpair[1]))
6516 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6517 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6518 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6519 if (v[n] != ' ')
6520 goto finalize;
6521 p = v + n + 1;
6522 }
6523
6524 v = startswith(p, "ipcns-socket-0=");
6525 if (v) {
6526 char *buf;
6527
6528 n = strcspn(v, " ");
6529 buf = strndupa(v, n);
6530
6531 r = safe_atoi(buf, &ipcns_fdpair[0]);
6532 if (r < 0)
6533 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6534 if (!fdset_contains(fds, ipcns_fdpair[0]))
6535 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6536 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6537 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6538 if (v[n] != ' ')
6539 goto finalize;
6540 p = v + n + 1;
6541 }
6542
6543 v = startswith(p, "ipcns-socket-1=");
6544 if (v) {
6545 char *buf;
6546
6547 n = strcspn(v, " ");
6548 buf = strndupa(v, n);
6549
6550 r = safe_atoi(buf, &ipcns_fdpair[1]);
6551 if (r < 0)
6552 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6553 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6554 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6555 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6556 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6557 }
98b47d54 6558
e8a565cb 6559finalize:
a70581ff 6560 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6561 if (r < 0)
56a13a49
ZJS
6562 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6563 return 0;
e8a565cb 6564}
613b411c 6565
e8a565cb
YW
6566void exec_runtime_vacuum(Manager *m) {
6567 ExecRuntime *rt;
e8a565cb
YW
6568
6569 assert(m);
6570
6571 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6572
90e74a66 6573 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6574 if (rt->n_ref > 0)
6575 continue;
6576
6577 (void) exec_runtime_free(rt, false);
6578 }
613b411c
LP
6579}
6580
b9c04eaf
YW
6581void exec_params_clear(ExecParameters *p) {
6582 if (!p)
6583 return;
6584
c3f8a065
LP
6585 p->environment = strv_free(p->environment);
6586 p->fd_names = strv_free(p->fd_names);
6587 p->fds = mfree(p->fds);
6588 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6589}
6590
bb0c0d6f
LP
6591ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6592 if (!sc)
6593 return NULL;
6594
6595 free(sc->id);
6596 free(sc->data);
6597 return mfree(sc);
6598}
6599
6600DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6601
80876c20
LP
6602static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6603 [EXEC_INPUT_NULL] = "null",
6604 [EXEC_INPUT_TTY] = "tty",
6605 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6606 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6607 [EXEC_INPUT_SOCKET] = "socket",
6608 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6609 [EXEC_INPUT_DATA] = "data",
2038c3f5 6610 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6611};
6612
8a0867d6
LP
6613DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6614
94f04347 6615static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6616 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6617 [EXEC_OUTPUT_NULL] = "null",
80876c20 6618 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6619 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6620 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6621 [EXEC_OUTPUT_JOURNAL] = "journal",
6622 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6623 [EXEC_OUTPUT_SOCKET] = "socket",
6624 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6625 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6626 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6627 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6628};
6629
6630DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6631
6632static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6633 [EXEC_UTMP_INIT] = "init",
6634 [EXEC_UTMP_LOGIN] = "login",
6635 [EXEC_UTMP_USER] = "user",
6636};
6637
6638DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6639
6640static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6641 [EXEC_PRESERVE_NO] = "no",
6642 [EXEC_PRESERVE_YES] = "yes",
6643 [EXEC_PRESERVE_RESTART] = "restart",
6644};
6645
6646DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6647
6b7b2ed9 6648/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6649static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6650 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6651 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6652 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6653 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6654 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6655};
6656
6657DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6658
6b7b2ed9
LP
6659/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6660 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6661 * directories, specifically .timer units with their timestamp touch file. */
6662static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6663 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6664 [EXEC_DIRECTORY_STATE] = "state",
6665 [EXEC_DIRECTORY_CACHE] = "cache",
6666 [EXEC_DIRECTORY_LOGS] = "logs",
6667 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6668};
6669
6670DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6671
6672/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6673 * the service payload in. */
fb2042dd
YW
6674static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6675 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6676 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6677 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6678 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6679 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6680};
6681
6682DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6683
b1edf445
LP
6684static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6685 [EXEC_KEYRING_INHERIT] = "inherit",
6686 [EXEC_KEYRING_PRIVATE] = "private",
6687 [EXEC_KEYRING_SHARED] = "shared",
6688};
6689
6690DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);