]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #20346 from poettering/strlen-unsigned-fix
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
fdb3deca 46#include "cgroup-setup.h"
bb0c0d6f 47#include "chown-recursive.h"
da681e1b 48#include "cpu-set-util.h"
43144be4 49#include "creds-util.h"
6a818c3c 50#include "data-fd-util.h"
f6a6225e 51#include "def.h"
686d13b9 52#include "env-file.h"
4d1a6904 53#include "env-util.h"
17df7223 54#include "errno-list.h"
8a62620e 55#include "escape.h"
3ffd4af2 56#include "execute.h"
8dd4c05b 57#include "exit-status.h"
3ffd4af2 58#include "fd-util.h"
bb0c0d6f 59#include "fileio.h"
f97b34a6 60#include "format-util.h"
f4f15635 61#include "fs-util.h"
7d50b32a 62#include "glob-util.h"
0389f4fa 63#include "hexdecoct.h"
c004493c 64#include "io-util.h"
8dd4c05b 65#include "ioprio.h"
a1164ae3 66#include "label.h"
8dd4c05b
LP
67#include "log.h"
68#include "macro.h"
e8a565cb 69#include "manager.h"
2a341bb9 70#include "manager-dump.h"
0a970718 71#include "memory-util.h"
f5947a5e 72#include "missing_fs.h"
8dd4c05b 73#include "mkdir.h"
21935150 74#include "mount-util.h"
bb0c0d6f 75#include "mountpoint-util.h"
8dd4c05b 76#include "namespace.h"
6bedfcbb 77#include "parse-util.h"
8dd4c05b 78#include "path-util.h"
0b452006 79#include "process-util.h"
d3dcf4e3 80#include "random-util.h"
78f22b97 81#include "rlimit-util.h"
8dd4c05b 82#include "rm-rf.h"
349cc4a5 83#if HAVE_SECCOMP
3ffd4af2
LP
84#include "seccomp-util.h"
85#endif
07d46372 86#include "securebits-util.h"
8dd4c05b 87#include "selinux-util.h"
24882e06 88#include "signal-util.h"
8dd4c05b 89#include "smack-util.h"
57b7a260 90#include "socket-util.h"
fd63e712 91#include "special.h"
949befd3 92#include "stat-util.h"
8b43440b 93#include "string-table.h"
07630cea 94#include "string-util.h"
8dd4c05b 95#include "strv.h"
7ccbd1ae 96#include "syslog-util.h"
8dd4c05b 97#include "terminal-util.h"
bb0c0d6f 98#include "tmpfile-util.h"
566b7d23 99#include "umask-util.h"
2d3b784d 100#include "unit-serialize.h"
b1d4f8e1 101#include "user-util.h"
8dd4c05b 102#include "utmp-wtmp.h"
5cb5a6ff 103
/* Two timeouts expressed as multiples of USEC_PER_SEC (5 and 1 respectively).
 * NOTE(review): presumably used for the idle-pipe wait elsewhere in this file — confirm at the users. */
#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)

/* Send buffer size, in bytes (8 MiB), requested for the journal stream socket in connect_logger_as(). */
#define SNDBUF_SIZE (8*1024*1024)
108
/* Relocates the n_fds descriptors listed in fds[] so that entry i ends up as descriptor number i+3.
 *
 * Note that the fds[] array is modified in place: every entry that is moved is overwritten with its new
 * descriptor number, and the old descriptor is closed.
 *
 * Returns 0 on success (also when n_fds is zero), or a negative errno-style error if fcntl() fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Keep making passes until one completes without any entry landing on the wrong number */
        while (first >= 0) {
                int retry = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, got;

                        /* Nothing to do if this one already sits where it belongs */
                        if (fds[idx] == want)
                                continue;

                        /* F_DUPFD hands out the lowest free descriptor that is >= want */
                        got = fcntl(fds[idx], F_DUPFD, want);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* The slot we wanted was still taken. Remember the first index where that
                         * happened, so that the next pass resumes from there. */
                        if (got != want && retry < 0)
                                retry = idx;
                }

                first = retry;
        }

        return 0;
}
148
/* Adjusts the file descriptor flags of the fds we are about to pass on.
 *
 * The first n_socket_fds entries of fds[] get O_NONBLOCK set or cleared according to 'nonblock'; the
 * remaining n_storage_fds entries are left alone in that regard. FD_CLOEXEC is dropped from all entries.
 *
 * Returns 0 on success, or the first negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always removed, since these fds are meant to survive into our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
181
1e22b5cd 182static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
183 assert(context);
184
1e22b5cd
LP
185 if (context->stdio_as_fds)
186 return NULL;
187
80876c20
LP
188 if (context->tty_path)
189 return context->tty_path;
190
191 return "/dev/console";
192}
193
1e22b5cd
LP
194static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
195 const char *path;
196
6ea832a2
LP
197 assert(context);
198
1e22b5cd 199 path = exec_context_tty_path(context);
6ea832a2 200
1e22b5cd
LP
201 if (context->tty_vhangup) {
202 if (p && p->stdin_fd >= 0)
203 (void) terminal_vhangup_fd(p->stdin_fd);
204 else if (path)
205 (void) terminal_vhangup(path);
206 }
6ea832a2 207
1e22b5cd
LP
208 if (context->tty_reset) {
209 if (p && p->stdin_fd >= 0)
210 (void) reset_terminal_fd(p->stdin_fd, true);
211 else if (path)
212 (void) reset_terminal(path);
213 }
214
215 if (context->tty_vt_disallocate && path)
216 (void) vt_disallocate(path);
6ea832a2
LP
217}
218
6af760f3
LP
219static bool is_terminal_input(ExecInput i) {
220 return IN_SET(i,
221 EXEC_INPUT_TTY,
222 EXEC_INPUT_TTY_FORCE,
223 EXEC_INPUT_TTY_FAIL);
224}
225
3a1286b6 226static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
227 return IN_SET(o,
228 EXEC_OUTPUT_TTY,
6af760f3
LP
229 EXEC_OUTPUT_KMSG_AND_CONSOLE,
230 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
231}
232
aac8c0c3
LP
233static bool is_kmsg_output(ExecOutput o) {
234 return IN_SET(o,
235 EXEC_OUTPUT_KMSG,
236 EXEC_OUTPUT_KMSG_AND_CONSOLE);
237}
238
6af760f3
LP
239static bool exec_context_needs_term(const ExecContext *c) {
240 assert(c);
241
242 /* Return true if the execution context suggests we should set $TERM to something useful. */
243
244 if (is_terminal_input(c->std_input))
245 return true;
246
247 if (is_terminal_output(c->std_output))
248 return true;
249
250 if (is_terminal_output(c->std_error))
251 return true;
252
253 return !!c->tty_path;
3a1286b6
MS
254}
255
/* Opens /dev/null with the specified open() flags (plus O_NOCTTY) and hands the resulting fd to move_fd()
 * to install it as descriptor 'nfd'. Returns -errno if the open() fails, otherwise whatever move_fd()
 * returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
267
91dd5f7c
LP
268static int connect_journal_socket(
269 int fd,
270 const char *log_namespace,
271 uid_t uid,
272 gid_t gid) {
273
f36a9d59
ZJS
274 union sockaddr_union sa;
275 socklen_t sa_len;
524daa8c
ZJS
276 uid_t olduid = UID_INVALID;
277 gid_t oldgid = GID_INVALID;
91dd5f7c 278 const char *j;
524daa8c
ZJS
279 int r;
280
91dd5f7c
LP
281 j = log_namespace ?
282 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
283 "/run/systemd/journal/stdout";
284 r = sockaddr_un_set_path(&sa.un, j);
285 if (r < 0)
286 return r;
f36a9d59 287 sa_len = r;
91dd5f7c 288
cad93f29 289 if (gid_is_valid(gid)) {
524daa8c
ZJS
290 oldgid = getgid();
291
92a17af9 292 if (setegid(gid) < 0)
524daa8c
ZJS
293 return -errno;
294 }
295
cad93f29 296 if (uid_is_valid(uid)) {
524daa8c
ZJS
297 olduid = getuid();
298
92a17af9 299 if (seteuid(uid) < 0) {
524daa8c
ZJS
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
f36a9d59 305 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
524daa8c
ZJS
306
307 /* If we fail to restore the uid or gid, things will likely
308 fail later on. This should only happen if an LSM interferes. */
309
cad93f29 310 if (uid_is_valid(uid))
524daa8c
ZJS
311 (void) seteuid(olduid);
312
313 restore_gid:
cad93f29 314 if (gid_is_valid(gid))
524daa8c
ZJS
315 (void) setegid(oldgid);
316
317 return r;
318}
319
fd1f9c89 320static int connect_logger_as(
34cf6c43 321 const Unit *unit,
fd1f9c89 322 const ExecContext *context,
af635cf3 323 const ExecParameters *params,
fd1f9c89
LP
324 ExecOutput output,
325 const char *ident,
fd1f9c89
LP
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
2ac1ff68
EV
330 _cleanup_close_ int fd = -1;
331 int r;
071830ff
LP
332
333 assert(context);
af635cf3 334 assert(params);
80876c20
LP
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
071830ff 338
54fe0cdb
LP
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
80876c20 341 return -errno;
071830ff 342
91dd5f7c 343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
344 if (r < 0)
345 return r;
071830ff 346
2ac1ff68 347 if (shutdown(fd, SHUT_RD) < 0)
80876c20 348 return -errno;
071830ff 349
fd1f9c89 350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 351
2ac1ff68 352 if (dprintf(fd,
62bca2c6 353 "%s\n"
80876c20
LP
354 "%s\n"
355 "%i\n"
54fe0cdb
LP
356 "%i\n"
357 "%i\n"
358 "%i\n"
4f4a1dbf 359 "%i\n",
c867611e 360 context->syslog_identifier ?: ident,
af635cf3 361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
f3dc6af2 364 false,
aac8c0c3 365 is_kmsg_output(output),
2ac1ff68
EV
366 is_terminal_output(output)) < 0)
367 return -errno;
80876c20 368
2ac1ff68 369 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 370}
2ac1ff68 371
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and installs it as descriptor
 * 'nfd' via move_fd(). Returns the negative error from open_terminal() on failure, otherwise the result
 * of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 384
2038c3f5 385static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
386 union sockaddr_union sa;
387 socklen_t sa_len;
15a3e96f 388 _cleanup_close_ int fd = -1;
86fca584 389 int r;
071830ff 390
80876c20 391 assert(path);
071830ff 392
2038c3f5
LP
393 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
394 flags |= O_CREAT;
395
396 fd = open(path, flags|O_NOCTTY, mode);
397 if (fd >= 0)
15a3e96f 398 return TAKE_FD(fd);
071830ff 399
2038c3f5
LP
400 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
401 return -errno;
2038c3f5
LP
402
403 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
404
86fca584
ZJS
405 r = sockaddr_un_set_path(&sa.un, path);
406 if (r < 0)
407 return r == -EINVAL ? -ENXIO : r;
408 sa_len = r;
409
2038c3f5
LP
410 fd = socket(AF_UNIX, SOCK_STREAM, 0);
411 if (fd < 0)
412 return -errno;
413
86fca584 414 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 415 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 416 * indication that this wasn't an AF_UNIX socket after all */
071830ff 417
2038c3f5
LP
418 if ((flags & O_ACCMODE) == O_RDONLY)
419 r = shutdown(fd, SHUT_WR);
420 else if ((flags & O_ACCMODE) == O_WRONLY)
421 r = shutdown(fd, SHUT_RD);
422 else
86fca584 423 r = 0;
15a3e96f 424 if (r < 0)
2038c3f5 425 return -errno;
2038c3f5 426
15a3e96f 427 return TAKE_FD(fd);
80876c20 428}
071830ff 429
08f3be7a
LP
430static int fixup_input(
431 const ExecContext *context,
432 int socket_fd,
433 bool apply_tty_stdin) {
434
435 ExecInput std_input;
436
437 assert(context);
438
439 std_input = context->std_input;
1e3ad081
LP
440
441 if (is_terminal_input(std_input) && !apply_tty_stdin)
442 return EXEC_INPUT_NULL;
071830ff 443
03fd9c49 444 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
445 return EXEC_INPUT_NULL;
446
08f3be7a
LP
447 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
448 return EXEC_INPUT_NULL;
449
03fd9c49 450 return std_input;
4f2d528d
LP
451}
452
7966a916 453static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 454
7966a916 455 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
456 return EXEC_OUTPUT_INHERIT;
457
7966a916 458 return output;
4f2d528d
LP
459}
460
a34ceba6
LP
461static int setup_input(
462 const ExecContext *context,
463 const ExecParameters *params,
52c239d7 464 int socket_fd,
2caa38e9 465 const int named_iofds[static 3]) {
a34ceba6 466
4f2d528d
LP
467 ExecInput i;
468
469 assert(context);
a34ceba6 470 assert(params);
2caa38e9 471 assert(named_iofds);
a34ceba6
LP
472
473 if (params->stdin_fd >= 0) {
474 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
475 return -errno;
476
477 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
478 if (isatty(STDIN_FILENO)) {
479 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
480 (void) reset_terminal_fd(STDIN_FILENO, true);
481 }
a34ceba6
LP
482
483 return STDIN_FILENO;
484 }
4f2d528d 485
08f3be7a 486 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
487
488 switch (i) {
071830ff 489
80876c20
LP
490 case EXEC_INPUT_NULL:
491 return open_null_as(O_RDONLY, STDIN_FILENO);
492
493 case EXEC_INPUT_TTY:
494 case EXEC_INPUT_TTY_FORCE:
495 case EXEC_INPUT_TTY_FAIL: {
046a82c1 496 int fd;
071830ff 497
1e22b5cd 498 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
499 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
500 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
501 ACQUIRE_TERMINAL_WAIT,
3a43da28 502 USEC_INFINITY);
970edce6 503 if (fd < 0)
80876c20
LP
504 return fd;
505
046a82c1 506 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
507 }
508
4f2d528d 509 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
510 assert(socket_fd >= 0);
511
4f2d528d
LP
512 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
52c239d7 514 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
515 assert(named_iofds[STDIN_FILENO] >= 0);
516
52c239d7
LB
517 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
518 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
519
08f3be7a
LP
520 case EXEC_INPUT_DATA: {
521 int fd;
522
523 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
524 if (fd < 0)
525 return fd;
526
527 return move_fd(fd, STDIN_FILENO, false);
528 }
529
2038c3f5
LP
530 case EXEC_INPUT_FILE: {
531 bool rw;
532 int fd;
533
534 assert(context->stdio_file[STDIN_FILENO]);
535
536 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
537 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
538
539 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
540 if (fd < 0)
541 return fd;
542
543 return move_fd(fd, STDIN_FILENO, false);
544 }
545
80876c20
LP
546 default:
547 assert_not_reached("Unknown input type");
548 }
549}
550
41fc585a
LP
551static bool can_inherit_stderr_from_stdout(
552 const ExecContext *context,
553 ExecOutput o,
554 ExecOutput e) {
555
556 assert(context);
557
558 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
559 * stderr fd */
560
561 if (e == EXEC_OUTPUT_INHERIT)
562 return true;
563 if (e != o)
564 return false;
565
566 if (e == EXEC_OUTPUT_NAMED_FD)
567 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
568
8d7dab1f 569 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
570 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
571
572 return true;
573}
574
a34ceba6 575static int setup_output(
34cf6c43 576 const Unit *unit,
a34ceba6
LP
577 const ExecContext *context,
578 const ExecParameters *params,
579 int fileno,
580 int socket_fd,
2caa38e9 581 const int named_iofds[static 3],
a34ceba6 582 const char *ident,
7bce046b
LP
583 uid_t uid,
584 gid_t gid,
585 dev_t *journal_stream_dev,
586 ino_t *journal_stream_ino) {
a34ceba6 587
4f2d528d
LP
588 ExecOutput o;
589 ExecInput i;
47c1d80d 590 int r;
4f2d528d 591
f2341e0a 592 assert(unit);
80876c20 593 assert(context);
a34ceba6 594 assert(params);
80876c20 595 assert(ident);
7bce046b
LP
596 assert(journal_stream_dev);
597 assert(journal_stream_ino);
80876c20 598
a34ceba6
LP
599 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
600
601 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
602 return -errno;
603
604 return STDOUT_FILENO;
605 }
606
607 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
608 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
609 return -errno;
610
611 return STDERR_FILENO;
612 }
613
08f3be7a 614 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 615 o = fixup_output(context->std_output, socket_fd);
4f2d528d 616
eb17e935
MS
617 if (fileno == STDERR_FILENO) {
618 ExecOutput e;
619 e = fixup_output(context->std_error, socket_fd);
80876c20 620
eb17e935
MS
621 /* This expects the input and output are already set up */
622
623 /* Don't change the stderr file descriptor if we inherit all
624 * the way and are not on a tty */
625 if (e == EXEC_OUTPUT_INHERIT &&
626 o == EXEC_OUTPUT_INHERIT &&
627 i == EXEC_INPUT_NULL &&
628 !is_terminal_input(context->std_input) &&
7966a916 629 getppid() != 1)
eb17e935
MS
630 return fileno;
631
632 /* Duplicate from stdout if possible */
41fc585a 633 if (can_inherit_stderr_from_stdout(context, o, e))
eb17e935 634 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 635
eb17e935 636 o = e;
80876c20 637
eb17e935 638 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
639 /* If input got downgraded, inherit the original value */
640 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 641 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 642
08f3be7a
LP
643 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
644 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 645 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 646
acb591e4
LP
647 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
648 if (getppid() != 1)
eb17e935 649 return fileno;
94f04347 650
eb17e935
MS
651 /* We need to open /dev/null here anew, to get the right access mode. */
652 return open_null_as(O_WRONLY, fileno);
071830ff 653 }
94f04347 654
eb17e935 655 switch (o) {
80876c20
LP
656
657 case EXEC_OUTPUT_NULL:
eb17e935 658 return open_null_as(O_WRONLY, fileno);
80876c20
LP
659
660 case EXEC_OUTPUT_TTY:
4f2d528d 661 if (is_terminal_input(i))
eb17e935 662 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
663
664 /* We don't reset the terminal if this is just about output */
1e22b5cd 665 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 666
9a6bca7a 667 case EXEC_OUTPUT_KMSG:
28dbc1e8 668 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
669 case EXEC_OUTPUT_JOURNAL:
670 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 671 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 672 if (r < 0) {
7966a916
ZJS
673 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
674 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 675 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
676 } else {
677 struct stat st;
678
679 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
680 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
681 * services to detect whether they are connected to the journal or not.
682 *
683 * If both stdout and stderr are connected to a stream then let's make sure to store the data
684 * about STDERR as that's usually the best way to do logging. */
7bce046b 685
ab2116b1
LP
686 if (fstat(fileno, &st) >= 0 &&
687 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
688 *journal_stream_dev = st.st_dev;
689 *journal_stream_ino = st.st_ino;
690 }
47c1d80d
MS
691 }
692 return r;
4f2d528d
LP
693
694 case EXEC_OUTPUT_SOCKET:
695 assert(socket_fd >= 0);
e75a9ed1 696
eb17e935 697 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 698
52c239d7 699 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
700 assert(named_iofds[fileno] >= 0);
701
52c239d7
LB
702 (void) fd_nonblock(named_iofds[fileno], false);
703 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
704
566b7d23 705 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
706 case EXEC_OUTPUT_FILE_APPEND:
707 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 708 bool rw;
566b7d23 709 int fd, flags;
2038c3f5
LP
710
711 assert(context->stdio_file[fileno]);
712
713 rw = context->std_input == EXEC_INPUT_FILE &&
714 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
715
716 if (rw)
717 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
718
566b7d23
ZD
719 flags = O_WRONLY;
720 if (o == EXEC_OUTPUT_FILE_APPEND)
721 flags |= O_APPEND;
8d7dab1f
LW
722 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
723 flags |= O_TRUNC;
566b7d23
ZD
724
725 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
726 if (fd < 0)
727 return fd;
728
566b7d23 729 return move_fd(fd, fileno, 0);
2038c3f5
LP
730 }
731
94f04347 732 default:
80876c20 733 assert_not_reached("Unknown error type");
94f04347 734 }
071830ff
LP
735}
736
02a51aba 737static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 738 int r;
02a51aba
LP
739
740 assert(fd >= 0);
02a51aba 741
1ff74fb6 742 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
743 if (isatty(fd) < 1) {
744 if (IN_SET(errno, EINVAL, ENOTTY))
745 return 0; /* not a tty */
1ff74fb6 746
02a51aba 747 return -errno;
4b3b5bc7 748 }
02a51aba 749
4b3b5bc7 750 /* This might fail. What matters are the results. */
f2df231f 751 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
752 if (r < 0)
753 return r;
02a51aba 754
4b3b5bc7 755 return 1;
02a51aba
LP
756}
757
7d5ceb64 758static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
759 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
760 int r;
80876c20 761
80876c20
LP
762 assert(_saved_stdin);
763 assert(_saved_stdout);
764
af6da548
LP
765 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
766 if (saved_stdin < 0)
767 return -errno;
80876c20 768
af6da548 769 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
770 if (saved_stdout < 0)
771 return -errno;
80876c20 772
8854d795 773 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
774 if (fd < 0)
775 return fd;
80876c20 776
af6da548
LP
777 r = chown_terminal(fd, getuid());
778 if (r < 0)
3d18b167 779 return r;
02a51aba 780
3d18b167
LP
781 r = reset_terminal_fd(fd, true);
782 if (r < 0)
783 return r;
80876c20 784
2b33ab09 785 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 786 fd = -1;
2b33ab09
LP
787 if (r < 0)
788 return r;
80876c20
LP
789
790 *_saved_stdin = saved_stdin;
791 *_saved_stdout = saved_stdout;
792
3d18b167 793 saved_stdin = saved_stdout = -1;
80876c20 794
3d18b167 795 return 0;
80876c20
LP
796}
797
63d77c92 798static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
799 assert(err < 0);
800
801 if (err == -ETIMEDOUT)
63d77c92 802 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
803 else {
804 errno = -err;
63d77c92 805 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
806 }
807}
808
63d77c92 809static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 810 _cleanup_close_ int fd = -1;
80876c20 811
3b20f877 812 assert(vc);
80876c20 813
7d5ceb64 814 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 815 if (fd < 0)
3b20f877 816 return;
80876c20 817
63d77c92 818 write_confirm_error_fd(err, fd, u);
af6da548 819}
80876c20 820
/* Undoes setup_confirm_stdio(): releases the terminal, duplicates the saved fds (where valid) back onto
 * stdin and stdout, and closes them.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. The saved fds are closed
 * either way. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
842
3b20f877
FB
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,  /* 'f': the command is not executed, the caller shall pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,  /* 's': the command is not executed, the caller shall pretend it succeeded */
        CONFIRM_EXECUTE         =  1,  /* 'y': the command shall be executed */
};
848
eedf223a 849static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 850 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 851 _cleanup_free_ char *e = NULL;
3b20f877 852 char c;
af6da548 853
3b20f877 854 /* For any internal errors, assume a positive response. */
7d5ceb64 855 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 856 if (r < 0) {
63d77c92 857 write_confirm_error(r, vc, u);
3b20f877
FB
858 return CONFIRM_EXECUTE;
859 }
af6da548 860
b0eb2944
FB
861 /* confirm_spawn might have been disabled while we were sleeping. */
862 if (manager_is_confirm_spawn_disabled(u->manager)) {
863 r = 1;
864 goto restore_stdio;
865 }
af6da548 866
2bcd3c26
FB
867 e = ellipsize(cmdline, 60, 100);
868 if (!e) {
869 log_oom();
870 r = CONFIRM_EXECUTE;
871 goto restore_stdio;
872 }
af6da548 873
d172b175 874 for (;;) {
539622bd 875 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 876 if (r < 0) {
63d77c92 877 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
878 r = CONFIRM_EXECUTE;
879 goto restore_stdio;
880 }
af6da548 881
d172b175 882 switch (c) {
b0eb2944
FB
883 case 'c':
884 printf("Resuming normal execution.\n");
885 manager_disable_confirm_spawn();
886 r = 1;
887 break;
dd6f9ac0
FB
888 case 'D':
889 unit_dump(u, stdout, " ");
890 continue; /* ask again */
d172b175
FB
891 case 'f':
892 printf("Failing execution.\n");
893 r = CONFIRM_PRETEND_FAILURE;
894 break;
895 case 'h':
b0eb2944
FB
896 printf(" c - continue, proceed without asking anymore\n"
897 " D - dump, show the state of the unit\n"
dd6f9ac0 898 " f - fail, don't execute the command and pretend it failed\n"
d172b175 899 " h - help\n"
eedf223a 900 " i - info, show a short summary of the unit\n"
56fde33a 901 " j - jobs, show jobs that are in progress\n"
d172b175
FB
902 " s - skip, don't execute the command and pretend it succeeded\n"
903 " y - yes, execute the command\n");
dd6f9ac0 904 continue; /* ask again */
eedf223a
FB
905 case 'i':
906 printf(" Description: %s\n"
907 " Unit: %s\n"
908 " Command: %s\n",
909 u->id, u->description, cmdline);
910 continue; /* ask again */
56fde33a
FB
911 case 'j':
912 manager_dump_jobs(u->manager, stdout, " ");
913 continue; /* ask again */
539622bd
FB
914 case 'n':
915 /* 'n' was removed in favor of 'f'. */
916 printf("Didn't understand 'n', did you mean 'f'?\n");
917 continue; /* ask again */
d172b175
FB
918 case 's':
919 printf("Skipping execution.\n");
920 r = CONFIRM_PRETEND_SUCCESS;
921 break;
922 case 'y':
923 r = CONFIRM_EXECUTE;
924 break;
925 default:
926 assert_not_reached("Unhandled choice");
927 }
3b20f877 928 break;
3b20f877 929 }
af6da548 930
3b20f877 931restore_stdio:
af6da548 932 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 933 return r;
80876c20
LP
934}
935
4d885bd3
DH
936static int get_fixed_user(const ExecContext *c, const char **user,
937 uid_t *uid, gid_t *gid,
938 const char **home, const char **shell) {
81a2b7ce 939 int r;
4d885bd3 940 const char *name;
81a2b7ce 941
4d885bd3 942 assert(c);
81a2b7ce 943
23deef88
LP
944 if (!c->user)
945 return 0;
946
4d885bd3
DH
947 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
948 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 949
23deef88 950 name = c->user;
fafff8f1 951 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
952 if (r < 0)
953 return r;
81a2b7ce 954
4d885bd3
DH
955 *user = name;
956 return 0;
957}
958
959static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
960 int r;
961 const char *name;
962
963 assert(c);
964
965 if (!c->group)
966 return 0;
967
968 name = c->group;
fafff8f1 969 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
970 if (r < 0)
971 return r;
972
973 *group = name;
974 return 0;
975}
976
cdc5d5c5
DH
977static int get_supplementary_groups(const ExecContext *c, const char *user,
978 const char *group, gid_t gid,
979 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
980 char **i;
981 int r, k = 0;
982 int ngroups_max;
983 bool keep_groups = false;
984 gid_t *groups = NULL;
985 _cleanup_free_ gid_t *l_gids = NULL;
986
987 assert(c);
988
bbeea271
DH
989 /*
990 * If user is given, then lookup GID and supplementary groups list.
991 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
992 * here and as early as possible so we keep the list of supplementary
993 * groups of the caller.
bbeea271
DH
994 */
995 if (user && gid_is_valid(gid) && gid != 0) {
996 /* First step, initialize groups from /etc/groups */
997 if (initgroups(user, gid) < 0)
998 return -errno;
999
1000 keep_groups = true;
1001 }
1002
ac6e8be6 1003 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1004 return 0;
1005
366ddd25
DH
1006 /*
1007 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1008 * be positive, otherwise fail.
1009 */
1010 errno = 0;
1011 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1012 if (ngroups_max <= 0)
1013 return errno_or_else(EOPNOTSUPP);
366ddd25 1014
4d885bd3
DH
1015 l_gids = new(gid_t, ngroups_max);
1016 if (!l_gids)
1017 return -ENOMEM;
81a2b7ce 1018
4d885bd3
DH
1019 if (keep_groups) {
1020 /*
1021 * Lookup the list of groups that the user belongs to, we
1022 * avoid NSS lookups here too for gid=0.
1023 */
1024 k = ngroups_max;
1025 if (getgrouplist(user, gid, l_gids, &k) < 0)
1026 return -EINVAL;
1027 } else
1028 k = 0;
81a2b7ce 1029
4d885bd3
DH
1030 STRV_FOREACH(i, c->supplementary_groups) {
1031 const char *g;
81a2b7ce 1032
4d885bd3
DH
1033 if (k >= ngroups_max)
1034 return -E2BIG;
81a2b7ce 1035
4d885bd3 1036 g = *i;
fafff8f1 1037 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1038 if (r < 0)
1039 return r;
81a2b7ce 1040
4d885bd3
DH
1041 k++;
1042 }
81a2b7ce 1043
4d885bd3
DH
1044 /*
1045 * Sets ngids to zero to drop all supplementary groups, happens
1046 * when we are under root and SupplementaryGroups= is empty.
1047 */
1048 if (k == 0) {
1049 *ngids = 0;
1050 return 0;
1051 }
81a2b7ce 1052
4d885bd3
DH
1053 /* Otherwise get the final list of supplementary groups */
1054 groups = memdup(l_gids, sizeof(gid_t) * k);
1055 if (!groups)
1056 return -ENOMEM;
1057
1058 *supplementary_gids = groups;
1059 *ngids = k;
1060
1061 groups = NULL;
1062
1063 return 0;
1064}
1065
/* Applies the group identity: first the supplementary group list (only when ngids > 0), then the
 * real/effective/saved GID (only when gid is valid). Returns 0 on success, or a negative errno-style
 * value as soon as one of the two steps fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= is only applied if the resolved list is non-empty */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* Without a valid GID there is nothing further to change */
        if (!gid_is_valid(gid))
                return 0;

        /* Finally switch all three GIDs at once */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1084
dbdc4098
TK
/* Rewrites the securebits of the calling process: every bit in 'mask' is cleared, then every bit in
 * 'bits' is set. Returns 0 if the resulting value equals the current one (nothing is written), 1 if
 * a new value was written, and -errno if either prctl() call fails. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Drop the masked bits first, then add the requested ones */
        after = (before & ~mask) | bits;

        /* Only touch the kernel state if something would actually change */
        if (after != before && prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return after != before;
}
1098
81a2b7ce 1099static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1100 assert(context);
dbdc4098 1101 int r;
81a2b7ce 1102
4d885bd3
DH
1103 if (!uid_is_valid(uid))
1104 return 0;
1105
479050b3 1106 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1107 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1108 * required, so we also need keep-caps in this case.
1109 */
81a2b7ce 1110
dbdc4098 1111 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1112
1113 /* First step: If we need to keep capabilities but
1114 * drop privileges we need to make sure we keep our
cbb21cca 1115 * caps, while we drop privileges. */
693ced48 1116 if (uid != 0) {
dbdc4098
TK
1117 /* Add KEEP_CAPS to the securebits */
1118 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1119 if (r < 0)
1120 return r;
693ced48 1121 }
81a2b7ce
LP
1122 }
1123
479050b3 1124 /* Second step: actually set the uids */
81a2b7ce
LP
1125 if (setresuid(uid, uid, uid) < 0)
1126 return -errno;
1127
1128 /* At this point we should have all necessary capabilities but
1129 are otherwise a normal user. However, the caps might got
1130 corrupted due to the setresuid() so we need clean them up
1131 later. This is done outside of this call. */
1132
1133 return 0;
1134}
1135
349cc4a5 1136#if HAVE_PAM
5b6319dc
LP
1137
1138static int null_conv(
1139 int num_msg,
1140 const struct pam_message **msg,
1141 struct pam_response **resp,
1142 void *appdata_ptr) {
1143
1144 /* We don't support conversations */
1145
1146 return PAM_CONV_ERR;
1147}
1148
cefc33ae
LP
1149#endif
1150
5b6319dc
LP
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process "(sd-pam)"
 * that tears the session down again later.
 *
 * The environment in *env is handed to PAM via pam_putenv(); on success *env is replaced by the list
 * returned by pam_getenvlist(). If 'tty' is NULL the TTY name of stdin is used, if one can be
 * determined. 'fds'/'n_fds' are closed in the helper process. 'uid'/'gid' are the IDs the helper
 * process switches to.
 *
 * Returns the result of strv_free_and_replace() on success, a negative errno-style value on failure
 * (-EPERM for PAM-level errors), and 0 when built without PAM support. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t saved_mask;
        int pam_code = PAM_SUCCESS, r;
        char **entry, **pam_env = NULL;
        bool session_open = false;
        pid_t helper_pid = 0, our_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* PAM is set up in this process, and then a child is forked off. That child lingers until it
         * is killed, either through PR_SET_PDEATHSIG or by systemd's cgroup logic, and then closes the
         * PAM session again. This process goes on to exec() the actual daemon, which guarantees that
         * the daemon's main PID is the one that was fork()ed initially. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* No TTY was passed explicitly, but stdin might still be one. If so, pick up its
                 * name from there. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment block into the PAM handle */
        STRV_FOREACH(entry, *env) {
                pam_code = pam_putenv(handle, *entry);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is logged, but not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_open = true;

        pam_env = pam_getenvlist(handle);
        if (!pam_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* SIGTERM is blocked before forking, so that it cannot get lost in the child */
        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        our_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &helper_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* This is the helper: its only purpose is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* The passed fds must not be kept open in the helper. The assumption is that apart
                 * from them only fds opened by PAM are open at this point. */
                (void) close_many(fds, n_fds);

                /* Privileges are dropped, as none are needed for pam_close_session(), and doing so
                 * makes PR_SET_PDEATHSIG work in most cases. Errors are only logged; if this fails
                 * expect sd-pam threads to fail to exit normally. */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Ask to be sent SIGTERM when the parent dies. This only works if the setresuid()
                 * above succeeded, as otherwise the kernel does not let unprivileged parents kill
                 * their privileged children this way. The control group kill logic covers the
                 * remaining cases. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Report to the parent that setup is complete. This matters in particular for the
                 * privilege dropping above, since unit setup might otherwise race against our
                 * setresuid(2) call.
                 *
                 * Should the parent have aborted, this is detected below, hence the return value is
                 * ignored. */
                (void) barrier_place(&barrier);

                /* Only wait for SIGTERM if the parent is still around */
                if (getppid() == our_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Retry on EINTR, give up on any other error */
                        while (sigwait(&ss, &sig) < 0)
                                if (errno != EINTR)
                                        goto child_finish;

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* The session is ended if the parent is gone */
                if (getppid() != our_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* The helper was forked off successfully and takes care of all cleanups, hence the handle is
         * dropped here. */
        handle = NULL;

        /* Restore the original signal mask in the parent, unblocking SIGTERM again */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* PAM modules might have opened the log, and that fd is not wanted around, hence close it
         * explicitly. */
        closelog();

        /* Wait synchronously until the helper finished initializing. There is no way to recover from
         * an error here, but complain loudly if one happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_code == PAM_SUCCESS)
                log_error_errno(r, "PAM failed: %m");
        else {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        }

        if (handle) {
                if (session_open)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(pam_env);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1374
5d6b1584
LP
/* Renames the current process to "(<name>)", where <name> is the basename() of 'path', reduced to
 * its last 8 characters if it is longer. If the basename is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        /* The final string has to fit into 10 characters (i.e. the length of "/sbin/init") to look
         * pretty in /bin/ps: two parentheses plus at most 8 characters of the name. */
        enum { TAIL_CHARS_MAX = 8 };

        char buf[1 + TAIL_CHARS_MAX + 1 + 1];
        const char *tail;
        size_t n;

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > TAIL_CHARS_MAX) {
                /* Keep the end of the name, it is usually the more interesting part, since the
                 * beginning might just be "systemd-" */
                tail += n - TAIL_CHARS_MAX;
                n = TAIL_CHARS_MAX;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1405
469830d1
LP
1406static bool context_has_address_families(const ExecContext *c) {
1407 assert(c);
1408
6b000af4 1409 return c->address_families_allow_list ||
469830d1
LP
1410 !set_isempty(c->address_families);
1411}
1412
1413static bool context_has_syscall_filters(const ExecContext *c) {
1414 assert(c);
1415
6b000af4 1416 return c->syscall_allow_list ||
8cfa775f 1417 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1418}
1419
9df2cdd8
TM
1420static bool context_has_syscall_logs(const ExecContext *c) {
1421 assert(c);
1422
1423 return c->syscall_log_allow_list ||
1424 !hashmap_isempty(c->syscall_log);
1425}
1426
469830d1
LP
1427static bool context_has_no_new_privileges(const ExecContext *c) {
1428 assert(c);
1429
1430 if (c->no_new_privileges)
1431 return true;
1432
1433 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1434 return false;
1435
1436 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1437 return c->lock_personality ||
469830d1 1438 c->memory_deny_write_execute ||
0538d2a8 1439 c->private_devices ||
fc64760d 1440 c->protect_clock ||
0538d2a8 1441 c->protect_hostname ||
469830d1
LP
1442 c->protect_kernel_tunables ||
1443 c->protect_kernel_modules ||
84703040 1444 c->protect_kernel_logs ||
0538d2a8
YW
1445 context_has_address_families(c) ||
1446 exec_context_restrict_namespaces_set(c) ||
1447 c->restrict_realtime ||
1448 c->restrict_suid_sgid ||
78e864e5 1449 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1450 context_has_syscall_filters(c) ||
1451 context_has_syscall_logs(c);
469830d1
LP
1452}
1453
bb0c0d6f
LP
1454static bool exec_context_has_credentials(const ExecContext *context) {
1455
1456 assert(context);
1457
1458 return !hashmap_isempty(context->set_credentials) ||
43144be4 1459 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1460}
1461
349cc4a5 1462#if HAVE_SECCOMP
17df7223 1463
83f12b27 1464static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1465
1466 if (is_seccomp_available())
1467 return false;
1468
f673b62d 1469 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1470 return true;
83f12b27
FS
1471}
1472
165a31c0 1473static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1474 uint32_t negative_action, default_action, action;
165a31c0 1475 int r;
8351ceae 1476
469830d1 1477 assert(u);
c0467cf3 1478 assert(c);
8351ceae 1479
469830d1 1480 if (!context_has_syscall_filters(c))
83f12b27
FS
1481 return 0;
1482
469830d1
LP
1483 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1484 return 0;
e9642be2 1485
005bfaf1 1486 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1487
6b000af4 1488 if (c->syscall_allow_list) {
469830d1
LP
1489 default_action = negative_action;
1490 action = SCMP_ACT_ALLOW;
7c66bae2 1491 } else {
469830d1
LP
1492 default_action = SCMP_ACT_ALLOW;
1493 action = negative_action;
57183d11 1494 }
8351ceae 1495
165a31c0 1496 if (needs_ambient_hack) {
6b000af4 1497 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1498 if (r < 0)
1499 return r;
1500 }
1501
b54f36c6 1502 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1503}
1504
9df2cdd8
TM
1505static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1506#ifdef SCMP_ACT_LOG
1507 uint32_t default_action, action;
1508#endif
1509
1510 assert(u);
1511 assert(c);
1512
1513 if (!context_has_syscall_logs(c))
1514 return 0;
1515
1516#ifdef SCMP_ACT_LOG
1517 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1518 return 0;
1519
1520 if (c->syscall_log_allow_list) {
1521 /* Log nothing but the ones listed */
1522 default_action = SCMP_ACT_ALLOW;
1523 action = SCMP_ACT_LOG;
1524 } else {
1525 /* Log everything but the ones listed */
1526 default_action = SCMP_ACT_LOG;
1527 action = SCMP_ACT_ALLOW;
1528 }
1529
1530 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1531#else
1532 /* old libseccomp */
1533 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1534 return 0;
1535#endif
1536}
1537
469830d1
LP
1538static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1539 assert(u);
4298d0b5
LP
1540 assert(c);
1541
469830d1 1542 if (set_isempty(c->syscall_archs))
83f12b27
FS
1543 return 0;
1544
469830d1
LP
1545 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1546 return 0;
4298d0b5 1547
469830d1
LP
1548 return seccomp_restrict_archs(c->syscall_archs);
1549}
4298d0b5 1550
469830d1
LP
1551static int apply_address_families(const Unit* u, const ExecContext *c) {
1552 assert(u);
1553 assert(c);
4298d0b5 1554
469830d1
LP
1555 if (!context_has_address_families(c))
1556 return 0;
4298d0b5 1557
469830d1
LP
1558 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1559 return 0;
4298d0b5 1560
6b000af4 1561 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1562}
4298d0b5 1563
83f12b27 1564static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1565 assert(u);
f3e43635
TM
1566 assert(c);
1567
469830d1 1568 if (!c->memory_deny_write_execute)
83f12b27
FS
1569 return 0;
1570
469830d1
LP
1571 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1572 return 0;
f3e43635 1573
469830d1 1574 return seccomp_memory_deny_write_execute();
f3e43635
TM
1575}
1576
83f12b27 1577static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1578 assert(u);
f4170c67
LP
1579 assert(c);
1580
469830d1 1581 if (!c->restrict_realtime)
83f12b27
FS
1582 return 0;
1583
469830d1
LP
1584 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1585 return 0;
f4170c67 1586
469830d1 1587 return seccomp_restrict_realtime();
f4170c67
LP
1588}
1589
f69567cb
LP
1590static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1591 assert(u);
1592 assert(c);
1593
1594 if (!c->restrict_suid_sgid)
1595 return 0;
1596
1597 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1598 return 0;
1599
1600 return seccomp_restrict_suid_sgid();
1601}
1602
59e856c7 1603static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1604 assert(u);
59eeb84b
LP
1605 assert(c);
1606
1607 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1608 * let's protect even those systems where this is left on in the kernel. */
1609
469830d1 1610 if (!c->protect_kernel_tunables)
59eeb84b
LP
1611 return 0;
1612
469830d1
LP
1613 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1614 return 0;
59eeb84b 1615
469830d1 1616 return seccomp_protect_sysctl();
59eeb84b
LP
1617}
1618
59e856c7 1619static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1620 assert(u);
502d704e
DH
1621 assert(c);
1622
25a8d8a0 1623 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1624
469830d1
LP
1625 if (!c->protect_kernel_modules)
1626 return 0;
1627
502d704e
DH
1628 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1629 return 0;
1630
b54f36c6 1631 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1632}
1633
84703040
KK
1634static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1635 assert(u);
1636 assert(c);
1637
1638 if (!c->protect_kernel_logs)
1639 return 0;
1640
1641 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1642 return 0;
1643
1644 return seccomp_protect_syslog();
1645}
1646
daf8f72b 1647static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1648 assert(u);
1649 assert(c);
1650
1651 if (!c->protect_clock)
1652 return 0;
1653
1654 if (skip_seccomp_unavailable(u, "ProtectClock="))
1655 return 0;
1656
1657 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1658}
1659
59e856c7 1660static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1661 assert(u);
ba128bb8
LP
1662 assert(c);
1663
8f81a5f6 1664 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1665
469830d1
LP
1666 if (!c->private_devices)
1667 return 0;
1668
ba128bb8
LP
1669 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1670 return 0;
1671
b54f36c6 1672 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1673}
1674
34cf6c43 1675static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1676 assert(u);
add00535
LP
1677 assert(c);
1678
1679 if (!exec_context_restrict_namespaces_set(c))
1680 return 0;
1681
1682 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1683 return 0;
1684
1685 return seccomp_restrict_namespaces(c->restrict_namespaces);
1686}
1687
78e864e5 1688static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1689 unsigned long personality;
1690 int r;
78e864e5
TM
1691
1692 assert(u);
1693 assert(c);
1694
1695 if (!c->lock_personality)
1696 return 0;
1697
1698 if (skip_seccomp_unavailable(u, "LockPersonality="))
1699 return 0;
1700
e8132d63
LP
1701 personality = c->personality;
1702
1703 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1704 if (personality == PERSONALITY_INVALID) {
1705
1706 r = opinionated_personality(&personality);
1707 if (r < 0)
1708 return r;
1709 }
78e864e5
TM
1710
1711 return seccomp_lock_personality(personality);
1712}
1713
c0467cf3 1714#endif
8351ceae 1715
daf8f72b 1716static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1717 assert(u);
1718 assert(c);
1719
1720 if (!c->protect_hostname)
1721 return 0;
1722
1723 if (ns_type_supported(NAMESPACE_UTS)) {
1724 if (unshare(CLONE_NEWUTS) < 0) {
1725 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1726 *ret_exit_status = EXIT_NAMESPACE;
1727 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1728 }
1729
1730 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1731 }
1732 } else
1733 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1734
1735#if HAVE_SECCOMP
8f3e342f
ZJS
1736 int r;
1737
daf8f72b
LP
1738 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1739 return 0;
1740
1741 r = seccomp_protect_hostname();
1742 if (r < 0) {
1743 *ret_exit_status = EXIT_SECCOMP;
1744 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1745 }
1746#endif
1747
1748 return 0;
1749}
1750
3042bbeb 1751static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1752 assert(idle_pipe);
1753
54eb2300
LP
1754 idle_pipe[1] = safe_close(idle_pipe[1]);
1755 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1756
1757 if (idle_pipe[0] >= 0) {
1758 int r;
1759
1760 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1761
1762 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1763 ssize_t n;
1764
31a7eb86 1765 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1766 n = write(idle_pipe[3], "x", 1);
1767 if (n > 0)
cd972d69 1768 /* Wait for systemd to react to the signal above. */
54756dce 1769 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1770 }
1771
54eb2300 1772 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1773
1774 }
1775
54eb2300 1776 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1777}
1778
fb2042dd
YW
1779static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1780
7cae38c4 1781static int build_environment(
34cf6c43 1782 const Unit *u,
9fa95f85 1783 const ExecContext *c,
1e22b5cd 1784 const ExecParameters *p,
da6053d0 1785 size_t n_fds,
7cae38c4
LP
1786 const char *home,
1787 const char *username,
1788 const char *shell,
7bce046b
LP
1789 dev_t journal_stream_dev,
1790 ino_t journal_stream_ino,
7cae38c4
LP
1791 char ***ret) {
1792
1793 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1794 size_t n_env = 0;
7cae38c4
LP
1795 char *x;
1796
4b58153d 1797 assert(u);
7cae38c4 1798 assert(c);
7c1cb6f1 1799 assert(p);
7cae38c4
LP
1800 assert(ret);
1801
dc4e2940 1802#define N_ENV_VARS 17
8d5bb13d 1803 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1804 if (!our_env)
1805 return -ENOMEM;
1806
1807 if (n_fds > 0) {
8dd4c05b
LP
1808 _cleanup_free_ char *joined = NULL;
1809
df0ff127 1810 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1811 return -ENOMEM;
1812 our_env[n_env++] = x;
1813
da6053d0 1814 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1815 return -ENOMEM;
1816 our_env[n_env++] = x;
8dd4c05b 1817
1e22b5cd 1818 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1819 if (!joined)
1820 return -ENOMEM;
1821
605405c6 1822 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1823 if (!x)
1824 return -ENOMEM;
1825 our_env[n_env++] = x;
7cae38c4
LP
1826 }
1827
b08af3b1 1828 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1829 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1830 return -ENOMEM;
1831 our_env[n_env++] = x;
1832
1e22b5cd 1833 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1834 return -ENOMEM;
1835 our_env[n_env++] = x;
1836 }
1837
fd63e712
LP
1838 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1839 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1840 * check the database directly. */
ac647978 1841 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1842 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1843 if (!x)
1844 return -ENOMEM;
1845 our_env[n_env++] = x;
1846 }
1847
7cae38c4 1848 if (home) {
b910cc72 1849 x = strjoin("HOME=", home);
7cae38c4
LP
1850 if (!x)
1851 return -ENOMEM;
7bbead1d 1852
4ff361cc 1853 path_simplify(x + 5);
7cae38c4
LP
1854 our_env[n_env++] = x;
1855 }
1856
1857 if (username) {
b910cc72 1858 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1859 if (!x)
1860 return -ENOMEM;
1861 our_env[n_env++] = x;
1862
b910cc72 1863 x = strjoin("USER=", username);
7cae38c4
LP
1864 if (!x)
1865 return -ENOMEM;
1866 our_env[n_env++] = x;
1867 }
1868
1869 if (shell) {
b910cc72 1870 x = strjoin("SHELL=", shell);
7cae38c4
LP
1871 if (!x)
1872 return -ENOMEM;
7bbead1d 1873
4ff361cc 1874 path_simplify(x + 6);
7cae38c4
LP
1875 our_env[n_env++] = x;
1876 }
1877
4b58153d
LP
1878 if (!sd_id128_is_null(u->invocation_id)) {
1879 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1880 return -ENOMEM;
1881
1882 our_env[n_env++] = x;
1883 }
1884
6af760f3
LP
1885 if (exec_context_needs_term(c)) {
1886 const char *tty_path, *term = NULL;
1887
1888 tty_path = exec_context_tty_path(c);
1889
e8cf09b2
LP
1890 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1891 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1892 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1893
e8cf09b2 1894 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1895 term = getenv("TERM");
e8cf09b2 1896
6af760f3
LP
1897 if (!term)
1898 term = default_term_for_tty(tty_path);
7cae38c4 1899
b910cc72 1900 x = strjoin("TERM=", term);
7cae38c4
LP
1901 if (!x)
1902 return -ENOMEM;
1903 our_env[n_env++] = x;
1904 }
1905
7bce046b
LP
1906 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1907 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1908 return -ENOMEM;
1909
1910 our_env[n_env++] = x;
1911 }
1912
91dd5f7c
LP
1913 if (c->log_namespace) {
1914 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1915 if (!x)
1916 return -ENOMEM;
1917
1918 our_env[n_env++] = x;
1919 }
1920
5b10116e 1921 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
fb2042dd
YW
1922 _cleanup_free_ char *pre = NULL, *joined = NULL;
1923 const char *n;
1924
1925 if (!p->prefix[t])
1926 continue;
1927
1928 if (strv_isempty(c->directories[t].paths))
1929 continue;
1930
1931 n = exec_directory_env_name_to_string(t);
1932 if (!n)
1933 continue;
1934
1935 pre = strjoin(p->prefix[t], "/");
1936 if (!pre)
1937 return -ENOMEM;
1938
48904c8b 1939 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
fb2042dd
YW
1940 if (!joined)
1941 return -ENOMEM;
1942
1943 x = strjoin(n, "=", joined);
1944 if (!x)
1945 return -ENOMEM;
1946
1947 our_env[n_env++] = x;
1948 }
1949
bb0c0d6f
LP
1950 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1951 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1952 if (!x)
1953 return -ENOMEM;
1954
1955 our_env[n_env++] = x;
1956 }
1957
dc4e2940
YW
1958 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1959 return -ENOMEM;
1960
1961 our_env[n_env++] = x;
1962
7cae38c4 1963 our_env[n_env++] = NULL;
8d5bb13d
LP
1964 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1965#undef N_ENV_VARS
7cae38c4 1966
ae2a15bc 1967 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1968
1969 return 0;
1970}
1971
b4c14404
FB
1972static int build_pass_environment(const ExecContext *c, char ***ret) {
1973 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1974 size_t n_env = 0;
b4c14404
FB
1975 char **i;
1976
1977 STRV_FOREACH(i, c->pass_environment) {
1978 _cleanup_free_ char *x = NULL;
1979 char *v;
1980
1981 v = getenv(*i);
1982 if (!v)
1983 continue;
605405c6 1984 x = strjoin(*i, "=", v);
b4c14404
FB
1985 if (!x)
1986 return -ENOMEM;
00819cc1 1987
319a4f4b 1988 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 1989 return -ENOMEM;
00819cc1 1990
1cc6c93a 1991 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 1992 pass_env[n_env] = NULL;
b4c14404
FB
1993 }
1994
ae2a15bc 1995 *ret = TAKE_PTR(pass_env);
b4c14404
FB
1996
1997 return 0;
1998}
1999
5e8deb94 2000bool exec_needs_mount_namespace(
8b44a3d2
LP
2001 const ExecContext *context,
2002 const ExecParameters *params,
4657abb5 2003 const ExecRuntime *runtime) {
8b44a3d2
LP
2004
2005 assert(context);
8b44a3d2 2006
915e6d16
LP
2007 if (context->root_image)
2008 return true;
2009
2a624c36
AP
2010 if (!strv_isempty(context->read_write_paths) ||
2011 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2012 !strv_isempty(context->inaccessible_paths) ||
2013 !strv_isempty(context->exec_paths) ||
2014 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2015 return true;
2016
42b1d8e0 2017 if (context->n_bind_mounts > 0)
d2d6c096
LP
2018 return true;
2019
2abd4e38
YW
2020 if (context->n_temporary_filesystems > 0)
2021 return true;
2022
b3d13314
LB
2023 if (context->n_mount_images > 0)
2024 return true;
2025
93f59701
LB
2026 if (context->n_extension_images > 0)
2027 return true;
2028
37ed15d7 2029 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2030 return true;
2031
2032 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2033 return true;
2034
8b44a3d2 2035 if (context->private_devices ||
228af36f 2036 context->private_mounts ||
8b44a3d2 2037 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2038 context->protect_home != PROTECT_HOME_NO ||
2039 context->protect_kernel_tunables ||
c575770b 2040 context->protect_kernel_modules ||
94a7b275 2041 context->protect_kernel_logs ||
4e399953
LP
2042 context->protect_control_groups ||
2043 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2044 context->proc_subset != PROC_SUBSET_ALL ||
2045 context->private_ipc ||
2046 context->ipc_namespace_path)
8b44a3d2
LP
2047 return true;
2048
37c56f89 2049 if (context->root_directory) {
5e98086d 2050 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2051 return true;
2052
5b10116e 2053 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2054 if (params && !params->prefix[t])
37c56f89
YW
2055 continue;
2056
2057 if (!strv_isempty(context->directories[t].paths))
2058 return true;
2059 }
2060 }
5d997827 2061
42b1d8e0 2062 if (context->dynamic_user &&
b43ee82f 2063 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
42b1d8e0
YW
2064 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2065 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2066 return true;
2067
91dd5f7c
LP
2068 if (context->log_namespace)
2069 return true;
2070
8b44a3d2
LP
2071 return false;
2072}
2073
5749f855 2074static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2075 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2076 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2077 _cleanup_close_ int unshare_ready_fd = -1;
2078 _cleanup_(sigkill_waitp) pid_t pid = 0;
2079 uint64_t c = 1;
d251207d
LP
2080 ssize_t n;
2081 int r;
2082
5749f855
AZ
2083 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2084 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2085 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2086 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2087 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2088 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2089 * continues execution normally.
2090 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2091 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2092
5749f855
AZ
2093 /* Can only set up multiple mappings with CAP_SETUID. */
2094 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2095 r = asprintf(&uid_map,
5749f855 2096 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2097 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2098 ouid, ouid, uid, uid);
2099 else
2100 r = asprintf(&uid_map,
2101 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2102 ouid, ouid);
d251207d 2103
5749f855
AZ
2104 if (r < 0)
2105 return -ENOMEM;
2106
2107 /* Can only set up multiple mappings with CAP_SETGID. */
2108 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2109 r = asprintf(&gid_map,
5749f855 2110 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2111 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2112 ogid, ogid, gid, gid);
2113 else
2114 r = asprintf(&gid_map,
2115 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2116 ogid, ogid);
2117
2118 if (r < 0)
2119 return -ENOMEM;
d251207d
LP
2120
2121 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2122 * namespace. */
2123 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2124 if (unshare_ready_fd < 0)
2125 return -errno;
2126
2127 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2128 * failed. */
2129 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2130 return -errno;
2131
4c253ed1
LP
2132 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2133 if (r < 0)
2134 return r;
2135 if (r == 0) {
d251207d
LP
2136 _cleanup_close_ int fd = -1;
2137 const char *a;
2138 pid_t ppid;
2139
2140 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2141 * here, after the parent opened its own user namespace. */
2142
2143 ppid = getppid();
2144 errno_pipe[0] = safe_close(errno_pipe[0]);
2145
2146 /* Wait until the parent unshared the user namespace */
2147 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2148 r = -errno;
2149 goto child_fail;
2150 }
2151
2152 /* Disable the setgroups() system call in the child user namespace, for good. */
2153 a = procfs_file_alloca(ppid, "setgroups");
2154 fd = open(a, O_WRONLY|O_CLOEXEC);
2155 if (fd < 0) {
2156 if (errno != ENOENT) {
2157 r = -errno;
2158 goto child_fail;
2159 }
2160
2161 /* If the file is missing the kernel is too old, let's continue anyway. */
2162 } else {
2163 if (write(fd, "deny\n", 5) < 0) {
2164 r = -errno;
2165 goto child_fail;
2166 }
2167
2168 fd = safe_close(fd);
2169 }
2170
2171 /* First write the GID map */
2172 a = procfs_file_alloca(ppid, "gid_map");
2173 fd = open(a, O_WRONLY|O_CLOEXEC);
2174 if (fd < 0) {
2175 r = -errno;
2176 goto child_fail;
2177 }
2178 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2179 r = -errno;
2180 goto child_fail;
2181 }
2182 fd = safe_close(fd);
2183
2184 /* The write the UID map */
2185 a = procfs_file_alloca(ppid, "uid_map");
2186 fd = open(a, O_WRONLY|O_CLOEXEC);
2187 if (fd < 0) {
2188 r = -errno;
2189 goto child_fail;
2190 }
2191 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2192 r = -errno;
2193 goto child_fail;
2194 }
2195
2196 _exit(EXIT_SUCCESS);
2197
2198 child_fail:
2199 (void) write(errno_pipe[1], &r, sizeof(r));
2200 _exit(EXIT_FAILURE);
2201 }
2202
2203 errno_pipe[1] = safe_close(errno_pipe[1]);
2204
2205 if (unshare(CLONE_NEWUSER) < 0)
2206 return -errno;
2207
2208 /* Let the child know that the namespace is ready now */
2209 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2210 return -errno;
2211
2212 /* Try to read an error code from the child */
2213 n = read(errno_pipe[0], &r, sizeof(r));
2214 if (n < 0)
2215 return -errno;
2216 if (n == sizeof(r)) { /* an error code was sent to us */
2217 if (r < 0)
2218 return r;
2219 return -EIO;
2220 }
2221 if (n != 0) /* on success we should have read 0 bytes */
2222 return -EIO;
2223
2e87a1fd
LP
2224 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2225 pid = 0;
d251207d
LP
2226 if (r < 0)
2227 return r;
2e87a1fd 2228 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2229 return -EIO;
2230
2231 return 0;
2232}
2233
494d0247
YW
2234static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2235 if (!context->dynamic_user)
2236 return false;
2237
2238 if (type == EXEC_DIRECTORY_CONFIGURATION)
2239 return false;
2240
2241 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2242 return false;
2243
2244 return true;
2245}
2246
3536f49e 2247static int setup_exec_directory(
07689d5d
LP
2248 const ExecContext *context,
2249 const ExecParameters *params,
2250 uid_t uid,
3536f49e 2251 gid_t gid,
3536f49e
YW
2252 ExecDirectoryType type,
2253 int *exit_status) {
07689d5d 2254
72fd1768 2255 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2256 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2257 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2258 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2259 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2260 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2261 };
07689d5d
LP
2262 char **rt;
2263 int r;
2264
2265 assert(context);
2266 assert(params);
72fd1768 2267 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2268 assert(exit_status);
07689d5d 2269
3536f49e
YW
2270 if (!params->prefix[type])
2271 return 0;
2272
8679efde 2273 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2274 if (!uid_is_valid(uid))
2275 uid = 0;
2276 if (!gid_is_valid(gid))
2277 gid = 0;
2278 }
2279
2280 STRV_FOREACH(rt, context->directories[type].paths) {
6c47cd7d 2281 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2282
edbfeb12 2283 p = path_join(params->prefix[type], *rt);
3536f49e
YW
2284 if (!p) {
2285 r = -ENOMEM;
2286 goto fail;
2287 }
07689d5d 2288
23a7448e
YW
2289 r = mkdir_parents_label(p, 0755);
2290 if (r < 0)
3536f49e 2291 goto fail;
23a7448e 2292
494d0247 2293 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2294 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2295 * case we want to avoid leaving a directory around fully accessible that is owned by
2296 * a dynamic user whose UID is later on reused. To lock this down we use the same
2297 * trick used by container managers to prohibit host users to get access to files of
2298 * the same UID in containers: we place everything inside a directory that has an
2299 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2300 * for unprivileged host code. We then use fs namespacing to make this directory
2301 * permeable for the service itself.
6c47cd7d 2302 *
3f5b1508
LP
2303 * Specifically: for a service which wants a special directory "foo/" we first create
2304 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2305 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2306 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2307 * unprivileged host users can't look into it. Inside of the namespace of the unit
2308 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2309 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2310 * for the service and making sure it only gets access to the dirs it needs but no
2311 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2312 *
3f5b1508
LP
2313 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2314 * to be owned by the service itself.
2315 *
2316 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2317 * for sharing files or sockets with other services. */
6c47cd7d 2318
4ede9802
LP
2319 pp = path_join(params->prefix[type], "private");
2320 if (!pp) {
6c47cd7d
LP
2321 r = -ENOMEM;
2322 goto fail;
2323 }
2324
2325 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2326 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2327 if (r < 0)
2328 goto fail;
2329
4ede9802 2330 if (!path_extend(&pp, *rt)) {
6c47cd7d
LP
2331 r = -ENOMEM;
2332 goto fail;
2333 }
2334
2335 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2336 r = mkdir_parents_label(pp, 0755);
2337 if (r < 0)
2338 goto fail;
2339
949befd3
LP
2340 if (is_dir(p, false) > 0 &&
2341 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2342
2343 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2344 * it over. Most likely the service has been upgraded from one that didn't use
2345 * DynamicUser=1, to one that does. */
2346
cf52c45d
LP
2347 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2348 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2349 exec_directory_type_to_string(type), p, pp);
2350
949befd3
LP
2351 if (rename(p, pp) < 0) {
2352 r = -errno;
2353 goto fail;
2354 }
2355 } else {
2356 /* Otherwise, create the actual directory for the service */
2357
2358 r = mkdir_label(pp, context->directories[type].mode);
2359 if (r < 0 && r != -EEXIST)
2360 goto fail;
2361 }
6c47cd7d 2362
6c47cd7d 2363 /* And link it up from the original place */
6c9c51e5 2364 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2365 if (r < 0)
2366 goto fail;
2367
6c47cd7d 2368 } else {
5c6d40d1
LP
2369 _cleanup_free_ char *target = NULL;
2370
2371 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2372 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2373 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2374
2375 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2376 * by DynamicUser=1 (see above)?
2377 *
2378 * We do this for all directory types except for ConfigurationDirectory=,
2379 * since they all support the private/ symlink logic at least in some
2380 * configurations, see above. */
5c6d40d1 2381
578dc69f
YW
2382 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2383 if (r < 0)
2384 goto fail;
2385
5c6d40d1
LP
2386 q = path_join(params->prefix[type], "private", *rt);
2387 if (!q) {
2388 r = -ENOMEM;
2389 goto fail;
2390 }
2391
578dc69f
YW
2392 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2393 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2394 if (r < 0)
2395 goto fail;
2396
2397 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2398
2399 /* Hmm, apparently DynamicUser= was once turned on for this service,
2400 * but is no longer. Let's move the directory back up. */
2401
cf52c45d
LP
2402 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2403 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2404 exec_directory_type_to_string(type), q, p);
2405
5c6d40d1
LP
2406 if (unlink(p) < 0) {
2407 r = -errno;
2408 goto fail;
2409 }
2410
2411 if (rename(q, p) < 0) {
2412 r = -errno;
2413 goto fail;
2414 }
2415 }
2416 }
2417
6c47cd7d 2418 r = mkdir_label(p, context->directories[type].mode);
d484580c 2419 if (r < 0) {
d484580c
LP
2420 if (r != -EEXIST)
2421 goto fail;
2422
206e9864
LP
2423 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2424 struct stat st;
2425
2426 /* Don't change the owner/access mode of the configuration directory,
2427 * as in the common case it is not written to by a service, and shall
2428 * not be writable. */
2429
2430 if (stat(p, &st) < 0) {
2431 r = -errno;
2432 goto fail;
2433 }
2434
2435 /* Still complain if the access mode doesn't match */
2436 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2437 log_warning("%s \'%s\' already exists but the mode is different. "
2438 "(File system: %o %sMode: %o)",
2439 exec_directory_type_to_string(type), *rt,
2440 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2441
6cff72eb 2442 continue;
206e9864 2443 }
6cff72eb 2444 }
a1164ae3 2445 }
07689d5d 2446
206e9864 2447 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2448 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2449 * current UID/GID ownership.) */
2450 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2451 if (r < 0)
2452 goto fail;
c71b2eb7 2453
607b358e
LP
2454 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2455 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2456 * assignments to exist. */
607b358e 2457 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2458 if (r < 0)
3536f49e 2459 goto fail;
07689d5d
LP
2460 }
2461
2462 return 0;
3536f49e
YW
2463
2464fail:
2465 *exit_status = exit_status_table[type];
3536f49e 2466 return r;
07689d5d
LP
2467}
2468
bb0c0d6f
LP
2469static int write_credential(
2470 int dfd,
2471 const char *id,
2472 const void *data,
2473 size_t size,
2474 uid_t uid,
2475 bool ownership_ok) {
2476
2477 _cleanup_(unlink_and_freep) char *tmp = NULL;
2478 _cleanup_close_ int fd = -1;
2479 int r;
2480
2481 r = tempfn_random_child("", "cred", &tmp);
2482 if (r < 0)
2483 return r;
2484
2485 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2486 if (fd < 0) {
2487 tmp = mfree(tmp);
2488 return -errno;
2489 }
2490
43144be4 2491 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2492 if (r < 0)
2493 return r;
2494
2495 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2496 return -errno;
2497
2498 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2499 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2500 if (r < 0) {
2501 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2502 return r;
2503
2504 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2505 * to express: that the user gets read access and nothing
2506 * else. But if the backing fs can't support that (e.g. ramfs)
2507 * then we can use file ownership instead. But that's only safe if
2508 * we can then re-mount the whole thing read-only, so that the
2509 * user can no longer chmod() the file to gain write access. */
2510 return r;
2511
f5fbe71d 2512 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2513 return -errno;
2514 }
2515 }
2516
2517 if (renameat(dfd, tmp, dfd, id) < 0)
2518 return -errno;
2519
2520 tmp = mfree(tmp);
2521 return 0;
2522}
2523
bb0c0d6f
LP
2524static int acquire_credentials(
2525 const ExecContext *context,
2526 const ExecParameters *params,
d3dcf4e3 2527 const char *unit,
bb0c0d6f
LP
2528 const char *p,
2529 uid_t uid,
2530 bool ownership_ok) {
2531
43144be4 2532 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2533 _cleanup_close_ int dfd = -1;
43144be4 2534 ExecLoadCredential *lc;
bb0c0d6f 2535 ExecSetCredential *sc;
bb0c0d6f
LP
2536 int r;
2537
2538 assert(context);
2539 assert(p);
2540
2541 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2542 if (dfd < 0)
2543 return -errno;
2544
43144be4
LP
2545 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2546 HASHMAP_FOREACH(lc, context->load_credentials) {
2547 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
bb0c0d6f 2548 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2549 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2550 bool missing_ok = true;
bb0c0d6f
LP
2551 const char *source;
2552 size_t size, add;
2553
43144be4 2554 if (path_is_absolute(lc->path)) {
bb0c0d6f 2555 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
43144be4 2556 source = lc->path;
bb0c0d6f 2557 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2558
2559 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2560 * via the source socket address in case we read off an AF_UNIX socket. */
43144be4 2561 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
d3dcf4e3
LP
2562 return -ENOMEM;
2563
fc682be2
LP
2564 missing_ok = false;
2565
bb0c0d6f
LP
2566 } else if (params->received_credentials) {
2567 /* If this is a relative path, take it relative to the credentials we received
2568 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2569 * on a credential store, i.e. this is guaranteed to be regular files. */
43144be4 2570 j = path_join(params->received_credentials, lc->path);
bb0c0d6f
LP
2571 if (!j)
2572 return -ENOMEM;
2573
2574 source = j;
2575 } else
2576 source = NULL;
2577
2578 if (source)
43144be4
LP
2579 r = read_full_file_full(
2580 AT_FDCWD, source,
2581 UINT64_MAX,
2582 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2583 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2584 bindname,
2585 &data, &size);
bb0c0d6f
LP
2586 else
2587 r = -ENOENT;
43144be4 2588 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
fc682be2
LP
2589 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2590 * will get clear errors if we don't pass such a missing credential on as they
2591 * themselves will get ENOENT when trying to read them, which should not be much
2592 * worse than when we handle the error here and make it fatal.
2593 *
43144be4
LP
2594 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2595 * we are fine, too. */
2596 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
bb0c0d6f 2597 continue;
fc682be2 2598 }
bb0c0d6f 2599 if (r < 0)
43144be4
LP
2600 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2601
2602 if (lc->encrypted) {
2603 _cleanup_free_ void *plaintext = NULL;
2604 size_t plaintext_size = 0;
2605
2606 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2607 if (r < 0)
2608 return r;
bb0c0d6f 2609
43144be4
LP
2610 free_and_replace(data, plaintext);
2611 size = plaintext_size;
2612 }
2613
2614 add = strlen(lc->id) + size;
bb0c0d6f
LP
2615 if (add > left)
2616 return -E2BIG;
2617
43144be4 2618 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
bb0c0d6f
LP
2619 if (r < 0)
2620 return r;
2621
2622 left -= add;
2623 }
2624
43144be4
LP
2625 /* First we use the literally specified credentials. Note that they might be overridden again below,
2626 * and thus act as a "default" if the same credential is specified multiple times */
2627 HASHMAP_FOREACH(sc, context->set_credentials) {
2628 _cleanup_(erase_and_freep) void *plaintext = NULL;
2629 const char *data;
2630 size_t size, add;
2631
2632 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2633 continue;
2634 if (errno != ENOENT)
2635 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2636
2637 if (sc->encrypted) {
2638 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2639 if (r < 0)
2640 return r;
2641
2642 data = plaintext;
2643 } else {
2644 data = sc->data;
2645 size = sc->size;
2646 }
2647
2648 add = strlen(sc->id) + size;
2649 if (add > left)
2650 return -E2BIG;
2651
2652 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2653 if (r < 0)
2654 return r;
2655
2656
2657 left -= add;
2658 }
2659
bb0c0d6f
LP
2660 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2661 return -errno;
2662
2663 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2664 * accessible */
2665
2666 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2667 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2668 if (r < 0) {
2669 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2670 return r;
2671
2672 if (!ownership_ok)
2673 return r;
2674
f5fbe71d 2675 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2676 return -errno;
2677 }
2678 }
2679
2680 return 0;
2681}
2682
2683static int setup_credentials_internal(
2684 const ExecContext *context,
2685 const ExecParameters *params,
d3dcf4e3 2686 const char *unit,
bb0c0d6f
LP
2687 const char *final, /* This is where the credential store shall eventually end up at */
2688 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2689 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2690 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2691 uid_t uid) {
2692
2693 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2694 * if we mounted something; false if we definitely can't mount anything */
2695 bool final_mounted;
2696 const char *where;
2697
2698 assert(context);
2699 assert(final);
2700 assert(workspace);
2701
2702 if (reuse_workspace) {
2703 r = path_is_mount_point(workspace, NULL, 0);
2704 if (r < 0)
2705 return r;
2706 if (r > 0)
2707 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2708 else
2709 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2710 } else
2711 workspace_mounted = -1; /* ditto */
2712
2713 r = path_is_mount_point(final, NULL, 0);
2714 if (r < 0)
2715 return r;
2716 if (r > 0) {
2717 /* If the final place already has something mounted, we use that. If the workspace also has
2718 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2719 * different). */
2720 final_mounted = true;
2721
2722 if (workspace_mounted < 0) {
2723 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2724 * the final version to the workspace, and make it writable, so that we can make
2725 * changes */
2726
21935150
LP
2727 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2728 if (r < 0)
2729 return r;
bb0c0d6f 2730
21935150
LP
2731 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2732 if (r < 0)
2733 return r;
bb0c0d6f
LP
2734
2735 workspace_mounted = true;
2736 }
2737 } else
2738 final_mounted = false;
2739
2740 if (workspace_mounted < 0) {
2741 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2742 for (int try = 0;; try++) {
2743
2744 if (try == 0) {
2745 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
2746 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2747 if (r >= 0) {
bb0c0d6f
LP
2748 workspace_mounted = true;
2749 break;
2750 }
2751
2752 } else if (try == 1) {
2753 _cleanup_free_ char *opts = NULL;
2754
43144be4 2755 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
2756 return -ENOMEM;
2757
2758 /* Fall back to "tmpfs" otherwise */
21935150
LP
2759 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2760 if (r >= 0) {
bb0c0d6f
LP
2761 workspace_mounted = true;
2762 break;
2763 }
2764
2765 } else {
2766 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
2767 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2768 if (r < 0) {
2769 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2770 return r;
bb0c0d6f
LP
2771
2772 if (must_mount) /* If we it's not OK to use the plain directory
2773 * fallback, propagate all errors too */
21935150 2774 return r;
bb0c0d6f
LP
2775
2776 /* If we lack privileges to bind mount stuff, then let's gracefully
2777 * proceed for compat with container envs, and just use the final dir
2778 * as is. */
2779
2780 workspace_mounted = false;
2781 break;
2782 }
2783
2784 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
2785 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2786 if (r < 0)
2787 return r;
bb0c0d6f
LP
2788
2789 workspace_mounted = true;
2790 break;
2791 }
2792 }
2793 }
2794
2795 assert(!must_mount || workspace_mounted > 0);
2796 where = workspace_mounted ? workspace : final;
2797
d3dcf4e3 2798 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
2799 if (r < 0)
2800 return r;
2801
2802 if (workspace_mounted) {
2803 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
2804 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2805 if (r < 0)
2806 return r;
bb0c0d6f
LP
2807
2808 /* And mount it to the final place, read-only */
21935150
LP
2809 if (final_mounted)
2810 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2811 else
2812 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2813 if (r < 0)
2814 return r;
bb0c0d6f
LP
2815 } else {
2816 _cleanup_free_ char *parent = NULL;
2817
2818 /* If we do not have our own mount put used the plain directory fallback, then we need to
2819 * open access to the top-level credential directory and the per-service directory now */
2820
2821 parent = dirname_malloc(final);
2822 if (!parent)
2823 return -ENOMEM;
2824 if (chmod(parent, 0755) < 0)
2825 return -errno;
2826 }
2827
2828 return 0;
2829}
2830
2831static int setup_credentials(
2832 const ExecContext *context,
2833 const ExecParameters *params,
2834 const char *unit,
2835 uid_t uid) {
2836
2837 _cleanup_free_ char *p = NULL, *q = NULL;
2838 const char *i;
2839 int r;
2840
2841 assert(context);
2842 assert(params);
2843
2844 if (!exec_context_has_credentials(context))
2845 return 0;
2846
2847 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2848 return -EINVAL;
2849
2850 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2851 * and the subdir we mount over with a read-only file system readable by the service's user */
2852 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2853 if (!q)
2854 return -ENOMEM;
2855
2856 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2857 if (r < 0 && r != -EEXIST)
2858 return r;
2859
2860 p = path_join(q, unit);
2861 if (!p)
2862 return -ENOMEM;
2863
2864 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2865 if (r < 0 && r != -EEXIST)
2866 return r;
2867
2868 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2869 if (r < 0) {
2870 _cleanup_free_ char *t = NULL, *u = NULL;
2871
2872 /* If this is not a privilege or support issue then propagate the error */
2873 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2874 return r;
2875
2876 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2877 * it into place, so that users can't access half-initialized credential stores. */
2878 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2879 if (!t)
2880 return -ENOMEM;
2881
2882 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2883 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2884 * after it is fully set up */
2885 u = path_join(t, unit);
2886 if (!u)
2887 return -ENOMEM;
2888
2889 FOREACH_STRING(i, t, u) {
2890 r = mkdir_label(i, 0700);
2891 if (r < 0 && r != -EEXIST)
2892 return r;
2893 }
2894
2895 r = setup_credentials_internal(
2896 context,
2897 params,
d3dcf4e3 2898 unit,
bb0c0d6f
LP
2899 p, /* final mount point */
2900 u, /* temporary workspace to overmount */
2901 true, /* reuse the workspace if it is already a mount */
2902 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2903 uid);
2904
2905 (void) rmdir(u); /* remove the workspace again if we can. */
2906
2907 if (r < 0)
2908 return r;
2909
2910 } else if (r == 0) {
2911
2912 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2913 * we can use the same directory for all cases, after turning off propagation. Question
2914 * though is: where do we turn off propagation exactly, and where do we place the workspace
2915 * directory? We need some place that is guaranteed to be a mount point in the host, and
2916 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2917 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2918 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2919 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2920 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2921 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2922 * propagation on the former, and then overmount the latter.
2923 *
2924 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2925 * for this purpose, but there are few other candidates that work equally well for us, and
2926 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 2927 * that no one else sees this should be OK to do. */
bb0c0d6f 2928
21935150
LP
2929 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2930 if (r < 0)
bb0c0d6f
LP
2931 goto child_fail;
2932
2933 r = setup_credentials_internal(
2934 context,
2935 params,
d3dcf4e3 2936 unit,
bb0c0d6f
LP
2937 p, /* final mount point */
2938 "/dev/shm", /* temporary workspace to overmount */
2939 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2940 true, /* insist that something is mounted, do not allow fallback to plain directory */
2941 uid);
2942 if (r < 0)
2943 goto child_fail;
2944
2945 _exit(EXIT_SUCCESS);
2946
2947 child_fail:
2948 _exit(EXIT_FAILURE);
2949 }
2950
2951 return 0;
2952}
2953
#if ENABLE_SMACK
/* Applies the SMACK label for the process about to be executed.
 *
 * If the context carries an explicit process label, that one is applied to the calling process (PID 0
 * means "self"). Otherwise, and only if a build-time default label is defined, the exec label stored on
 * the executable is read and applied, falling back to the build-time default if the executable carries
 * none.
 *
 * Returns 0 on success, or the negative error reported by the SMACK helpers. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                int k;

                k = mac_smack_apply_pid(0, context->smack_process_label);
                return k < 0 ? k : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label_from_binary = NULL;
                int k;

                /* A missing attribute or a file system without xattr support is not an error here, we
                 * simply use the default label then. */
                k = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &label_from_binary);
                if (k < 0 && k != -ENODATA && k != -EOPNOTSUPP)
                        return k;

                k = mac_smack_apply_pid(0, label_from_binary ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (k < 0)
                        return k;
        }
#endif

        return 0;
}
#endif
cefc33ae 2985
6c47cd7d
LP
2986static int compile_bind_mounts(
2987 const ExecContext *context,
2988 const ExecParameters *params,
2989 BindMount **ret_bind_mounts,
da6053d0 2990 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2991 char ***ret_empty_directories) {
2992
2993 _cleanup_strv_free_ char **empty_directories = NULL;
2994 BindMount *bind_mounts;
5b10116e 2995 size_t n, h = 0;
6c47cd7d
LP
2996 int r;
2997
2998 assert(context);
2999 assert(params);
3000 assert(ret_bind_mounts);
3001 assert(ret_n_bind_mounts);
3002 assert(ret_empty_directories);
3003
3004 n = context->n_bind_mounts;
5b10116e 3005 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3006 if (!params->prefix[t])
3007 continue;
3008
3009 n += strv_length(context->directories[t].paths);
3010 }
3011
3012 if (n <= 0) {
3013 *ret_bind_mounts = NULL;
3014 *ret_n_bind_mounts = 0;
3015 *ret_empty_directories = NULL;
3016 return 0;
3017 }
3018
3019 bind_mounts = new(BindMount, n);
3020 if (!bind_mounts)
3021 return -ENOMEM;
3022
5b10116e 3023 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3024 BindMount *item = context->bind_mounts + i;
3025 char *s, *d;
3026
3027 s = strdup(item->source);
3028 if (!s) {
3029 r = -ENOMEM;
3030 goto finish;
3031 }
3032
3033 d = strdup(item->destination);
3034 if (!d) {
3035 free(s);
3036 r = -ENOMEM;
3037 goto finish;
3038 }
3039
3040 bind_mounts[h++] = (BindMount) {
3041 .source = s,
3042 .destination = d,
3043 .read_only = item->read_only,
3044 .recursive = item->recursive,
3045 .ignore_enoent = item->ignore_enoent,
3046 };
3047 }
3048
5b10116e 3049 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3050 char **suffix;
3051
3052 if (!params->prefix[t])
3053 continue;
3054
3055 if (strv_isempty(context->directories[t].paths))
3056 continue;
3057
494d0247 3058 if (exec_directory_is_private(context, t) &&
74e12520 3059 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3060 char *private_root;
3061
3062 /* So this is for a dynamic user, and we need to make sure the process can access its own
3063 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3064 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3065
657ee2d8 3066 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3067 if (!private_root) {
3068 r = -ENOMEM;
3069 goto finish;
3070 }
3071
3072 r = strv_consume(&empty_directories, private_root);
a635a7ae 3073 if (r < 0)
6c47cd7d 3074 goto finish;
6c47cd7d
LP
3075 }
3076
3077 STRV_FOREACH(suffix, context->directories[t].paths) {
3078 char *s, *d;
3079
494d0247 3080 if (exec_directory_is_private(context, t))
657ee2d8 3081 s = path_join(params->prefix[t], "private", *suffix);
6c47cd7d 3082 else
657ee2d8 3083 s = path_join(params->prefix[t], *suffix);
6c47cd7d
LP
3084 if (!s) {
3085 r = -ENOMEM;
3086 goto finish;
3087 }
3088
494d0247 3089 if (exec_directory_is_private(context, t) &&
74e12520 3090 exec_context_with_rootfs(context))
5609f688
YW
3091 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3092 * directory is not created on the root directory. So, let's bind-mount the directory
3093 * on the 'non-private' place. */
657ee2d8 3094 d = path_join(params->prefix[t], *suffix);
5609f688
YW
3095 else
3096 d = strdup(s);
6c47cd7d
LP
3097 if (!d) {
3098 free(s);
3099 r = -ENOMEM;
3100 goto finish;
3101 }
3102
3103 bind_mounts[h++] = (BindMount) {
3104 .source = s,
3105 .destination = d,
3106 .read_only = false,
9ce4e4b0 3107 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3108 .recursive = true,
3109 .ignore_enoent = false,
3110 };
3111 }
3112 }
3113
3114 assert(h == n);
3115
3116 *ret_bind_mounts = bind_mounts;
3117 *ret_n_bind_mounts = n;
ae2a15bc 3118 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3119
3120 return (int) n;
3121
3122finish:
3123 bind_mount_free_many(bind_mounts, h);
3124 return r;
3125}
3126
4e677599
LP
3127static bool insist_on_sandboxing(
3128 const ExecContext *context,
3129 const char *root_dir,
3130 const char *root_image,
3131 const BindMount *bind_mounts,
3132 size_t n_bind_mounts) {
3133
4e677599
LP
3134 assert(context);
3135 assert(n_bind_mounts == 0 || bind_mounts);
3136
3137 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3138 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3139 * rearrange stuff in a way we cannot ignore gracefully. */
3140
3141 if (context->n_temporary_filesystems > 0)
3142 return true;
3143
3144 if (root_dir || root_image)
3145 return true;
3146
b3d13314
LB
3147 if (context->n_mount_images > 0)
3148 return true;
3149
4e677599
LP
3150 if (context->dynamic_user)
3151 return true;
3152
3153 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3154 * essential. */
5b10116e 3155 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3156 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3157 return true;
3158
91dd5f7c
LP
3159 if (context->log_namespace)
3160 return true;
3161
4e677599
LP
3162 return false;
3163}
3164
6818c54c 3165static int apply_mount_namespace(
34cf6c43 3166 const Unit *u,
9f71ba8d 3167 ExecCommandFlags command_flags,
6818c54c
LP
3168 const ExecContext *context,
3169 const ExecParameters *params,
7cc5ef5f
ZJS
3170 const ExecRuntime *runtime,
3171 char **error_path) {
6818c54c 3172
7bcef4ef 3173 _cleanup_strv_free_ char **empty_directories = NULL;
56a13a49 3174 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3175 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3176 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3177 NamespaceInfo ns_info;
165a31c0 3178 bool needs_sandboxing;
6c47cd7d 3179 BindMount *bind_mounts = NULL;
da6053d0 3180 size_t n_bind_mounts = 0;
6818c54c 3181 int r;
93c6bb51 3182
2b3c1b9e
DH
3183 assert(context);
3184
915e6d16
LP
3185 if (params->flags & EXEC_APPLY_CHROOT) {
3186 root_image = context->root_image;
3187
3188 if (!root_image)
3189 root_dir = context->root_directory;
3190 }
93c6bb51 3191
6c47cd7d
LP
3192 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3193 if (r < 0)
3194 return r;
3195
9f71ba8d 3196 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3197 if (needs_sandboxing) {
3198 /* The runtime struct only contains the parent of the private /tmp,
3199 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3200 * that is sticky, and that's the one we want to use here.
3201 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3202
3203 if (context->private_tmp && runtime) {
56a13a49
ZJS
3204 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3205 tmp_dir = runtime->tmp_dir;
3206 else if (runtime->tmp_dir)
3207 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3208
3209 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3210 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3211 else if (runtime->var_tmp_dir)
56a13a49 3212 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3213 }
3214
b5a33299
YW
3215 ns_info = (NamespaceInfo) {
3216 .ignore_protect_paths = false,
3217 .private_dev = context->private_devices,
3218 .protect_control_groups = context->protect_control_groups,
3219 .protect_kernel_tunables = context->protect_kernel_tunables,
3220 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3221 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3222 .protect_hostname = context->protect_hostname,
5e98086d 3223 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3224 .private_mounts = context->private_mounts,
52b3d652
LP
3225 .protect_home = context->protect_home,
3226 .protect_system = context->protect_system,
4e399953
LP
3227 .protect_proc = context->protect_proc,
3228 .proc_subset = context->proc_subset,
80271a44 3229 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3230 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3231 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3232 };
ecf63c91 3233 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3234 /*
3235 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3236 * sandbox info, otherwise enforce it, don't ignore protected paths and
3237 * fail if we are enable to apply the sandbox inside the mount namespace.
3238 */
3239 ns_info = (NamespaceInfo) {
3240 .ignore_protect_paths = true,
3241 };
3242 else
3243 ns_info = (NamespaceInfo) {};
b5a33299 3244
37ed15d7
FB
3245 if (context->mount_flags == MS_SHARED)
3246 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3247
a631cbfa
LP
3248 if (exec_context_has_credentials(context) &&
3249 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3250 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3251 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3252 if (!creds_path) {
3253 r = -ENOMEM;
3254 goto finalize;
3255 }
bbb4e7f3
LP
3256 }
3257
5e8deb94
LB
3258 if (MANAGER_IS_SYSTEM(u->manager)) {
3259 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3260 if (!propagate_dir) {
3261 r = -ENOMEM;
3262 goto finalize;
3263 }
3264
5e8deb94 3265 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3266 if (!incoming_dir) {
3267 r = -ENOMEM;
3268 goto finalize;
3269 }
5e8deb94
LB
3270 }
3271
18d73705 3272 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3273 &ns_info, context->read_write_paths,
165a31c0
LP
3274 needs_sandboxing ? context->read_only_paths : NULL,
3275 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3276 needs_sandboxing ? context->exec_paths : NULL,
3277 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d
LP
3278 empty_directories,
3279 bind_mounts,
3280 n_bind_mounts,
2abd4e38
YW
3281 context->temporary_filesystems,
3282 context->n_temporary_filesystems,
b3d13314
LB
3283 context->mount_images,
3284 context->n_mount_images,
56a13a49
ZJS
3285 tmp_dir,
3286 var_tmp_dir,
bbb4e7f3 3287 creds_path,
91dd5f7c 3288 context->log_namespace,
915e6d16 3289 context->mount_flags,
d4d55b0d
LB
3290 context->root_hash, context->root_hash_size, context->root_hash_path,
3291 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3292 context->root_verity,
93f59701
LB
3293 context->extension_images,
3294 context->n_extension_images,
5e8deb94
LB
3295 propagate_dir,
3296 incoming_dir,
3bdc25a4 3297 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3298 error_path);
93c6bb51 3299
1beab8b0 3300 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3301 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3302 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3303 * completely different execution environment. */
aca835ed 3304 if (r == -ENOANO) {
4e677599
LP
3305 if (insist_on_sandboxing(
3306 context,
3307 root_dir, root_image,
3308 bind_mounts,
3309 n_bind_mounts)) {
3310 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3311 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3312 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3313
3314 r = -EOPNOTSUPP;
3315 } else {
aca835ed 3316 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3317 r = 0;
aca835ed 3318 }
93c6bb51
DH
3319 }
3320
8062e643 3321finalize:
4e677599 3322 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3323 return r;
3324}
3325
915e6d16
LP
3326static int apply_working_directory(
3327 const ExecContext *context,
3328 const ExecParameters *params,
3329 const char *home,
376fecf6 3330 int *exit_status) {
915e6d16 3331
6732edab 3332 const char *d, *wd;
2b3c1b9e
DH
3333
3334 assert(context);
376fecf6 3335 assert(exit_status);
2b3c1b9e 3336
6732edab
LP
3337 if (context->working_directory_home) {
3338
376fecf6
LP
3339 if (!home) {
3340 *exit_status = EXIT_CHDIR;
6732edab 3341 return -ENXIO;
376fecf6 3342 }
6732edab 3343
2b3c1b9e 3344 wd = home;
6732edab 3345
14eb3285
LP
3346 } else
3347 wd = empty_to_root(context->working_directory);
e7f1e7c6 3348
fa97f630 3349 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3350 d = wd;
fa97f630 3351 else
3b0e5bb5 3352 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3353
376fecf6
LP
3354 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3355 *exit_status = EXIT_CHDIR;
2b3c1b9e 3356 return -errno;
376fecf6 3357 }
e7f1e7c6
DH
3358
3359 return 0;
3360}
3361
fa97f630
JB
3362static int apply_root_directory(
3363 const ExecContext *context,
3364 const ExecParameters *params,
3365 const bool needs_mount_ns,
3366 int *exit_status) {
3367
3368 assert(context);
3369 assert(exit_status);
3370
5b10116e 3371 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3372 if (!needs_mount_ns && context->root_directory)
3373 if (chroot(context->root_directory) < 0) {
3374 *exit_status = EXIT_CHROOT;
3375 return -errno;
3376 }
fa97f630
JB
3377
3378 return 0;
3379}
3380
b1edf445 3381static int setup_keyring(
34cf6c43 3382 const Unit *u,
b1edf445
LP
3383 const ExecContext *context,
3384 const ExecParameters *p,
3385 uid_t uid, gid_t gid) {
3386
74dd6b51 3387 key_serial_t keyring;
e64c2d0b
DJL
3388 int r = 0;
3389 uid_t saved_uid;
3390 gid_t saved_gid;
74dd6b51
LP
3391
3392 assert(u);
b1edf445 3393 assert(context);
74dd6b51
LP
3394 assert(p);
3395
3396 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3397 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3398 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3399 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3400 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3401 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3402
b1edf445
LP
3403 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3404 return 0;
3405
e64c2d0b
DJL
3406 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3407 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3408 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3409 * & group is just as nasty as acquiring a reference to the user keyring. */
3410
3411 saved_uid = getuid();
3412 saved_gid = getgid();
3413
3414 if (gid_is_valid(gid) && gid != saved_gid) {
3415 if (setregid(gid, -1) < 0)
3416 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3417 }
3418
3419 if (uid_is_valid(uid) && uid != saved_uid) {
3420 if (setreuid(uid, -1) < 0) {
3421 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3422 goto out;
3423 }
3424 }
3425
74dd6b51
LP
3426 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3427 if (keyring == -1) {
3428 if (errno == ENOSYS)
8002fb97 3429 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3430 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3431 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3432 else if (errno == EDQUOT)
8002fb97 3433 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3434 else
e64c2d0b 3435 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3436
e64c2d0b 3437 goto out;
74dd6b51
LP
3438 }
3439
e64c2d0b
DJL
3440 /* When requested link the user keyring into the session keyring. */
3441 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3442
3443 if (keyctl(KEYCTL_LINK,
3444 KEY_SPEC_USER_KEYRING,
3445 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3446 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3447 goto out;
3448 }
3449 }
3450
3451 /* Restore uid/gid back */
3452 if (uid_is_valid(uid) && uid != saved_uid) {
3453 if (setreuid(saved_uid, -1) < 0) {
3454 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3455 goto out;
3456 }
3457 }
3458
3459 if (gid_is_valid(gid) && gid != saved_gid) {
3460 if (setregid(saved_gid, -1) < 0)
3461 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3462 }
3463
3464 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3465 if (!sd_id128_is_null(u->invocation_id)) {
3466 key_serial_t key;
3467
3468 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3469 if (key == -1)
8002fb97 3470 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3471 else {
3472 if (keyctl(KEYCTL_SETPERM, key,
3473 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3474 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3475 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3476 }
3477 }
3478
e64c2d0b 3479out:
37b22b3b 3480 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3481 /* no extra logging, as only the first already reported error matters */
3482 if (getuid() != saved_uid)
3483 (void) setreuid(saved_uid, -1);
b1edf445 3484
e64c2d0b
DJL
3485 if (getgid() != saved_gid)
3486 (void) setregid(saved_gid, -1);
b1edf445 3487
e64c2d0b 3488 return r;
74dd6b51
LP
3489}
3490
3042bbeb 3491static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
29206d46
LP
3492 assert(array);
3493 assert(n);
2caa38e9 3494 assert(pair);
29206d46
LP
3495
3496 if (pair[0] >= 0)
3497 array[(*n)++] = pair[0];
3498 if (pair[1] >= 0)
3499 array[(*n)++] = pair[1];
3500}
3501
a34ceba6
LP
3502static int close_remaining_fds(
3503 const ExecParameters *params,
34cf6c43
YW
3504 const ExecRuntime *runtime,
3505 const DynamicCreds *dcreds,
00d9ef85 3506 int user_lookup_fd,
a34ceba6 3507 int socket_fd,
5b8d1f6b 3508 const int *fds, size_t n_fds) {
a34ceba6 3509
da6053d0 3510 size_t n_dont_close = 0;
00d9ef85 3511 int dont_close[n_fds + 12];
a34ceba6
LP
3512
3513 assert(params);
3514
3515 if (params->stdin_fd >= 0)
3516 dont_close[n_dont_close++] = params->stdin_fd;
3517 if (params->stdout_fd >= 0)
3518 dont_close[n_dont_close++] = params->stdout_fd;
3519 if (params->stderr_fd >= 0)
3520 dont_close[n_dont_close++] = params->stderr_fd;
3521
3522 if (socket_fd >= 0)
3523 dont_close[n_dont_close++] = socket_fd;
3524 if (n_fds > 0) {
3525 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3526 n_dont_close += n_fds;
3527 }
3528
a70581ff 3529 if (runtime) {
29206d46 3530 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3531 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3532 }
29206d46
LP
3533
3534 if (dcreds) {
3535 if (dcreds->user)
3536 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3537 if (dcreds->group)
3538 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3539 }
3540
00d9ef85
LP
3541 if (user_lookup_fd >= 0)
3542 dont_close[n_dont_close++] = user_lookup_fd;
3543
a34ceba6
LP
3544 return close_all_fds(dont_close, n_dont_close);
3545}
3546
00d9ef85
LP
3547static int send_user_lookup(
3548 Unit *unit,
3549 int user_lookup_fd,
3550 uid_t uid,
3551 gid_t gid) {
3552
3553 assert(unit);
3554
3555 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3556 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3557 * specified. */
3558
3559 if (user_lookup_fd < 0)
3560 return 0;
3561
3562 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3563 return 0;
3564
3565 if (writev(user_lookup_fd,
3566 (struct iovec[]) {
e6a7ec4b
LP
3567 IOVEC_INIT(&uid, sizeof(uid)),
3568 IOVEC_INIT(&gid, sizeof(gid)),
3569 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3570 return -errno;
3571
3572 return 0;
3573}
3574
6732edab
LP
3575static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3576 int r;
3577
3578 assert(c);
3579 assert(home);
3580 assert(buf);
3581
3582 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3583
3584 if (*home)
3585 return 0;
3586
3587 if (!c->working_directory_home)
3588 return 0;
3589
6732edab
LP
3590 r = get_home_dir(buf);
3591 if (r < 0)
3592 return r;
3593
3594 *home = *buf;
3595 return 1;
3596}
3597
da50b85a
LP
3598static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3599 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3600 int r;
3601
3602 assert(c);
3603 assert(p);
3604 assert(ret);
3605
3606 assert(c->dynamic_user);
3607
3608 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3609 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3610 * directories. */
3611
5b10116e 3612 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3613 char **i;
3614
3615 if (t == EXEC_DIRECTORY_CONFIGURATION)
3616 continue;
3617
3618 if (!p->prefix[t])
3619 continue;
3620
3621 STRV_FOREACH(i, c->directories[t].paths) {
3622 char *e;
3623
494d0247 3624 if (exec_directory_is_private(c, t))
657ee2d8 3625 e = path_join(p->prefix[t], "private", *i);
494d0247
YW
3626 else
3627 e = path_join(p->prefix[t], *i);
da50b85a
LP
3628 if (!e)
3629 return -ENOMEM;
3630
3631 r = strv_consume(&list, e);
3632 if (r < 0)
3633 return r;
3634 }
3635 }
3636
ae2a15bc 3637 *ret = TAKE_PTR(list);
da50b85a
LP
3638
3639 return 0;
3640}
3641
78f93209
LP
3642static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3643 bool using_subcgroup;
3644 char *p;
3645
3646 assert(params);
3647 assert(ret);
3648
3649 if (!params->cgroup_path)
3650 return -EINVAL;
3651
3652 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3653 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3654 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3655 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3656 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3657 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3658 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3659 * flag, which is only passed for the former statements, not for the latter. */
3660
3661 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3662 if (using_subcgroup)
657ee2d8 3663 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3664 else
3665 p = strdup(params->cgroup_path);
3666 if (!p)
3667 return -ENOMEM;
3668
3669 *ret = p;
3670 return using_subcgroup;
3671}
3672
e2b2fb7f
MS
3673static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3674 _cleanup_(cpu_set_reset) CPUSet s = {};
3675 int r;
3676
3677 assert(c);
3678 assert(ret);
3679
3680 if (!c->numa_policy.nodes.set) {
3681 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3682 return 0;
3683 }
3684
3685 r = numa_to_cpu_set(&c->numa_policy, &s);
3686 if (r < 0)
3687 return r;
3688
3689 cpu_set_reset(ret);
3690
3691 return cpu_set_add_all(ret, &s);
3692}
3693
3694bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3695 assert(c);
3696
3697 return c->cpu_affinity_from_numa;
3698}
3699
1da37e58
ZJS
3700static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3701 int r;
3702
3703 assert(fds);
3704 assert(n_fds);
3705 assert(*n_fds < fds_size);
3706 assert(ret_fd);
3707
3708 if (fd < 0) {
3709 *ret_fd = -1;
3710 return 0;
3711 }
3712
3713 if (fd < 3 + (int) *n_fds) {
3714 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3715 * the fds we pass to the process (or which are closed only during execve). */
3716
3717 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3718 if (r < 0)
3719 return -errno;
3720
3721 CLOSE_AND_REPLACE(fd, r);
3722 }
3723
3724 *ret_fd = fds[*n_fds] = fd;
3725 (*n_fds) ++;
3726 return 1;
3727}
3728
ff0af2a1 3729static int exec_child(
f2341e0a 3730 Unit *unit,
34cf6c43 3731 const ExecCommand *command,
ff0af2a1
LP
3732 const ExecContext *context,
3733 const ExecParameters *params,
3734 ExecRuntime *runtime,
29206d46 3735 DynamicCreds *dcreds,
ff0af2a1 3736 int socket_fd,
2caa38e9 3737 const int named_iofds[static 3],
4c47affc 3738 int *fds,
da6053d0 3739 size_t n_socket_fds,
25b583d7 3740 size_t n_storage_fds,
ff0af2a1 3741 char **files_env,
00d9ef85 3742 int user_lookup_fd,
12145637 3743 int *exit_status) {
d35fbf6b 3744
7ca69792 3745 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3746 int r, ngids = 0, exec_fd;
4d885bd3
DH
3747 _cleanup_free_ gid_t *supplementary_gids = NULL;
3748 const char *username = NULL, *groupname = NULL;
5686391b 3749 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3750 const char *home = NULL, *shell = NULL;
7ca69792 3751 char **final_argv = NULL;
7bce046b
LP
3752 dev_t journal_stream_dev = 0;
3753 ino_t journal_stream_ino = 0;
5749f855 3754 bool userns_set_up = false;
165a31c0
LP
3755 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3756 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3757 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3758 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3759#if HAVE_SELINUX
7f59dd35 3760 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3761 bool use_selinux = false;
ecfbc84f 3762#endif
f9fa32f0 3763#if ENABLE_SMACK
43b1f709 3764 bool use_smack = false;
ecfbc84f 3765#endif
349cc4a5 3766#if HAVE_APPARMOR
43b1f709 3767 bool use_apparmor = false;
ecfbc84f 3768#endif
5749f855
AZ
3769 uid_t saved_uid = getuid();
3770 gid_t saved_gid = getgid();
fed1e721
LP
3771 uid_t uid = UID_INVALID;
3772 gid_t gid = GID_INVALID;
1da37e58
ZJS
3773 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3774 n_keep_fds; /* total number of fds not to close */
165a31c0 3775 int secure_bits;
afb11bf1
DG
3776 _cleanup_free_ gid_t *gids_after_pam = NULL;
3777 int ngids_after_pam = 0;
034c6ed7 3778
f2341e0a 3779 assert(unit);
5cb5a6ff
LP
3780 assert(command);
3781 assert(context);
d35fbf6b 3782 assert(params);
ff0af2a1 3783 assert(exit_status);
d35fbf6b
DM
3784
3785 rename_process_from_path(command->path);
3786
9c274488
LP
3787 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3788 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3789 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3790 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3791 SIGNALS_IGNORE);
d35fbf6b
DM
3792
3793 if (context->ignore_sigpipe)
9c274488 3794 (void) ignore_signals(SIGPIPE);
d35fbf6b 3795
ff0af2a1
LP
3796 r = reset_signal_mask();
3797 if (r < 0) {
3798 *exit_status = EXIT_SIGNAL_MASK;
12145637 3799 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3800 }
034c6ed7 3801
d35fbf6b
DM
3802 if (params->idle_pipe)
3803 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3804
2c027c62
LP
3805 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3806 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3807 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3808 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3809
d35fbf6b 3810 log_forget_fds();
2c027c62 3811 log_set_open_when_needed(true);
4f2d528d 3812
40a80078
LP
3813 /* In case anything used libc syslog(), close this here, too */
3814 closelog();
3815
b83d5050 3816 int keep_fds[n_fds + 2];
1da37e58
ZJS
3817 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3818 n_keep_fds = n_fds;
3819
3820 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3821 if (r < 0) {
3822 *exit_status = EXIT_FDS;
3823 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3824 }
3825
3826 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3827 if (r < 0) {
3828 *exit_status = EXIT_FDS;
12145637 3829 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3830 }
3831
0af07108
ZJS
3832 if (!context->same_pgrp &&
3833 setsid() < 0) {
3834 *exit_status = EXIT_SETSID;
3835 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3836 }
9e2f7c11 3837
1e22b5cd 3838 exec_context_tty_reset(context, params);
d35fbf6b 3839
c891efaf 3840 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 3841 const char *vc = params->confirm_spawn;
3b20f877
FB
3842 _cleanup_free_ char *cmdline = NULL;
3843
8a62620e 3844 cmdline = quote_command_line(command->argv);
3b20f877 3845 if (!cmdline) {
0460aa5c 3846 *exit_status = EXIT_MEMORY;
12145637 3847 return log_oom();
3b20f877 3848 }
d35fbf6b 3849
eedf223a 3850 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
3851 if (r != CONFIRM_EXECUTE) {
3852 if (r == CONFIRM_PRETEND_SUCCESS) {
3853 *exit_status = EXIT_SUCCESS;
3854 return 0;
3855 }
ff0af2a1 3856 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
3857 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3858 "Execution cancelled by the user");
d35fbf6b
DM
3859 }
3860 }
1a63a750 3861
d521916d
LP
3862 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3863 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3864 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3865 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3866 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3867 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3868 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3869 *exit_status = EXIT_MEMORY;
3870 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3871 }
3872
29206d46 3873 if (context->dynamic_user && dcreds) {
da50b85a 3874 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 3875
d521916d 3876 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 3877 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
3878 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3879 *exit_status = EXIT_USER;
12145637 3880 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
3881 }
3882
da50b85a
LP
3883 r = compile_suggested_paths(context, params, &suggested_paths);
3884 if (r < 0) {
3885 *exit_status = EXIT_MEMORY;
3886 return log_oom();
3887 }
3888
3889 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
3890 if (r < 0) {
3891 *exit_status = EXIT_USER;
d85ff944
YW
3892 if (r == -EILSEQ)
3893 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3894 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 3895 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 3896 }
524daa8c 3897
70dd455c 3898 if (!uid_is_valid(uid)) {
29206d46 3899 *exit_status = EXIT_USER;
d85ff944 3900 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
3901 }
3902
3903 if (!gid_is_valid(gid)) {
3904 *exit_status = EXIT_USER;
d85ff944 3905 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 3906 }
5bc7452b 3907
29206d46
LP
3908 if (dcreds->user)
3909 username = dcreds->user->name;
3910
3911 } else {
4d885bd3
DH
3912 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3913 if (r < 0) {
3914 *exit_status = EXIT_USER;
12145637 3915 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 3916 }
5bc7452b 3917
4d885bd3
DH
3918 r = get_fixed_group(context, &groupname, &gid);
3919 if (r < 0) {
3920 *exit_status = EXIT_GROUP;
12145637 3921 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 3922 }
cdc5d5c5 3923 }
29206d46 3924
cdc5d5c5
DH
3925 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3926 r = get_supplementary_groups(context, username, groupname, gid,
3927 &supplementary_gids, &ngids);
3928 if (r < 0) {
3929 *exit_status = EXIT_GROUP;
12145637 3930 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 3931 }
5bc7452b 3932
00d9ef85
LP
3933 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3934 if (r < 0) {
3935 *exit_status = EXIT_USER;
12145637 3936 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
3937 }
3938
3939 user_lookup_fd = safe_close(user_lookup_fd);
3940
6732edab
LP
3941 r = acquire_home(context, uid, &home, &home_buffer);
3942 if (r < 0) {
3943 *exit_status = EXIT_CHDIR;
12145637 3944 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
3945 }
3946
d35fbf6b
DM
3947 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3948 * must make sure to drop O_NONBLOCK */
3949 if (socket_fd >= 0)
a34ceba6 3950 (void) fd_nonblock(socket_fd, false);
acbb0225 3951
4c70a4a7
MS
3952 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3953 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3954 if (params->cgroup_path) {
3955 _cleanup_free_ char *p = NULL;
3956
3957 r = exec_parameters_get_cgroup_path(params, &p);
3958 if (r < 0) {
3959 *exit_status = EXIT_CGROUP;
3960 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3961 }
3962
3963 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3964 if (r < 0) {
3965 *exit_status = EXIT_CGROUP;
3966 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3967 }
3968 }
3969
a8d08f39 3970 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 3971 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
3972 if (r < 0) {
3973 *exit_status = EXIT_NETWORK;
3974 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3975 }
3976 }
3977
a70581ff
XR
3978 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
3979 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
3980 if (r < 0) {
3981 *exit_status = EXIT_NAMESPACE;
3982 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
3983 }
3984 }
3985
52c239d7 3986 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
3987 if (r < 0) {
3988 *exit_status = EXIT_STDIN;
12145637 3989 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 3990 }
034c6ed7 3991
52c239d7 3992 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3993 if (r < 0) {
3994 *exit_status = EXIT_STDOUT;
12145637 3995 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
3996 }
3997
52c239d7 3998 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3999 if (r < 0) {
4000 *exit_status = EXIT_STDERR;
12145637 4001 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4002 }
4003
d35fbf6b 4004 if (context->oom_score_adjust_set) {
9f8168eb
LP
4005 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4006 * prohibit write access to this file, and we shouldn't trip up over that. */
4007 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4008 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4009 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4010 else if (r < 0) {
ff0af2a1 4011 *exit_status = EXIT_OOM_ADJUST;
12145637 4012 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4013 }
d35fbf6b
DM
4014 }
4015
ad21e542
ZJS
4016 if (context->coredump_filter_set) {
4017 r = set_coredump_filter(context->coredump_filter);
4018 if (ERRNO_IS_PRIVILEGE(r))
4019 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4020 else if (r < 0)
4021 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4022 }
4023
39090201
DJL
4024 if (context->nice_set) {
4025 r = setpriority_closest(context->nice);
4026 if (r < 0)
4027 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4028 }
613b411c 4029
d35fbf6b
DM
4030 if (context->cpu_sched_set) {
4031 struct sched_param param = {
4032 .sched_priority = context->cpu_sched_priority,
4033 };
4034
ff0af2a1
LP
4035 r = sched_setscheduler(0,
4036 context->cpu_sched_policy |
4037 (context->cpu_sched_reset_on_fork ?
4038 SCHED_RESET_ON_FORK : 0),
4039 &param);
4040 if (r < 0) {
4041 *exit_status = EXIT_SETSCHEDULER;
12145637 4042 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4043 }
d35fbf6b 4044 }
fc9b2a84 4045
e2b2fb7f
MS
4046 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4047 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4048 const CPUSet *cpu_set;
4049
4050 if (context->cpu_affinity_from_numa) {
4051 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4052 if (r < 0) {
4053 *exit_status = EXIT_CPUAFFINITY;
4054 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4055 }
4056
4057 cpu_set = &converted_cpu_set;
4058 } else
4059 cpu_set = &context->cpu_set;
4060
4061 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4062 *exit_status = EXIT_CPUAFFINITY;
12145637 4063 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4064 }
e2b2fb7f 4065 }
034c6ed7 4066
b070c7c0
MS
4067 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4068 r = apply_numa_policy(&context->numa_policy);
4069 if (r == -EOPNOTSUPP)
33fe9e3f 4070 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4071 else if (r < 0) {
4072 *exit_status = EXIT_NUMA_POLICY;
4073 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4074 }
4075 }
4076
d35fbf6b
DM
4077 if (context->ioprio_set)
4078 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4079 *exit_status = EXIT_IOPRIO;
12145637 4080 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4081 }
da726a4d 4082
d35fbf6b
DM
4083 if (context->timer_slack_nsec != NSEC_INFINITY)
4084 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4085 *exit_status = EXIT_TIMERSLACK;
12145637 4086 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4087 }
9eba9da4 4088
21022b9d
LP
4089 if (context->personality != PERSONALITY_INVALID) {
4090 r = safe_personality(context->personality);
4091 if (r < 0) {
ff0af2a1 4092 *exit_status = EXIT_PERSONALITY;
12145637 4093 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4094 }
21022b9d 4095 }
94f04347 4096
d35fbf6b 4097 if (context->utmp_id)
df0ff127 4098 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
6a93917d 4099 context->tty_path,
023a4f67
LP
4100 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4101 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4102 USER_PROCESS,
6a93917d 4103 username);
d35fbf6b 4104
08f67696 4105 if (uid_is_valid(uid)) {
ff0af2a1
LP
4106 r = chown_terminal(STDIN_FILENO, uid);
4107 if (r < 0) {
4108 *exit_status = EXIT_STDIN;
12145637 4109 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4110 }
d35fbf6b 4111 }
8e274523 4112
4e1dfa45 4113 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4114 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4115 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4116 * touch a single hierarchy too. */
584b8688 4117 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4118 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4119 if (r < 0) {
4120 *exit_status = EXIT_CGROUP;
12145637 4121 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4122 }
d35fbf6b 4123 }
034c6ed7 4124
5b10116e 4125 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
8679efde 4126 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
12145637
LP
4127 if (r < 0)
4128 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4129 }
94f04347 4130
bb0c0d6f
LP
4131 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4132 r = setup_credentials(context, params, unit->id, uid);
4133 if (r < 0) {
4134 *exit_status = EXIT_CREDENTIALS;
4135 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4136 }
4137 }
4138
7bce046b 4139 r = build_environment(
fd63e712 4140 unit,
7bce046b
LP
4141 context,
4142 params,
4143 n_fds,
4144 home,
4145 username,
4146 shell,
4147 journal_stream_dev,
4148 journal_stream_ino,
4149 &our_env);
2065ca69
JW
4150 if (r < 0) {
4151 *exit_status = EXIT_MEMORY;
12145637 4152 return log_oom();
2065ca69
JW
4153 }
4154
4155 r = build_pass_environment(context, &pass_env);
4156 if (r < 0) {
4157 *exit_status = EXIT_MEMORY;
12145637 4158 return log_oom();
2065ca69
JW
4159 }
4160
4161 accum_env = strv_env_merge(5,
4162 params->environment,
4163 our_env,
4164 pass_env,
4165 context->environment,
44e5d006 4166 files_env);
2065ca69
JW
4167 if (!accum_env) {
4168 *exit_status = EXIT_MEMORY;
12145637 4169 return log_oom();
2065ca69 4170 }
1280503b 4171 accum_env = strv_env_clean(accum_env);
2065ca69 4172
096424d1 4173 (void) umask(context->umask);
b213e1c1 4174
b1edf445 4175 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4176 if (r < 0) {
4177 *exit_status = EXIT_KEYRING;
12145637 4178 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4179 }
4180
165a31c0 4181 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4182 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4183
165a31c0
LP
4184 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4185 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4186
165a31c0
LP
4187 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4188 if (needs_ambient_hack)
4189 needs_setuid = false;
4190 else
4191 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4192
4193 if (needs_sandboxing) {
7f18ef0a
FK
4194 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4195 * present. The actual MAC context application will happen later, as late as possible, to avoid
4196 * impacting our own code paths. */
4197
349cc4a5 4198#if HAVE_SELINUX
43b1f709 4199 use_selinux = mac_selinux_use();
7f18ef0a 4200#endif
f9fa32f0 4201#if ENABLE_SMACK
43b1f709 4202 use_smack = mac_smack_use();
7f18ef0a 4203#endif
349cc4a5 4204#if HAVE_APPARMOR
43b1f709 4205 use_apparmor = mac_apparmor_use();
7f18ef0a 4206#endif
165a31c0 4207 }
7f18ef0a 4208
ce932d2d
LP
4209 if (needs_sandboxing) {
4210 int which_failed;
4211
4212 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4213 * is set here. (See below.) */
4214
4215 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4216 if (r < 0) {
4217 *exit_status = EXIT_LIMITS;
4218 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4219 }
4220 }
4221
0af07108 4222 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4223 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4224 * wins here. (See above.) */
4225
1da37e58 4226 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4227 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4228 if (r < 0) {
4229 *exit_status = EXIT_PAM;
4230 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4231 }
ac45f971 4232
0af07108
ZJS
4233 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4234 if (ngids_after_pam < 0) {
4235 *exit_status = EXIT_MEMORY;
4236 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4237 }
b213e1c1 4238 }
5749f855 4239
0af07108 4240 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4241 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4242 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4243 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4244
4245 userns_set_up = true;
4246 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4247 if (r < 0) {
4248 *exit_status = EXIT_USER;
4249 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4250 }
4251 }
4252
a8d08f39
LP
4253 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4254
6e2d7c4f 4255 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4256 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4257 if (r == -EPERM)
4258 log_unit_warning_errno(unit, r,
4259 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4260 else if (r < 0) {
6e2d7c4f
MS
4261 *exit_status = EXIT_NETWORK;
4262 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4263 }
a8d08f39
LP
4264 } else if (context->network_namespace_path) {
4265 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4266 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4267 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4268 } else
4269 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4270 }
169c1bda 4271
a70581ff
XR
4272 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4273
4274 if (ns_type_supported(NAMESPACE_IPC)) {
4275 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4276 if (r == -EPERM)
4277 log_unit_warning_errno(unit, r,
4278 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4279 else if (r < 0) {
4280 *exit_status = EXIT_NAMESPACE;
4281 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4282 }
4283 } else if (context->ipc_namespace_path) {
4284 *exit_status = EXIT_NAMESPACE;
4285 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4286 "IPCNamespacePath= is not supported, refusing.");
4287 } else
4288 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4289 }
4290
ee818b89 4291 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
ee818b89 4292 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4293 _cleanup_free_ char *error_path = NULL;
4294
9f71ba8d 4295 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4296 if (r < 0) {
4297 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4298 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4299 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4300 }
d35fbf6b 4301 }
81a2b7ce 4302
daf8f72b
LP
4303 if (needs_sandboxing) {
4304 r = apply_protect_hostname(unit, context, exit_status);
4305 if (r < 0)
4306 return r;
aecd5ac6
TM
4307 }
4308
5749f855
AZ
4309 /* Drop groups as early as possible.
4310 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4311 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4312 if (needs_setuid) {
afb11bf1
DG
4313 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4314 int ngids_to_enforce = 0;
4315
4316 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4317 ngids,
4318 gids_after_pam,
4319 ngids_after_pam,
4320 &gids_to_enforce);
4321 if (ngids_to_enforce < 0) {
4322 *exit_status = EXIT_MEMORY;
4323 return log_unit_error_errno(unit,
4324 ngids_to_enforce,
4325 "Failed to merge group lists. Group membership might be incorrect: %m");
4326 }
4327
4328 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4329 if (r < 0) {
4330 *exit_status = EXIT_GROUP;
12145637 4331 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4332 }
165a31c0 4333 }
096424d1 4334
5749f855
AZ
4335 /* If the user namespace was not set up above, try to do it now.
4336 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4337 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4338 * case of mount namespaces being less privileged when the mount point list is copied from a
4339 * different user namespace). */
9008e1ac 4340
5749f855
AZ
4341 if (needs_sandboxing && context->private_users && !userns_set_up) {
4342 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4343 if (r < 0) {
4344 *exit_status = EXIT_USER;
4345 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4346 }
4347 }
4348
9f71ba8d
ZJS
4349 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4350 * shall execute. */
4351
4352 _cleanup_free_ char *executable = NULL;
b83d5050
ZJS
4353 _cleanup_close_ int executable_fd = -1;
4354 r = find_executable_full(command->path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4355 if (r < 0) {
4356 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4357 log_unit_struct_errno(unit, LOG_INFO, r,
4358 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4359 LOG_UNIT_INVOCATION_ID(unit),
4360 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4361 command->path),
4362 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4363 return 0;
4364 }
4365
4366 *exit_status = EXIT_EXEC;
c2503e35
RH
4367
4368 return log_unit_struct_errno(unit, LOG_INFO, r,
4369 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4370 LOG_UNIT_INVOCATION_ID(unit),
4371 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4372 command->path),
4373 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4374 }
4375
b83d5050
ZJS
4376 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4377 if (r < 0) {
4378 *exit_status = EXIT_FDS;
4379 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4380 }
4381
9f71ba8d 4382#if HAVE_SELINUX
49590d67
MS
4383 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4384 int fd = -1;
4385
4386 if (socket_fd >= 0)
4387 fd = socket_fd;
4388 else if (params->n_socket_fds == 1)
4389 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4390 * use context from that fd to compute the label. */
4391 fd = params->fds[0];
4392
4393 if (fd >= 0) {
4394 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4395 if (r < 0) {
4396 *exit_status = EXIT_SELINUX_CONTEXT;
4397 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4398 }
9f71ba8d
ZJS
4399 }
4400 }
4401#endif
4402
165a31c0 4403 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4404 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4405 * however if we have it as we want to keep it open until the final execve(). */
4406
1da37e58 4407 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4408 if (r >= 0)
4409 r = shift_fds(fds, n_fds);
4410 if (r >= 0)
25b583d7 4411 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4412 if (r < 0) {
4413 *exit_status = EXIT_FDS;
12145637 4414 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4415 }
e66cf1a3 4416
5686391b
LP
4417 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4418 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4419 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4420 * came this far. */
4421
165a31c0 4422 secure_bits = context->secure_bits;
e66cf1a3 4423
165a31c0
LP
4424 if (needs_sandboxing) {
4425 uint64_t bset;
e66cf1a3 4426
ce932d2d
LP
4427 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4428 * requested. (Note this is placed after the general resource limit initialization, see
4429 * above, in order to take precedence.) */
f4170c67
LP
4430 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4431 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4432 *exit_status = EXIT_LIMITS;
12145637 4433 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4434 }
4435 }
4436
37ac2744
JB
4437#if ENABLE_SMACK
4438 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4439 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4440 if (use_smack) {
b83d5050 4441 r = setup_smack(context, executable_fd);
37ac2744
JB
4442 if (r < 0) {
4443 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4444 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4445 }
4446 }
4447#endif
4448
165a31c0
LP
4449 bset = context->capability_bounding_set;
4450 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4451 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4452 * instead of us doing that */
4453 if (needs_ambient_hack)
4454 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4455 (UINT64_C(1) << CAP_SETUID) |
4456 (UINT64_C(1) << CAP_SETGID);
4457
4458 if (!cap_test_all(bset)) {
4459 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4460 if (r < 0) {
4461 *exit_status = EXIT_CAPABILITIES;
12145637 4462 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4463 }
4c2630eb 4464 }
3b8bddde 4465
16fcb191
TK
4466 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4467 * keep-caps set.
4468 * To be able to raise the ambient capabilities after setresuid() they have to be
4469 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4470 * After setresuid() the ambient capabilities can be raised as they are present in
4471 * the permitted and inheritable set. However it is possible that someone wants to
4472 * set ambient capabilities without changing the user, so we also set the ambient
4473 * capabilities here.
4474 * The requested ambient capabilities are raised in the inheritable set if the
4475 * second argument is true. */
943800f4 4476 if (!needs_ambient_hack) {
755d4b67
IP
4477 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4478 if (r < 0) {
4479 *exit_status = EXIT_CAPABILITIES;
12145637 4480 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4481 }
755d4b67 4482 }
165a31c0 4483 }
755d4b67 4484
fa97f630
JB
4485 /* chroot to root directory first, before we lose the ability to chroot */
4486 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4487 if (r < 0)
4488 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4489
165a31c0 4490 if (needs_setuid) {
08f67696 4491 if (uid_is_valid(uid)) {
ff0af2a1
LP
4492 r = enforce_user(context, uid);
4493 if (r < 0) {
4494 *exit_status = EXIT_USER;
12145637 4495 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4496 }
165a31c0
LP
4497
4498 if (!needs_ambient_hack &&
4499 context->capability_ambient_set != 0) {
755d4b67 4500
16fcb191 4501 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4502 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4503 if (r < 0) {
4504 *exit_status = EXIT_CAPABILITIES;
12145637 4505 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4506 }
755d4b67 4507 }
5b6319dc 4508 }
165a31c0 4509 }
d35fbf6b 4510
56ef8db9
JB
4511 /* Apply working directory here, because the working directory might be on NFS and only the user running
4512 * this service might have the correct privilege to change to the working directory */
fa97f630 4513 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4514 if (r < 0)
4515 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4516
165a31c0 4517 if (needs_sandboxing) {
37ac2744 4518 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4519 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4520 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4521 * are restricted. */
4522
349cc4a5 4523#if HAVE_SELINUX
43b1f709 4524 if (use_selinux) {
5cd9cd35
LP
4525 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4526
4527 if (exec_context) {
4528 r = setexeccon(exec_context);
4529 if (r < 0) {
4530 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 4531 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
4532 }
4533 }
4534 }
4535#endif
4536
349cc4a5 4537#if HAVE_APPARMOR
43b1f709 4538 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4539 r = aa_change_onexec(context->apparmor_profile);
4540 if (r < 0 && !context->apparmor_profile_ignore) {
4541 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4542 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4543 }
4544 }
4545#endif
4546
165a31c0 4547 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4548 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4549 * CAP_SETPCAP. */
4550 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4551 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4552 * effective set here.
4553 * The effective set is overwritten during execve with the following values:
4554 * - ambient set (for non-root processes)
4555 * - (inheritable | bounding) set for root processes)
4556 *
4557 * Hence there is no security impact to raise it in the effective set before execve
4558 */
4559 r = capability_gain_cap_setpcap(NULL);
4560 if (r < 0) {
4561 *exit_status = EXIT_CAPABILITIES;
4562 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4563 }
755d4b67 4564 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4565 *exit_status = EXIT_SECUREBITS;
12145637 4566 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4567 }
dbdc4098 4568 }
5b6319dc 4569
59eeb84b 4570 if (context_has_no_new_privileges(context))
d35fbf6b 4571 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4572 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4573 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4574 }
4575
349cc4a5 4576#if HAVE_SECCOMP
469830d1
LP
4577 r = apply_address_families(unit, context);
4578 if (r < 0) {
4579 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4580 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4581 }
04aa0cb9 4582
469830d1
LP
4583 r = apply_memory_deny_write_execute(unit, context);
4584 if (r < 0) {
4585 *exit_status = EXIT_SECCOMP;
12145637 4586 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4587 }
f4170c67 4588
469830d1
LP
4589 r = apply_restrict_realtime(unit, context);
4590 if (r < 0) {
4591 *exit_status = EXIT_SECCOMP;
12145637 4592 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4593 }
4594
f69567cb
LP
4595 r = apply_restrict_suid_sgid(unit, context);
4596 if (r < 0) {
4597 *exit_status = EXIT_SECCOMP;
4598 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4599 }
4600
add00535
LP
4601 r = apply_restrict_namespaces(unit, context);
4602 if (r < 0) {
4603 *exit_status = EXIT_SECCOMP;
12145637 4604 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4605 }
4606
469830d1
LP
4607 r = apply_protect_sysctl(unit, context);
4608 if (r < 0) {
4609 *exit_status = EXIT_SECCOMP;
12145637 4610 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4611 }
4612
469830d1
LP
4613 r = apply_protect_kernel_modules(unit, context);
4614 if (r < 0) {
4615 *exit_status = EXIT_SECCOMP;
12145637 4616 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4617 }
4618
84703040
KK
4619 r = apply_protect_kernel_logs(unit, context);
4620 if (r < 0) {
4621 *exit_status = EXIT_SECCOMP;
4622 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4623 }
4624
fc64760d
KK
4625 r = apply_protect_clock(unit, context);
4626 if (r < 0) {
4627 *exit_status = EXIT_SECCOMP;
4628 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4629 }
4630
469830d1
LP
4631 r = apply_private_devices(unit, context);
4632 if (r < 0) {
4633 *exit_status = EXIT_SECCOMP;
12145637 4634 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4635 }
4636
4637 r = apply_syscall_archs(unit, context);
4638 if (r < 0) {
4639 *exit_status = EXIT_SECCOMP;
12145637 4640 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4641 }
4642
78e864e5
TM
4643 r = apply_lock_personality(unit, context);
4644 if (r < 0) {
4645 *exit_status = EXIT_SECCOMP;
12145637 4646 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4647 }
4648
9df2cdd8
TM
4649 r = apply_syscall_log(unit, context);
4650 if (r < 0) {
4651 *exit_status = EXIT_SECCOMP;
4652 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4653 }
4654
5cd9cd35
LP
4655 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4656 * by the filter as little as possible. */
165a31c0 4657 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4658 if (r < 0) {
4659 *exit_status = EXIT_SECCOMP;
12145637 4660 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4661 }
4662#endif
d35fbf6b 4663 }
034c6ed7 4664
00819cc1
LP
4665 if (!strv_isempty(context->unset_environment)) {
4666 char **ee = NULL;
4667
4668 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4669 if (!ee) {
4670 *exit_status = EXIT_MEMORY;
12145637 4671 return log_oom();
00819cc1
LP
4672 }
4673
130d3d22 4674 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4675 }
4676
7ca69792
AZ
4677 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4678 replaced_argv = replace_env_argv(command->argv, accum_env);
4679 if (!replaced_argv) {
4680 *exit_status = EXIT_MEMORY;
4681 return log_oom();
4682 }
4683 final_argv = replaced_argv;
4684 } else
4685 final_argv = command->argv;
034c6ed7 4686
f1d34068 4687 if (DEBUG_LOGGING) {
c2b2df60 4688 _cleanup_free_ char *line = NULL;
81a2b7ce 4689
8a62620e
ZJS
4690 line = quote_command_line(final_argv);
4691 if (!line) {
4692 *exit_status = EXIT_MEMORY;
4693 return log_oom();
4694 }
4695
4696 log_unit_struct(unit, LOG_DEBUG,
4697 "EXECUTABLE=%s", executable,
4698 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 4699 }
dd305ec9 4700
5686391b
LP
4701 if (exec_fd >= 0) {
4702 uint8_t hot = 1;
4703
4704 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4705 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4706
4707 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4708 *exit_status = EXIT_EXEC;
4709 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4710 }
4711 }
4712
a6d9111c 4713 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4714
4715 if (exec_fd >= 0) {
4716 uint8_t hot = 0;
4717
4718 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4719 * that POLLHUP on it no longer means execve() succeeded. */
4720
4721 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4722 *exit_status = EXIT_EXEC;
4723 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4724 }
4725 }
12145637 4726
ff0af2a1 4727 *exit_status = EXIT_EXEC;
9f71ba8d 4728 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4729}
81a2b7ce 4730
34cf6c43 4731static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4732static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4733
f2341e0a
LP
4734int exec_spawn(Unit *unit,
4735 ExecCommand *command,
d35fbf6b
DM
4736 const ExecContext *context,
4737 const ExecParameters *params,
4738 ExecRuntime *runtime,
29206d46 4739 DynamicCreds *dcreds,
d35fbf6b 4740 pid_t *ret) {
8351ceae 4741
ee39ca20 4742 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4743 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4744 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4745 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4746 _cleanup_free_ char *line = NULL;
d35fbf6b 4747 pid_t pid;
8351ceae 4748
f2341e0a 4749 assert(unit);
d35fbf6b
DM
4750 assert(command);
4751 assert(context);
4752 assert(ret);
4753 assert(params);
25b583d7 4754 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4755
d35fbf6b
DM
4756 if (context->std_input == EXEC_INPUT_SOCKET ||
4757 context->std_output == EXEC_OUTPUT_SOCKET ||
4758 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4759
d85ff944
YW
4760 if (params->n_socket_fds > 1)
4761 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4762
d85ff944
YW
4763 if (params->n_socket_fds == 0)
4764 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4765
d35fbf6b
DM
4766 socket_fd = params->fds[0];
4767 } else {
4768 socket_fd = -1;
4769 fds = params->fds;
9b141911 4770 n_socket_fds = params->n_socket_fds;
25b583d7 4771 n_storage_fds = params->n_storage_fds;
d35fbf6b 4772 }
94f04347 4773
34cf6c43 4774 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4775 if (r < 0)
4776 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4777
f2341e0a 4778 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4779 if (r < 0)
f2341e0a 4780 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4781
8a62620e 4782 line = quote_command_line(command->argv);
d35fbf6b
DM
4783 if (!line)
4784 return log_oom();
fab56fc5 4785
9f71ba8d
ZJS
4786 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4787 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4788 mac_selinux_maybe_reload();
4789
c2503e35
RH
4790 log_unit_struct(unit, LOG_DEBUG,
4791 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4792 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4793 the mount namespace in the child, but we want to log
4794 from the parent, so we need to use the (possibly
4795 inaccurate) path here. */
4796 LOG_UNIT_INVOCATION_ID(unit));
12145637 4797
78f93209
LP
4798 if (params->cgroup_path) {
4799 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4800 if (r < 0)
4801 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4802 if (r > 0) { /* We are using a child cgroup */
4803 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4804 if (r < 0)
4805 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
4806
4807 /* Normally we would not propagate the oomd xattrs to children but since we created this
4808 * sub-cgroup internally we should do it. */
4809 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
4810 }
4811 }
4812
d35fbf6b
DM
4813 pid = fork();
4814 if (pid < 0)
74129a12 4815 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
4816
4817 if (pid == 0) {
12145637 4818 int exit_status = EXIT_SUCCESS;
ff0af2a1 4819
f2341e0a
LP
4820 r = exec_child(unit,
4821 command,
ff0af2a1
LP
4822 context,
4823 params,
4824 runtime,
29206d46 4825 dcreds,
ff0af2a1 4826 socket_fd,
52c239d7 4827 named_iofds,
4c47affc 4828 fds,
9b141911 4829 n_socket_fds,
25b583d7 4830 n_storage_fds,
ff0af2a1 4831 files_env,
00d9ef85 4832 unit->manager->user_lookup_fds[1],
12145637
LP
4833 &exit_status);
4834
e1714f02
ZJS
4835 if (r < 0) {
4836 const char *status =
4837 exit_status_to_string(exit_status,
e04ed6db 4838 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 4839
c2503e35
RH
4840 log_unit_struct_errno(unit, LOG_ERR, r,
4841 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4842 LOG_UNIT_INVOCATION_ID(unit),
4843 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4844 status, command->path),
4845 "EXECUTABLE=%s", command->path);
e1714f02 4846 }
4c2630eb 4847
ff0af2a1 4848 _exit(exit_status);
034c6ed7
LP
4849 }
4850
f2341e0a 4851 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 4852
78f93209
LP
4853 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4854 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4855 * process will be killed too). */
4856 if (subcgroup_path)
4857 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 4858
b58b4116 4859 exec_status_start(&command->exec_status, pid);
9fb86720 4860
034c6ed7 4861 *ret = pid;
5cb5a6ff
LP
4862 return 0;
4863}
4864
034c6ed7
LP
4865void exec_context_init(ExecContext *c) {
4866 assert(c);
4867
4c12626c 4868 c->umask = 0022;
9eba9da4 4869 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
94f04347 4870 c->cpu_sched_policy = SCHED_OTHER;
071830ff 4871 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 4872 c->syslog_level_prefix = true;
353e12c2 4873 c->ignore_sigpipe = true;
3a43da28 4874 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 4875 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
4876 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4877 c->directories[t].mode = 0755;
12213aed 4878 c->timeout_clean_usec = USEC_INFINITY;
a103496c 4879 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
4880 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4881 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 4882 c->log_level_max = -1;
005bfaf1
TM
4883#if HAVE_SECCOMP
4884 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4885#endif
b070c7c0 4886 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
4887}
4888
613b411c 4889void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
4890 assert(c);
4891
6796073e
LP
4892 c->environment = strv_free(c->environment);
4893 c->environment_files = strv_free(c->environment_files);
b4c14404 4894 c->pass_environment = strv_free(c->pass_environment);
00819cc1 4895 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 4896
31ce987c 4897 rlimit_free_all(c->rlimit);
034c6ed7 4898
5b10116e 4899 for (size_t l = 0; l < 3; l++) {
52c239d7 4900 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
4901 c->stdio_file[l] = mfree(c->stdio_file[l]);
4902 }
52c239d7 4903
a1e58e8e
LP
4904 c->working_directory = mfree(c->working_directory);
4905 c->root_directory = mfree(c->root_directory);
915e6d16 4906 c->root_image = mfree(c->root_image);
18d73705 4907 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
4908 c->root_hash = mfree(c->root_hash);
4909 c->root_hash_size = 0;
4910 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
4911 c->root_hash_sig = mfree(c->root_hash_sig);
4912 c->root_hash_sig_size = 0;
4913 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 4914 c->root_verity = mfree(c->root_verity);
93f59701 4915 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
4916 c->tty_path = mfree(c->tty_path);
4917 c->syslog_identifier = mfree(c->syslog_identifier);
4918 c->user = mfree(c->user);
4919 c->group = mfree(c->group);
034c6ed7 4920
6796073e 4921 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 4922
a1e58e8e 4923 c->pam_name = mfree(c->pam_name);
5b6319dc 4924
2a624c36
AP
4925 c->read_only_paths = strv_free(c->read_only_paths);
4926 c->read_write_paths = strv_free(c->read_write_paths);
4927 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
4928 c->exec_paths = strv_free(c->exec_paths);
4929 c->no_exec_paths = strv_free(c->no_exec_paths);
82c121a4 4930
d2d6c096 4931 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
4932 c->bind_mounts = NULL;
4933 c->n_bind_mounts = 0;
2abd4e38
YW
4934 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4935 c->temporary_filesystems = NULL;
4936 c->n_temporary_filesystems = 0;
b3d13314 4937 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 4938
0985c7c4 4939 cpu_set_reset(&c->cpu_set);
b070c7c0 4940 numa_policy_reset(&c->numa_policy);
86a3475b 4941
a1e58e8e
LP
4942 c->utmp_id = mfree(c->utmp_id);
4943 c->selinux_context = mfree(c->selinux_context);
4944 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 4945 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 4946
8cfa775f 4947 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
4948 c->syscall_archs = set_free(c->syscall_archs);
4949 c->address_families = set_free(c->address_families);
e66cf1a3 4950
5b10116e
ZJS
4951 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4952 c->directories[t].paths = strv_free(c->directories[t].paths);
d3070fbd
LP
4953
4954 c->log_level_max = -1;
4955
4956 exec_context_free_log_extra_fields(c);
08f3be7a 4957
5ac1530e
ZJS
4958 c->log_ratelimit_interval_usec = 0;
4959 c->log_ratelimit_burst = 0;
90fc172e 4960
08f3be7a
LP
4961 c->stdin_data = mfree(c->stdin_data);
4962 c->stdin_data_size = 0;
a8d08f39
LP
4963
4964 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 4965 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
4966
4967 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 4968
43144be4 4969 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 4970 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
4971}
4972
34cf6c43 4973int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
4974 char **i;
4975
4976 assert(c);
4977
4978 if (!runtime_prefix)
4979 return 0;
4980
3536f49e 4981 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
c2b2df60 4982 _cleanup_free_ char *p = NULL;
e66cf1a3 4983
494d0247
YW
4984 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4985 p = path_join(runtime_prefix, "private", *i);
4986 else
4987 p = path_join(runtime_prefix, *i);
e66cf1a3
LP
4988 if (!p)
4989 return -ENOMEM;
4990
7bc4bf4a
LP
4991 /* We execute this synchronously, since we need to be sure this is gone when we start the
4992 * service next. */
c6878637 4993 (void) rm_rf(p, REMOVE_ROOT);
e66cf1a3
LP
4994 }
4995
4996 return 0;
5cb5a6ff
LP
4997}
4998
bb0c0d6f
LP
4999int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5000 _cleanup_free_ char *p = NULL;
5001
5002 assert(c);
5003
5004 if (!runtime_prefix || !unit)
5005 return 0;
5006
5007 p = path_join(runtime_prefix, "credentials", unit);
5008 if (!p)
5009 return -ENOMEM;
5010
5011 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5012 * unmount it, and afterwards remove the mount point */
5013 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5014 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5015
5016 return 0;
5017}
5018
34cf6c43 5019static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5020 assert(c);
5021
a1e58e8e 5022 c->path = mfree(c->path);
6796073e 5023 c->argv = strv_free(c->argv);
43d0fcbd
LP
5024}
5025
da6053d0 5026void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5027 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5028 exec_command_done(c+i);
5029}
5030
f1acf85a 5031ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5032 ExecCommand *i;
5033
5034 while ((i = c)) {
71fda00f 5035 LIST_REMOVE(command, c, i);
43d0fcbd 5036 exec_command_done(i);
5cb5a6ff
LP
5037 free(i);
5038 }
f1acf85a
ZJS
5039
5040 return NULL;
5cb5a6ff
LP
5041}
5042
da6053d0 5043void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5044 for (size_t i = 0; i < n; i++)
f1acf85a 5045 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5046}
5047
6a1d4d9f 5048void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5049 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5050 exec_status_reset(&c[i].exec_status);
5051}
5052
5053void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5054 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5055 ExecCommand *z;
5056
5057 LIST_FOREACH(command, z, c[i])
5058 exec_status_reset(&z->exec_status);
5059 }
5060}
5061
039f0e70 5062typedef struct InvalidEnvInfo {
34cf6c43 5063 const Unit *unit;
039f0e70
LP
5064 const char *path;
5065} InvalidEnvInfo;
5066
5067static void invalid_env(const char *p, void *userdata) {
5068 InvalidEnvInfo *info = userdata;
5069
f2341e0a 5070 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5071}
5072
52c239d7
LB
5073const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5074 assert(c);
5075
5076 switch (fd_index) {
5073ff6b 5077
52c239d7
LB
5078 case STDIN_FILENO:
5079 if (c->std_input != EXEC_INPUT_NAMED_FD)
5080 return NULL;
5073ff6b 5081
52c239d7 5082 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5083
52c239d7
LB
5084 case STDOUT_FILENO:
5085 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5086 return NULL;
5073ff6b 5087
52c239d7 5088 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5089
52c239d7
LB
5090 case STDERR_FILENO:
5091 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5092 return NULL;
5073ff6b 5093
52c239d7 5094 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5095
52c239d7
LB
5096 default:
5097 return NULL;
5098 }
5099}
5100
2caa38e9
LP
5101static int exec_context_named_iofds(
5102 const ExecContext *c,
5103 const ExecParameters *p,
5104 int named_iofds[static 3]) {
5105
5b10116e 5106 size_t targets;
56fbd561 5107 const char* stdio_fdname[3];
da6053d0 5108 size_t n_fds;
52c239d7
LB
5109
5110 assert(c);
5111 assert(p);
2caa38e9 5112 assert(named_iofds);
52c239d7
LB
5113
5114 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5115 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5116 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5117
5b10116e 5118 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5119 stdio_fdname[i] = exec_context_fdname(c, i);
5120
4c47affc
FB
5121 n_fds = p->n_storage_fds + p->n_socket_fds;
5122
5b10116e 5123 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5124 if (named_iofds[STDIN_FILENO] < 0 &&
5125 c->std_input == EXEC_INPUT_NAMED_FD &&
5126 stdio_fdname[STDIN_FILENO] &&
5127 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5128
52c239d7
LB
5129 named_iofds[STDIN_FILENO] = p->fds[i];
5130 targets--;
56fbd561
ZJS
5131
5132 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5133 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5134 stdio_fdname[STDOUT_FILENO] &&
5135 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5136
52c239d7
LB
5137 named_iofds[STDOUT_FILENO] = p->fds[i];
5138 targets--;
56fbd561
ZJS
5139
5140 } else if (named_iofds[STDERR_FILENO] < 0 &&
5141 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5142 stdio_fdname[STDERR_FILENO] &&
5143 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5144
52c239d7
LB
5145 named_iofds[STDERR_FILENO] = p->fds[i];
5146 targets--;
5147 }
5148
56fbd561 5149 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5150}
5151
34cf6c43 5152static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5153 char **i, **r = NULL;
5154
5155 assert(c);
5156 assert(l);
5157
5158 STRV_FOREACH(i, c->environment_files) {
5159 char *fn;
52511fae 5160 int k;
8c7be95e
LP
5161 bool ignore = false;
5162 char **p;
7fd1b19b 5163 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5164
5165 fn = *i;
5166
5167 if (fn[0] == '-') {
5168 ignore = true;
313cefa1 5169 fn++;
8c7be95e
LP
5170 }
5171
5172 if (!path_is_absolute(fn)) {
8c7be95e
LP
5173 if (ignore)
5174 continue;
5175
5176 strv_free(r);
5177 return -EINVAL;
5178 }
5179
2bef10ab 5180 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5181 k = safe_glob(fn, 0, &pglob);
5182 if (k < 0) {
2bef10ab
PL
5183 if (ignore)
5184 continue;
8c7be95e 5185
2bef10ab 5186 strv_free(r);
d8c92e8b 5187 return k;
2bef10ab 5188 }
8c7be95e 5189
d8c92e8b
ZJS
5190 /* When we don't match anything, -ENOENT should be returned */
5191 assert(pglob.gl_pathc > 0);
5192
5b10116e 5193 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5194 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5195 if (k < 0) {
5196 if (ignore)
5197 continue;
8c7be95e 5198
2bef10ab 5199 strv_free(r);
2bef10ab 5200 return k;
e9c1ea9d 5201 }
ebc05a09 5202 /* Log invalid environment variables with filename */
039f0e70
LP
5203 if (p) {
5204 InvalidEnvInfo info = {
f2341e0a 5205 .unit = unit,
039f0e70
LP
5206 .path = pglob.gl_pathv[n]
5207 };
5208
5209 p = strv_env_clean_with_callback(p, invalid_env, &info);
5210 }
8c7be95e 5211
234519ae 5212 if (!r)
2bef10ab
PL
5213 r = p;
5214 else {
5215 char **m;
8c7be95e 5216
2bef10ab
PL
5217 m = strv_env_merge(2, r, p);
5218 strv_free(r);
5219 strv_free(p);
c84a9488 5220 if (!m)
2bef10ab 5221 return -ENOMEM;
2bef10ab
PL
5222
5223 r = m;
5224 }
8c7be95e
LP
5225 }
5226 }
5227
5228 *l = r;
5229
5230 return 0;
5231}
5232
6ac8fdc9 5233static bool tty_may_match_dev_console(const char *tty) {
7b912648 5234 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5235
1e22b5cd
LP
5236 if (!tty)
5237 return true;
5238
a119ec7c 5239 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5240
5241 /* trivial identity? */
5242 if (streq(tty, "console"))
5243 return true;
5244
7b912648
LP
5245 if (resolve_dev_console(&resolved) < 0)
5246 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5247
5248 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5249 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5250}
5251
6c0ae739
LP
5252static bool exec_context_may_touch_tty(const ExecContext *ec) {
5253 assert(ec);
1e22b5cd 5254
6c0ae739 5255 return ec->tty_reset ||
1e22b5cd
LP
5256 ec->tty_vhangup ||
5257 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5258 is_terminal_input(ec->std_input) ||
5259 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5260 is_terminal_output(ec->std_error);
5261}
5262
5263bool exec_context_may_touch_console(const ExecContext *ec) {
5264
5265 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5266 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5267}
5268
/* Writes every string of the NULL-terminated list l to f, each one preceded by a single space.
 * A NULL list writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
5277
/* Prints one "<prefix><name>: item item ..." line for a string list. Nothing at all is printed
 * when the list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5289
34cf6c43 5290void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
5291f26d 5291 char **e, **d;
add00535 5292 int r;
9eba9da4 5293
5cb5a6ff
LP
5294 assert(c);
5295 assert(f);
5296
4ad49000 5297 prefix = strempty(prefix);
5cb5a6ff
LP
5298
5299 fprintf(f,
94f04347
LP
5300 "%sUMask: %04o\n"
5301 "%sWorkingDirectory: %s\n"
451a074f 5302 "%sRootDirectory: %s\n"
15ae422b 5303 "%sNonBlocking: %s\n"
64747e2d 5304 "%sPrivateTmp: %s\n"
7f112f50 5305 "%sPrivateDevices: %s\n"
59eeb84b 5306 "%sProtectKernelTunables: %s\n"
e66a2f65 5307 "%sProtectKernelModules: %s\n"
84703040 5308 "%sProtectKernelLogs: %s\n"
fc64760d 5309 "%sProtectClock: %s\n"
59eeb84b 5310 "%sProtectControlGroups: %s\n"
d251207d
LP
5311 "%sPrivateNetwork: %s\n"
5312 "%sPrivateUsers: %s\n"
1b8689f9
LP
5313 "%sProtectHome: %s\n"
5314 "%sProtectSystem: %s\n"
5d997827 5315 "%sMountAPIVFS: %s\n"
f3e43635 5316 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5317 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5318 "%sRestrictRealtime: %s\n"
f69567cb 5319 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5320 "%sKeyringMode: %s\n"
4e399953
LP
5321 "%sProtectHostname: %s\n"
5322 "%sProtectProc: %s\n"
5323 "%sProcSubset: %s\n",
5cb5a6ff 5324 prefix, c->umask,
14eb3285
LP
5325 prefix, empty_to_root(c->working_directory),
5326 prefix, empty_to_root(c->root_directory),
15ae422b 5327 prefix, yes_no(c->non_blocking),
64747e2d 5328 prefix, yes_no(c->private_tmp),
7f112f50 5329 prefix, yes_no(c->private_devices),
59eeb84b 5330 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5331 prefix, yes_no(c->protect_kernel_modules),
84703040 5332 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5333 prefix, yes_no(c->protect_clock),
59eeb84b 5334 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5335 prefix, yes_no(c->private_network),
5336 prefix, yes_no(c->private_users),
1b8689f9
LP
5337 prefix, protect_home_to_string(c->protect_home),
5338 prefix, protect_system_to_string(c->protect_system),
5e98086d 5339 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5340 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5341 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5342 prefix, yes_no(c->restrict_realtime),
f69567cb 5343 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5344 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5345 prefix, yes_no(c->protect_hostname),
5346 prefix, protect_proc_to_string(c->protect_proc),
5347 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5348
915e6d16
LP
5349 if (c->root_image)
5350 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5351
18d73705
LB
5352 if (c->root_image_options) {
5353 MountOptions *o;
5354
5355 fprintf(f, "%sRootImageOptions:", prefix);
5356 LIST_FOREACH(mount_options, o, c->root_image_options)
5357 if (!isempty(o->options))
9ece6444
LB
5358 fprintf(f, " %s:%s",
5359 partition_designator_to_string(o->partition_designator),
5360 o->options);
18d73705
LB
5361 fprintf(f, "\n");
5362 }
5363
0389f4fa
LB
5364 if (c->root_hash) {
5365 _cleanup_free_ char *encoded = NULL;
5366 encoded = hexmem(c->root_hash, c->root_hash_size);
5367 if (encoded)
5368 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5369 }
5370
5371 if (c->root_hash_path)
5372 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5373
d4d55b0d
LB
5374 if (c->root_hash_sig) {
5375 _cleanup_free_ char *encoded = NULL;
5376 ssize_t len;
5377 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5378 if (len)
5379 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5380 }
5381
5382 if (c->root_hash_sig_path)
5383 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5384
0389f4fa
LB
5385 if (c->root_verity)
5386 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5387
8c7be95e
LP
5388 STRV_FOREACH(e, c->environment)
5389 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5390
5391 STRV_FOREACH(e, c->environment_files)
5392 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5393
b4c14404
FB
5394 STRV_FOREACH(e, c->pass_environment)
5395 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5396
00819cc1
LP
5397 STRV_FOREACH(e, c->unset_environment)
5398 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5399
53f47dfc
YW
5400 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5401
5b10116e 5402 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5403 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5404
5405 STRV_FOREACH(d, c->directories[dt].paths)
5406 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5407 }
c2bbd90b 5408
5291f26d 5409 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5410
fb33a393 5411 if (c->nice_set)
5291f26d 5412 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5413
dd6c17b1 5414 if (c->oom_score_adjust_set)
5291f26d 5415 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5416
ad21e542 5417 if (c->coredump_filter_set)
5291f26d 5418 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5419
5b10116e 5420 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5421 if (c->rlimit[i]) {
4c3a2b84 5422 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5423 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5424 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5425 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5426 }
94f04347 5427
f8b69d1d 5428 if (c->ioprio_set) {
1756a011 5429 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5430
837df140
YW
5431 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5432 if (r >= 0)
5433 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5434
5435 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
f8b69d1d 5436 }
94f04347 5437
f8b69d1d 5438 if (c->cpu_sched_set) {
1756a011 5439 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5440
837df140
YW
5441 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5442 if (r >= 0)
5443 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5444
94f04347 5445 fprintf(f,
38b48754
LP
5446 "%sCPUSchedulingPriority: %i\n"
5447 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5448 prefix, c->cpu_sched_priority,
5449 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5450 }
94f04347 5451
0985c7c4 5452 if (c->cpu_set.set) {
e7fca352
MS
5453 _cleanup_free_ char *affinity = NULL;
5454
5455 affinity = cpu_set_to_range_string(&c->cpu_set);
5456 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5457 }
5458
b070c7c0
MS
5459 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5460 _cleanup_free_ char *nodes = NULL;
5461
5462 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5463 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5464 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5465 }
5466
3a43da28 5467 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5468 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5469
5470 fprintf(f,
80876c20
LP
5471 "%sStandardInput: %s\n"
5472 "%sStandardOutput: %s\n"
5473 "%sStandardError: %s\n",
5474 prefix, exec_input_to_string(c->std_input),
5475 prefix, exec_output_to_string(c->std_output),
5476 prefix, exec_output_to_string(c->std_error));
5477
befc4a80
LP
5478 if (c->std_input == EXEC_INPUT_NAMED_FD)
5479 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5480 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5481 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5482 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5483 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5484
5485 if (c->std_input == EXEC_INPUT_FILE)
5486 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5487 if (c->std_output == EXEC_OUTPUT_FILE)
5488 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5489 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5490 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5491 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5492 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5493 if (c->std_error == EXEC_OUTPUT_FILE)
5494 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5495 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5496 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5497 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5498 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5499
80876c20
LP
5500 if (c->tty_path)
5501 fprintf(f,
6ea832a2
LP
5502 "%sTTYPath: %s\n"
5503 "%sTTYReset: %s\n"
5504 "%sTTYVHangup: %s\n"
5505 "%sTTYVTDisallocate: %s\n",
5506 prefix, c->tty_path,
5507 prefix, yes_no(c->tty_reset),
5508 prefix, yes_no(c->tty_vhangup),
5509 prefix, yes_no(c->tty_vt_disallocate));
94f04347 5510
9f6444eb 5511 if (IN_SET(c->std_output,
9f6444eb
LP
5512 EXEC_OUTPUT_KMSG,
5513 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5514 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5515 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5516 IN_SET(c->std_error,
9f6444eb
LP
5517 EXEC_OUTPUT_KMSG,
5518 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5519 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5520 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5521
5ce70e5b 5522 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5523
837df140
YW
5524 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5525 if (r >= 0)
5526 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5527
837df140
YW
5528 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5529 if (r >= 0)
5530 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5531 }
94f04347 5532
d3070fbd
LP
5533 if (c->log_level_max >= 0) {
5534 _cleanup_free_ char *t = NULL;
5535
5536 (void) log_level_to_string_alloc(c->log_level_max, &t);
5537
5538 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5539 }
5540
5291f26d 5541 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5542 fprintf(f,
5543 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5544 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5545
5ac1530e
ZJS
5546 if (c->log_ratelimit_burst > 0)
5547 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5548
5b10116e
ZJS
5549 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5550 fprintf(f, "%sLogExtraFields: ", prefix);
5551 fwrite(c->log_extra_fields[j].iov_base,
5552 1, c->log_extra_fields[j].iov_len,
5553 f);
5554 fputc('\n', f);
d3070fbd
LP
5555 }
5556
91dd5f7c
LP
5557 if (c->log_namespace)
5558 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5559
07d46372
YW
5560 if (c->secure_bits) {
5561 _cleanup_free_ char *str = NULL;
5562
5563 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5564 if (r >= 0)
5565 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5566 }
94f04347 5567
a103496c 5568 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5569 _cleanup_free_ char *str = NULL;
94f04347 5570
dd1f5bd0
YW
5571 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5572 if (r >= 0)
5573 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5574 }
5575
5576 if (c->capability_ambient_set != 0) {
dd1f5bd0 5577 _cleanup_free_ char *str = NULL;
755d4b67 5578
dd1f5bd0
YW
5579 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5580 if (r >= 0)
5581 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5582 }
5583
5584 if (c->user)
f2d3769a 5585 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5586 if (c->group)
f2d3769a 5587 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5588
29206d46
LP
5589 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5590
ddc155b2 5591 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5592
5b6319dc 5593 if (c->pam_name)
f2d3769a 5594 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5595
ddc155b2
TM
5596 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5597 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5598 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5599 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5600 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
2e22afe9 5601
5b10116e
ZJS
5602 for (size_t i = 0; i < c->n_bind_mounts; i++)
5603 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5604 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5605 c->bind_mounts[i].ignore_enoent ? "-": "",
5606 c->bind_mounts[i].source,
5607 c->bind_mounts[i].destination,
5608 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5609
5b10116e
ZJS
5610 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5611 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5612
5b10116e
ZJS
5613 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5614 t->path,
5615 isempty(t->options) ? "" : ":",
5616 strempty(t->options));
5617 }
2abd4e38 5618
169c1bda
LP
5619 if (c->utmp_id)
5620 fprintf(f,
5621 "%sUtmpIdentifier: %s\n",
5622 prefix, c->utmp_id);
7b52a628
MS
5623
5624 if (c->selinux_context)
5625 fprintf(f,
5f8640fb
LP
5626 "%sSELinuxContext: %s%s\n",
5627 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5628
80c21aea
WC
5629 if (c->apparmor_profile)
5630 fprintf(f,
5631 "%sAppArmorProfile: %s%s\n",
5632 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5633
5634 if (c->smack_process_label)
5635 fprintf(f,
5636 "%sSmackProcessLabel: %s%s\n",
5637 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5638
050f7277 5639 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5640 fprintf(f,
5641 "%sPersonality: %s\n",
5642 prefix, strna(personality_to_string(c->personality)));
5643
78e864e5
TM
5644 fprintf(f,
5645 "%sLockPersonality: %s\n",
5646 prefix, yes_no(c->lock_personality));
5647
17df7223 5648 if (c->syscall_filter) {
349cc4a5 5649#if HAVE_SECCOMP
8cfa775f 5650 void *id, *val;
17df7223 5651 bool first = true;
351a19b1 5652#endif
17df7223
LP
5653
5654 fprintf(f,
57183d11 5655 "%sSystemCallFilter: ",
17df7223
LP
5656 prefix);
5657
6b000af4 5658 if (!c->syscall_allow_list)
17df7223
LP
5659 fputc('~', f);
5660
349cc4a5 5661#if HAVE_SECCOMP
90e74a66 5662 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5663 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5664 const char *errno_name = NULL;
5665 int num = PTR_TO_INT(val);
17df7223
LP
5666
5667 if (first)
5668 first = false;
5669 else
5670 fputc(' ', f);
5671
57183d11 5672 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5673 fputs(strna(name), f);
8cfa775f
YW
5674
5675 if (num >= 0) {
005bfaf1 5676 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5677 if (errno_name)
5678 fprintf(f, ":%s", errno_name);
5679 else
5680 fprintf(f, ":%d", num);
5681 }
17df7223 5682 }
351a19b1 5683#endif
17df7223
LP
5684
5685 fputc('\n', f);
5686 }
5687
57183d11 5688 if (c->syscall_archs) {
349cc4a5 5689#if HAVE_SECCOMP
57183d11
LP
5690 void *id;
5691#endif
5692
5693 fprintf(f,
5694 "%sSystemCallArchitectures:",
5695 prefix);
5696
349cc4a5 5697#if HAVE_SECCOMP
90e74a66 5698 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5699 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5700#endif
5701 fputc('\n', f);
5702 }
5703
add00535
LP
5704 if (exec_context_restrict_namespaces_set(c)) {
5705 _cleanup_free_ char *s = NULL;
5706
86c2a9f1 5707 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5708 if (r >= 0)
5709 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5710 prefix, strna(s));
add00535
LP
5711 }
5712
a8d08f39
LP
5713 if (c->network_namespace_path)
5714 fprintf(f,
5715 "%sNetworkNamespacePath: %s\n",
5716 prefix, c->network_namespace_path);
5717
3df90f24 5718 if (c->syscall_errno > 0) {
005bfaf1 5719#if HAVE_SECCOMP
3df90f24 5720 const char *errno_name;
005bfaf1 5721#endif
3df90f24
YW
5722
5723 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5724
005bfaf1
TM
5725#if HAVE_SECCOMP
5726 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5727 if (errno_name)
005bfaf1 5728 fputs(errno_name, f);
3df90f24 5729 else
005bfaf1
TM
5730 fprintf(f, "%d", c->syscall_errno);
5731#endif
5732 fputc('\n', f);
3df90f24 5733 }
b3d13314 5734
5b10116e 5735 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5736 MountOptions *o;
5737
79e20ceb 5738 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5739 c->mount_images[i].ignore_enoent ? "-": "",
5740 c->mount_images[i].source,
79e20ceb 5741 c->mount_images[i].destination);
427353f6 5742 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5743 fprintf(f, ":%s:%s",
427353f6 5744 partition_designator_to_string(o->partition_designator),
79e20ceb 5745 strempty(o->options));
427353f6
LB
5746 fprintf(f, "\n");
5747 }
93f59701
LB
5748
5749 for (size_t i = 0; i < c->n_extension_images; i++) {
5750 MountOptions *o;
5751
5752 fprintf(f, "%sExtensionImages: %s%s", prefix,
5753 c->extension_images[i].ignore_enoent ? "-": "",
5754 c->extension_images[i].source);
5755 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5756 fprintf(f, ":%s:%s",
5757 partition_designator_to_string(o->partition_designator),
5758 strempty(o->options));
5759 fprintf(f, "\n");
5760 }
5cb5a6ff
LP
5761}
5762
34cf6c43 5763bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
5764 assert(c);
5765
61233823 5766 /* Returns true if the process forked off would run under
a931ad47
LP
5767 * an unchanged UID or as root. */
5768
5769 if (!c->user)
5770 return true;
5771
5772 if (streq(c->user, "root") || streq(c->user, "0"))
5773 return true;
5774
5775 return false;
5776}
5777
34cf6c43 5778int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
5779 int p;
5780
5781 assert(c);
5782
5783 if (c->ioprio_set)
5784 return c->ioprio;
5785
5786 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5787 if (p < 0)
5788 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5789
5790 return p;
5791}
5792
5e98086d
ZJS
5793bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5794 assert(c);
5795
61198784 5796 /* Explicit setting wins */
5e98086d
ZJS
5797 if (c->mount_apivfs_set)
5798 return c->mount_apivfs;
5799
61198784 5800 /* Default to "yes" if root directory or image are specified */
74e12520 5801 if (exec_context_with_rootfs(c))
61198784
ZJS
5802 return true;
5803
5e98086d
ZJS
5804 return false;
5805}
5806
d3070fbd 5807void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
5808 assert(c);
5809
5b10116e 5810 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
5811 free(c->log_extra_fields[l].iov_base);
5812 c->log_extra_fields = mfree(c->log_extra_fields);
5813 c->n_log_extra_fields = 0;
5814}
5815
6f765baf 5816void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
5817 _cleanup_close_ int fd = -1;
5818 const char *path;
5819 struct stat st;
6f765baf
LP
5820 int r;
5821
5822 assert(c);
5823
5824 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5825 exec_context_tty_reset(c, NULL);
5826
5827 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5828 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5829 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
5830 if (!exec_context_may_touch_tty(c))
5831 return;
6f765baf 5832
0ba976e8
LP
5833 path = exec_context_tty_path(c);
5834 if (!path)
5835 return;
6f765baf 5836
0ba976e8
LP
5837 fd = open(path, O_PATH|O_CLOEXEC);
5838 if (fd < 0)
5839 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
5840 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
5841 path);
5842
5843 if (fstat(fd, &st) < 0)
5844 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
5845
5846 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
5847 * if things are a character device, since a proper check either means we'd have to open the TTY and
5848 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
5849 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
5850 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
5851 if (!S_ISCHR(st.st_mode))
5852 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
5853
5854 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
5855 if (r < 0)
5856 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
5857}
5858
4c2f5842
LP
5859int exec_context_get_clean_directories(
5860 ExecContext *c,
5861 char **prefix,
5862 ExecCleanMask mask,
5863 char ***ret) {
5864
5865 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
5866 int r;
5867
5868 assert(c);
5869 assert(prefix);
5870 assert(ret);
5871
5b10116e 5872 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
5873 char **i;
5874
5875 if (!FLAGS_SET(mask, 1U << t))
5876 continue;
5877
5878 if (!prefix[t])
5879 continue;
5880
5881 STRV_FOREACH(i, c->directories[t].paths) {
5882 char *j;
5883
5884 j = path_join(prefix[t], *i);
5885 if (!j)
5886 return -ENOMEM;
5887
5888 r = strv_consume(&l, j);
5889 if (r < 0)
5890 return r;
7f622a19
YW
5891
5892 /* Also remove private directories unconditionally. */
5893 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5894 j = path_join(prefix[t], "private", *i);
5895 if (!j)
5896 return -ENOMEM;
5897
5898 r = strv_consume(&l, j);
5899 if (r < 0)
5900 return r;
5901 }
4c2f5842
LP
5902 }
5903 }
5904
5905 *ret = TAKE_PTR(l);
5906 return 0;
5907}
5908
5909int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5910 ExecCleanMask mask = 0;
5911
5912 assert(c);
5913 assert(ret);
5914
5915 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5916 if (!strv_isempty(c->directories[t].paths))
5917 mask |= 1U << t;
5918
5919 *ret = mask;
5920 return 0;
5921}
5922
b58b4116 5923void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 5924 assert(s);
5cb5a6ff 5925
2ed26ed0
LP
5926 *s = (ExecStatus) {
5927 .pid = pid,
5928 };
5929
b58b4116
LP
5930 dual_timestamp_get(&s->start_timestamp);
5931}
5932
34cf6c43 5933void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
5934 assert(s);
5935
d46b79bb 5936 if (s->pid != pid)
2ed26ed0
LP
5937 *s = (ExecStatus) {
5938 .pid = pid,
5939 };
b58b4116 5940
63983207 5941 dual_timestamp_get(&s->exit_timestamp);
9fb86720 5942
034c6ed7
LP
5943 s->code = code;
5944 s->status = status;
169c1bda 5945
6f765baf
LP
5946 if (context && context->utmp_id)
5947 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
5948}
5949
6a1d4d9f
LP
5950void exec_status_reset(ExecStatus *s) {
5951 assert(s);
5952
5953 *s = (ExecStatus) {};
5954}
5955
34cf6c43 5956void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
5957 assert(s);
5958 assert(f);
5959
9fb86720
LP
5960 if (s->pid <= 0)
5961 return;
5962
4c940960
LP
5963 prefix = strempty(prefix);
5964
9fb86720 5965 fprintf(f,
ccd06097
ZJS
5966 "%sPID: "PID_FMT"\n",
5967 prefix, s->pid);
9fb86720 5968
af9d16e1 5969 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
5970 fprintf(f,
5971 "%sStart Timestamp: %s\n",
04f5c018 5972 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 5973
af9d16e1 5974 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
5975 fprintf(f,
5976 "%sExit Timestamp: %s\n"
5977 "%sExit Code: %s\n"
5978 "%sExit Status: %i\n",
04f5c018 5979 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
5980 prefix, sigchld_code_to_string(s->code),
5981 prefix, s->status);
5cb5a6ff 5982}
44d8db9e 5983
34cf6c43 5984static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 5985 _cleanup_free_ char *cmd = NULL;
4c940960 5986 const char *prefix2;
44d8db9e
LP
5987
5988 assert(c);
5989 assert(f);
5990
4c940960 5991 prefix = strempty(prefix);
63c372cb 5992 prefix2 = strjoina(prefix, "\t");
44d8db9e 5993
8a62620e 5994 cmd = quote_command_line(c->argv);
44d8db9e
LP
5995 fprintf(f,
5996 "%sCommand Line: %s\n",
4bbccb02 5997 prefix, cmd ? cmd : strerror_safe(ENOMEM));
44d8db9e 5998
9fb86720 5999 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6000}
6001
6002void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6003 assert(f);
6004
4c940960 6005 prefix = strempty(prefix);
44d8db9e
LP
6006
6007 LIST_FOREACH(command, c, c)
6008 exec_command_dump(c, f, prefix);
6009}
94f04347 6010
a6a80b4f
LP
6011void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6012 ExecCommand *end;
6013
6014 assert(l);
6015 assert(e);
6016
6017 if (*l) {
35b8ca3a 6018 /* It's kind of important, that we keep the order here */
71fda00f
LP
6019 LIST_FIND_TAIL(command, *l, end);
6020 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6021 } else
6022 *l = e;
6023}
6024
26fd040d
LP
6025int exec_command_set(ExecCommand *c, const char *path, ...) {
6026 va_list ap;
6027 char **l, *p;
6028
6029 assert(c);
6030 assert(path);
6031
6032 va_start(ap, path);
6033 l = strv_new_ap(path, ap);
6034 va_end(ap);
6035
6036 if (!l)
6037 return -ENOMEM;
6038
250a918d
LP
6039 p = strdup(path);
6040 if (!p) {
26fd040d
LP
6041 strv_free(l);
6042 return -ENOMEM;
6043 }
6044
6897dfe8 6045 free_and_replace(c->path, p);
26fd040d 6046
130d3d22 6047 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6048}
6049
86b23b07 6050int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6051 _cleanup_strv_free_ char **l = NULL;
86b23b07 6052 va_list ap;
86b23b07
JS
6053 int r;
6054
6055 assert(c);
6056 assert(path);
6057
6058 va_start(ap, path);
6059 l = strv_new_ap(path, ap);
6060 va_end(ap);
6061
6062 if (!l)
6063 return -ENOMEM;
6064
e287086b 6065 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6066 if (r < 0)
86b23b07 6067 return r;
86b23b07
JS
6068
6069 return 0;
6070}
6071
e8a565cb
YW
6072static void *remove_tmpdir_thread(void *p) {
6073 _cleanup_free_ char *path = p;
86b23b07 6074
e8a565cb
YW
6075 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6076 return NULL;
6077}
6078
6079static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6080 int r;
6081
6082 if (!rt)
6083 return NULL;
6084
6085 if (rt->manager)
6086 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6087
6088 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6089
6090 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6091 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6092
6093 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6094 if (r < 0)
e8a565cb 6095 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6096 else
6097 rt->tmp_dir = NULL;
e8a565cb 6098 }
613b411c 6099
56a13a49 6100 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6101 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6102
6103 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6104 if (r < 0)
e8a565cb 6105 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6106 else
6107 rt->var_tmp_dir = NULL;
e8a565cb
YW
6108 }
6109
6110 rt->id = mfree(rt->id);
6111 rt->tmp_dir = mfree(rt->tmp_dir);
6112 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6113 safe_close_pair(rt->netns_storage_socket);
a70581ff 6114 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6115 return mfree(rt);
6116}
6117
6118static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6119 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6120}
6121
56a13a49
ZJS
6122static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6123 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6124 ExecRuntime *n;
613b411c 6125
8e8009dc 6126 assert(ret);
613b411c 6127
56a13a49
ZJS
6128 id_copy = strdup(id);
6129 if (!id_copy)
6130 return -ENOMEM;
6131
8e8009dc
LP
6132 n = new(ExecRuntime, 1);
6133 if (!n)
613b411c
LP
6134 return -ENOMEM;
6135
8e8009dc 6136 *n = (ExecRuntime) {
56a13a49 6137 .id = TAKE_PTR(id_copy),
8e8009dc 6138 .netns_storage_socket = { -1, -1 },
a70581ff 6139 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6140 };
6141
6142 *ret = n;
613b411c
LP
6143 return 0;
6144}
6145
e8a565cb
YW
6146static int exec_runtime_add(
6147 Manager *m,
6148 const char *id,
56a13a49
ZJS
6149 char **tmp_dir,
6150 char **var_tmp_dir,
6151 int netns_storage_socket[2],
a70581ff 6152 int ipcns_storage_socket[2],
e8a565cb
YW
6153 ExecRuntime **ret) {
6154
6155 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6156 int r;
6157
e8a565cb 6158 assert(m);
613b411c
LP
6159 assert(id);
6160
a70581ff 6161 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6162
56a13a49 6163 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6164 if (r < 0)
6165 return r;
6166
63083706 6167 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6168 if (r < 0)
6169 return r;
e8a565cb 6170
56a13a49
ZJS
6171 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6172 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6173 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6174
6175 if (netns_storage_socket) {
56a13a49
ZJS
6176 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6177 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6178 }
6179
a70581ff
XR
6180 if (ipcns_storage_socket) {
6181 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6182 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6183 }
6184
e8a565cb
YW
6185 rt->manager = m;
6186
6187 if (ret)
6188 *ret = rt;
e8a565cb 6189 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6190 TAKE_PTR(rt);
e8a565cb
YW
6191 return 0;
6192}
6193
74aaf59b
LP
6194static int exec_runtime_make(
6195 Manager *m,
6196 const ExecContext *c,
6197 const char *id,
6198 ExecRuntime **ret) {
6199
56a13a49 6200 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6201 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6202 int r;
6203
6204 assert(m);
6205 assert(c);
6206 assert(id);
6207
6208 /* It is not necessary to create ExecRuntime object. */
a70581ff 6209 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6210 *ret = NULL;
e8a565cb 6211 return 0;
74aaf59b 6212 }
e8a565cb 6213
efa2f3a1
TM
6214 if (c->private_tmp &&
6215 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6216 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6217 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6218 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6219 if (r < 0)
6220 return r;
6221 }
6222
a8d08f39 6223 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6224 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6225 return -errno;
6226 }
6227
a70581ff
XR
6228 if (c->private_ipc || c->ipc_namespace_path) {
6229 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6230 return -errno;
6231 }
6232
6233 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6234 if (r < 0)
6235 return r;
6236
613b411c
LP
6237 return 1;
6238}
6239
e8a565cb
YW
6240int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6241 ExecRuntime *rt;
6242 int r;
613b411c 6243
e8a565cb
YW
6244 assert(m);
6245 assert(id);
6246 assert(ret);
6247
6248 rt = hashmap_get(m->exec_runtime_by_id, id);
6249 if (rt)
387f6955 6250 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6251 goto ref;
6252
74aaf59b
LP
6253 if (!create) {
6254 *ret = NULL;
e8a565cb 6255 return 0;
74aaf59b 6256 }
e8a565cb
YW
6257
6258 /* If not found, then create a new object. */
6259 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6260 if (r < 0)
e8a565cb 6261 return r;
74aaf59b
LP
6262 if (r == 0) {
6263 /* When r == 0, it is not necessary to create ExecRuntime object. */
6264 *ret = NULL;
6265 return 0;
6266 }
613b411c 6267
e8a565cb
YW
6268ref:
6269 /* increment reference counter. */
6270 rt->n_ref++;
6271 *ret = rt;
6272 return 1;
6273}
613b411c 6274
e8a565cb
YW
6275ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6276 if (!rt)
613b411c
LP
6277 return NULL;
6278
e8a565cb 6279 assert(rt->n_ref > 0);
613b411c 6280
e8a565cb
YW
6281 rt->n_ref--;
6282 if (rt->n_ref > 0)
f2341e0a
LP
6283 return NULL;
6284
e8a565cb 6285 return exec_runtime_free(rt, destroy);
613b411c
LP
6286}
6287
e8a565cb
YW
6288int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6289 ExecRuntime *rt;
e8a565cb
YW
6290
6291 assert(m);
613b411c
LP
6292 assert(f);
6293 assert(fds);
6294
90e74a66 6295 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6296 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6297
e8a565cb
YW
6298 if (rt->tmp_dir)
6299 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6300
e8a565cb
YW
6301 if (rt->var_tmp_dir)
6302 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6303
e8a565cb
YW
6304 if (rt->netns_storage_socket[0] >= 0) {
6305 int copy;
613b411c 6306
e8a565cb
YW
6307 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6308 if (copy < 0)
6309 return copy;
613b411c 6310
e8a565cb
YW
6311 fprintf(f, " netns-socket-0=%i", copy);
6312 }
613b411c 6313
e8a565cb
YW
6314 if (rt->netns_storage_socket[1] >= 0) {
6315 int copy;
613b411c 6316
e8a565cb
YW
6317 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6318 if (copy < 0)
6319 return copy;
613b411c 6320
e8a565cb
YW
6321 fprintf(f, " netns-socket-1=%i", copy);
6322 }
6323
a70581ff
XR
6324 if (rt->ipcns_storage_socket[0] >= 0) {
6325 int copy;
6326
6327 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6328 if (copy < 0)
6329 return copy;
6330
6331 fprintf(f, " ipcns-socket-0=%i", copy);
6332 }
6333
6334 if (rt->ipcns_storage_socket[1] >= 0) {
6335 int copy;
6336
6337 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6338 if (copy < 0)
6339 return copy;
6340
6341 fprintf(f, " ipcns-socket-1=%i", copy);
6342 }
6343
e8a565cb 6344 fputc('\n', f);
613b411c
LP
6345 }
6346
6347 return 0;
6348}
6349
e8a565cb
YW
6350int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6351 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6352 ExecRuntime *rt;
613b411c
LP
6353 int r;
6354
e8a565cb
YW
6355 /* This is for the migration from old (v237 or earlier) deserialization text.
6356 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6357 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6358 * so or not from the serialized text, then we always creates a new object owned by this. */
6359
6360 assert(u);
613b411c
LP
6361 assert(key);
6362 assert(value);
6363
e8a565cb
YW
6364 /* Manager manages ExecRuntime objects by the unit id.
6365 * So, we omit the serialized text when the unit does not have id (yet?)... */
6366 if (isempty(u->id)) {
6367 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6368 return 0;
6369 }
613b411c 6370
cbc165d1
ZJS
6371 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6372 return log_oom();
e8a565cb
YW
6373
6374 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6375 if (!rt) {
cbc165d1 6376 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6377 return log_oom();
613b411c 6378
e8a565cb
YW
6379 rt = rt_create;
6380 }
6381
6382 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6383 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6384 return -ENOMEM;
613b411c
LP
6385
6386 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6387 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6388 return -ENOMEM;
613b411c
LP
6389
6390 } else if (streq(key, "netns-socket-0")) {
6391 int fd;
6392
e8a565cb 6393 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6394 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6395 return 0;
613b411c 6396 }
e8a565cb
YW
6397
6398 safe_close(rt->netns_storage_socket[0]);
6399 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6400
613b411c
LP
6401 } else if (streq(key, "netns-socket-1")) {
6402 int fd;
6403
e8a565cb 6404 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6405 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6406 return 0;
613b411c 6407 }
e8a565cb
YW
6408
6409 safe_close(rt->netns_storage_socket[1]);
6410 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6411
613b411c
LP
6412 } else
6413 return 0;
6414
e8a565cb
YW
6415 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6416 if (rt_create) {
6417 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6418 if (r < 0) {
3fe91079 6419 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6420 return 0;
6421 }
613b411c 6422
e8a565cb 6423 rt_create->manager = u->manager;
613b411c 6424
e8a565cb 6425 /* Avoid cleanup */
56a13a49 6426 TAKE_PTR(rt_create);
e8a565cb 6427 }
98b47d54 6428
e8a565cb
YW
6429 return 1;
6430}
613b411c 6431
56a13a49
ZJS
6432int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6433 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6434 char *id = NULL;
a70581ff 6435 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6436 const char *p, *v = value;
6437 size_t n;
613b411c 6438
e8a565cb
YW
6439 assert(m);
6440 assert(value);
6441 assert(fds);
98b47d54 6442
e8a565cb
YW
6443 n = strcspn(v, " ");
6444 id = strndupa(v, n);
6445 if (v[n] != ' ')
6446 goto finalize;
6447 p = v + n + 1;
6448
6449 v = startswith(p, "tmp-dir=");
6450 if (v) {
6451 n = strcspn(v, " ");
56a13a49
ZJS
6452 tmp_dir = strndup(v, n);
6453 if (!tmp_dir)
6454 return log_oom();
e8a565cb
YW
6455 if (v[n] != ' ')
6456 goto finalize;
6457 p = v + n + 1;
6458 }
6459
6460 v = startswith(p, "var-tmp-dir=");
6461 if (v) {
6462 n = strcspn(v, " ");
56a13a49
ZJS
6463 var_tmp_dir = strndup(v, n);
6464 if (!var_tmp_dir)
6465 return log_oom();
e8a565cb
YW
6466 if (v[n] != ' ')
6467 goto finalize;
6468 p = v + n + 1;
6469 }
6470
6471 v = startswith(p, "netns-socket-0=");
6472 if (v) {
6473 char *buf;
6474
6475 n = strcspn(v, " ");
6476 buf = strndupa(v, n);
c413bb28 6477
a70581ff 6478 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6479 if (r < 0)
6480 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6481 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6482 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6483 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6484 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6485 if (v[n] != ' ')
6486 goto finalize;
6487 p = v + n + 1;
613b411c
LP
6488 }
6489
e8a565cb
YW
6490 v = startswith(p, "netns-socket-1=");
6491 if (v) {
6492 char *buf;
98b47d54 6493
e8a565cb
YW
6494 n = strcspn(v, " ");
6495 buf = strndupa(v, n);
a70581ff
XR
6496
6497 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6498 if (r < 0)
6499 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6500 if (!fdset_contains(fds, netns_fdpair[1]))
6501 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6502 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6503 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6504 if (v[n] != ' ')
6505 goto finalize;
6506 p = v + n + 1;
6507 }
6508
6509 v = startswith(p, "ipcns-socket-0=");
6510 if (v) {
6511 char *buf;
6512
6513 n = strcspn(v, " ");
6514 buf = strndupa(v, n);
6515
6516 r = safe_atoi(buf, &ipcns_fdpair[0]);
6517 if (r < 0)
6518 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6519 if (!fdset_contains(fds, ipcns_fdpair[0]))
6520 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6521 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6522 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6523 if (v[n] != ' ')
6524 goto finalize;
6525 p = v + n + 1;
6526 }
6527
6528 v = startswith(p, "ipcns-socket-1=");
6529 if (v) {
6530 char *buf;
6531
6532 n = strcspn(v, " ");
6533 buf = strndupa(v, n);
6534
6535 r = safe_atoi(buf, &ipcns_fdpair[1]);
6536 if (r < 0)
6537 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6538 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6539 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6540 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6541 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6542 }
98b47d54 6543
e8a565cb 6544finalize:
a70581ff 6545 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6546 if (r < 0)
56a13a49
ZJS
6547 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6548 return 0;
e8a565cb 6549}
613b411c 6550
e8a565cb
YW
6551void exec_runtime_vacuum(Manager *m) {
6552 ExecRuntime *rt;
e8a565cb
YW
6553
6554 assert(m);
6555
6556 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6557
90e74a66 6558 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6559 if (rt->n_ref > 0)
6560 continue;
6561
6562 (void) exec_runtime_free(rt, false);
6563 }
613b411c
LP
6564}
6565
b9c04eaf
YW
6566void exec_params_clear(ExecParameters *p) {
6567 if (!p)
6568 return;
6569
c3f8a065
LP
6570 p->environment = strv_free(p->environment);
6571 p->fd_names = strv_free(p->fd_names);
6572 p->fds = mfree(p->fds);
6573 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6574}
6575
bb0c0d6f
LP
6576ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6577 if (!sc)
6578 return NULL;
6579
6580 free(sc->id);
6581 free(sc->data);
6582 return mfree(sc);
6583}
6584
43144be4
LP
6585ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6586 if (!lc)
6587 return NULL;
6588
6589 free(lc->id);
6590 free(lc->path);
6591 return mfree(lc);
6592}
6593
bb0c0d6f 6594DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 6595DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 6596
80876c20
LP
6597static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6598 [EXEC_INPUT_NULL] = "null",
6599 [EXEC_INPUT_TTY] = "tty",
6600 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6601 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6602 [EXEC_INPUT_SOCKET] = "socket",
6603 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6604 [EXEC_INPUT_DATA] = "data",
2038c3f5 6605 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6606};
6607
8a0867d6
LP
6608DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6609
94f04347 6610static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6611 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6612 [EXEC_OUTPUT_NULL] = "null",
80876c20 6613 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6614 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6615 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6616 [EXEC_OUTPUT_JOURNAL] = "journal",
6617 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6618 [EXEC_OUTPUT_SOCKET] = "socket",
6619 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6620 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6621 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6622 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6623};
6624
6625DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6626
6627static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6628 [EXEC_UTMP_INIT] = "init",
6629 [EXEC_UTMP_LOGIN] = "login",
6630 [EXEC_UTMP_USER] = "user",
6631};
6632
6633DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6634
6635static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6636 [EXEC_PRESERVE_NO] = "no",
6637 [EXEC_PRESERVE_YES] = "yes",
6638 [EXEC_PRESERVE_RESTART] = "restart",
6639};
6640
6641DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6642
6b7b2ed9 6643/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6644static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6645 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6646 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6647 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6648 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6649 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6650};
6651
6652DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6653
6b7b2ed9
LP
6654/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6655 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6656 * directories, specifically .timer units with their timestamp touch file. */
6657static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6658 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6659 [EXEC_DIRECTORY_STATE] = "state",
6660 [EXEC_DIRECTORY_CACHE] = "cache",
6661 [EXEC_DIRECTORY_LOGS] = "logs",
6662 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6663};
6664
6665DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6666
6667/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6668 * the service payload in. */
fb2042dd
YW
6669static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6670 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6671 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6672 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6673 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6674 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6675};
6676
6677DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6678
b1edf445
LP
6679static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6680 [EXEC_KEYRING_INHERIT] = "inherit",
6681 [EXEC_KEYRING_PRIVATE] = "private",
6682 [EXEC_KEYRING_SHARED] = "shared",
6683};
6684
6685DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);