]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #23621 from evverx/clang-release
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
b1994387 44#include "bpf-lsm.h"
8dd4c05b 45#include "cap-list.h"
430f0182 46#include "capability-util.h"
fdb3deca 47#include "cgroup-setup.h"
f4351959 48#include "chase-symlinks.h"
bb0c0d6f 49#include "chown-recursive.h"
da681e1b 50#include "cpu-set-util.h"
43144be4 51#include "creds-util.h"
6a818c3c 52#include "data-fd-util.h"
f6a6225e 53#include "def.h"
686d13b9 54#include "env-file.h"
4d1a6904 55#include "env-util.h"
17df7223 56#include "errno-list.h"
8a62620e 57#include "escape.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
bb0c0d6f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
032b3afb 66#include "ioprio-util.h"
a1164ae3 67#include "label.h"
8dd4c05b
LP
68#include "log.h"
69#include "macro.h"
e8a565cb 70#include "manager.h"
2a341bb9 71#include "manager-dump.h"
0a970718 72#include "memory-util.h"
f5947a5e 73#include "missing_fs.h"
5bead76e 74#include "missing_ioprio.h"
35cd0ba5 75#include "mkdir-label.h"
21935150 76#include "mount-util.h"
bb0c0d6f 77#include "mountpoint-util.h"
8dd4c05b 78#include "namespace.h"
6bedfcbb 79#include "parse-util.h"
8dd4c05b 80#include "path-util.h"
0b452006 81#include "process-util.h"
d3dcf4e3 82#include "random-util.h"
3989bdc1 83#include "recurse-dir.h"
78f22b97 84#include "rlimit-util.h"
8dd4c05b 85#include "rm-rf.h"
349cc4a5 86#if HAVE_SECCOMP
3ffd4af2
LP
87#include "seccomp-util.h"
88#endif
07d46372 89#include "securebits-util.h"
8dd4c05b 90#include "selinux-util.h"
24882e06 91#include "signal-util.h"
8dd4c05b 92#include "smack-util.h"
57b7a260 93#include "socket-util.h"
fd63e712 94#include "special.h"
949befd3 95#include "stat-util.h"
8b43440b 96#include "string-table.h"
07630cea 97#include "string-util.h"
8dd4c05b 98#include "strv.h"
7ccbd1ae 99#include "syslog-util.h"
8dd4c05b 100#include "terminal-util.h"
bb0c0d6f 101#include "tmpfile-util.h"
566b7d23 102#include "umask-util.h"
2d3b784d 103#include "unit-serialize.h"
b1d4f8e1 104#include "user-util.h"
8dd4c05b 105#include "utmp-wtmp.h"
5cb5a6ff 106
e056b01d 107#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 108#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 109
531dca78
LP
110#define SNDBUF_SIZE (8*1024*1024)
111
/* Relocates the passed file descriptors so that fds[i] ends up as fd number i+3 (i.e. right behind the
 * three standard descriptors). Returns 0 on success or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! (sorts it) */

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int dup_fd;

                        /* Nothing to do if this fd already sits at its final position */
                        if (fds[idx] == idx + 3)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, idx + 3);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we asked for was still taken? Then remember the first index where
                         * that happened, so that the next pass starts over from there. */
                        if (dup_fd != idx + 3 && retry_at < 0)
                                retry_at = idx;
                }

                /* A pass without any misplaced fd means we are done */
                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
151
/* Adjusts the descriptor flags of the passed fds: the first n_socket_fds entries get O_NONBLOCK set
 * according to 'nonblock', and FD_CLOEXEC is turned off on all n_socket_fds + n_storage_fds entries.
 * Returns 0 on success, or the negative error of the first helper call that fails. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {
                int r;

                /* O_NONBLOCK is only relevant for the socket activation fds, which come first */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is always dropped, after all these fds are supposed to be passed on to
                 * our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
1e22b5cd 185static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
186 assert(context);
187
1e22b5cd
LP
188 if (context->stdio_as_fds)
189 return NULL;
190
80876c20
LP
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195}
196
1e22b5cd
LP
197static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
198 const char *path;
199
6ea832a2
LP
200 assert(context);
201
1e22b5cd 202 path = exec_context_tty_path(context);
6ea832a2 203
1e22b5cd
LP
204 if (context->tty_vhangup) {
205 if (p && p->stdin_fd >= 0)
206 (void) terminal_vhangup_fd(p->stdin_fd);
207 else if (path)
208 (void) terminal_vhangup(path);
209 }
6ea832a2 210
1e22b5cd
LP
211 if (context->tty_reset) {
212 if (p && p->stdin_fd >= 0)
213 (void) reset_terminal_fd(p->stdin_fd, true);
214 else if (path)
215 (void) reset_terminal(path);
216 }
217
51462135
DDM
218 if (p && p->stdin_fd >= 0)
219 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
220
1e22b5cd
LP
221 if (context->tty_vt_disallocate && path)
222 (void) vt_disallocate(path);
6ea832a2
LP
223}
224
6af760f3
LP
225static bool is_terminal_input(ExecInput i) {
226 return IN_SET(i,
227 EXEC_INPUT_TTY,
228 EXEC_INPUT_TTY_FORCE,
229 EXEC_INPUT_TTY_FAIL);
230}
231
3a1286b6 232static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
233 return IN_SET(o,
234 EXEC_OUTPUT_TTY,
6af760f3
LP
235 EXEC_OUTPUT_KMSG_AND_CONSOLE,
236 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
237}
238
aac8c0c3
LP
239static bool is_kmsg_output(ExecOutput o) {
240 return IN_SET(o,
241 EXEC_OUTPUT_KMSG,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE);
243}
244
6af760f3
LP
245static bool exec_context_needs_term(const ExecContext *c) {
246 assert(c);
247
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
249
250 if (is_terminal_input(c->std_input))
251 return true;
252
253 if (is_terminal_output(c->std_output))
254 return true;
255
256 if (is_terminal_output(c->std_error))
257 return true;
258
259 return !!c->tty_path;
3a1286b6
MS
260}
261
/* Opens /dev/null with the specified access flags (plus O_NOCTTY) and hands the resulting fd to
 * move_fd() to place it at fd number 'nfd'. Returns a negative errno if the open() fails, otherwise
 * whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
273
91dd5f7c
LP
274static int connect_journal_socket(
275 int fd,
276 const char *log_namespace,
277 uid_t uid,
278 gid_t gid) {
279
524daa8c
ZJS
280 uid_t olduid = UID_INVALID;
281 gid_t oldgid = GID_INVALID;
91dd5f7c 282 const char *j;
524daa8c
ZJS
283 int r;
284
91dd5f7c
LP
285 j = log_namespace ?
286 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
287 "/run/systemd/journal/stdout";
91dd5f7c 288
cad93f29 289 if (gid_is_valid(gid)) {
524daa8c
ZJS
290 oldgid = getgid();
291
92a17af9 292 if (setegid(gid) < 0)
524daa8c
ZJS
293 return -errno;
294 }
295
cad93f29 296 if (uid_is_valid(uid)) {
524daa8c
ZJS
297 olduid = getuid();
298
92a17af9 299 if (seteuid(uid) < 0) {
524daa8c
ZJS
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
1861986a 305 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 306
1861986a
LP
307 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
308 an LSM interferes. */
524daa8c 309
cad93f29 310 if (uid_is_valid(uid))
524daa8c
ZJS
311 (void) seteuid(olduid);
312
313 restore_gid:
cad93f29 314 if (gid_is_valid(gid))
524daa8c
ZJS
315 (void) setegid(oldgid);
316
317 return r;
318}
319
fd1f9c89 320static int connect_logger_as(
34cf6c43 321 const Unit *unit,
fd1f9c89 322 const ExecContext *context,
af635cf3 323 const ExecParameters *params,
fd1f9c89
LP
324 ExecOutput output,
325 const char *ident,
fd1f9c89
LP
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
2ac1ff68
EV
330 _cleanup_close_ int fd = -1;
331 int r;
071830ff
LP
332
333 assert(context);
af635cf3 334 assert(params);
80876c20
LP
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
071830ff 338
54fe0cdb
LP
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
80876c20 341 return -errno;
071830ff 342
91dd5f7c 343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
344 if (r < 0)
345 return r;
071830ff 346
2ac1ff68 347 if (shutdown(fd, SHUT_RD) < 0)
80876c20 348 return -errno;
071830ff 349
fd1f9c89 350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 351
2ac1ff68 352 if (dprintf(fd,
62bca2c6 353 "%s\n"
80876c20
LP
354 "%s\n"
355 "%i\n"
54fe0cdb
LP
356 "%i\n"
357 "%i\n"
358 "%i\n"
4f4a1dbf 359 "%i\n",
c867611e 360 context->syslog_identifier ?: ident,
af635cf3 361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
f3dc6af2 364 false,
aac8c0c3 365 is_kmsg_output(output),
2ac1ff68
EV
366 is_terminal_output(output)) < 0)
367 return -errno;
80876c20 368
2ac1ff68 369 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 370}
2ac1ff68 371
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and hands the resulting fd to
 * move_fd() to place it at fd number 'nfd'. Errors of open_terminal() are passed through as-is. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
071830ff 384
2038c3f5 385static int acquire_path(const char *path, int flags, mode_t mode) {
15a3e96f 386 _cleanup_close_ int fd = -1;
86fca584 387 int r;
071830ff 388
80876c20 389 assert(path);
071830ff 390
2038c3f5
LP
391 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
392 flags |= O_CREAT;
393
394 fd = open(path, flags|O_NOCTTY, mode);
395 if (fd >= 0)
15a3e96f 396 return TAKE_FD(fd);
071830ff 397
2038c3f5
LP
398 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
399 return -errno;
2038c3f5
LP
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
403 fd = socket(AF_UNIX, SOCK_STREAM, 0);
404 if (fd < 0)
405 return -errno;
406
1861986a
LP
407 r = connect_unix_path(fd, AT_FDCWD, path);
408 if (IN_SET(r, -ENOTSOCK, -EINVAL))
409 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
410 * wasn't an AF_UNIX socket after all */
411 return -ENXIO;
412 if (r < 0)
413 return r;
071830ff 414
2038c3f5
LP
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
86fca584 420 r = 0;
15a3e96f 421 if (r < 0)
2038c3f5 422 return -errno;
2038c3f5 423
15a3e96f 424 return TAKE_FD(fd);
80876c20 425}
071830ff 426
08f3be7a
LP
427static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
1e3ad081
LP
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
071830ff 440
03fd9c49 441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
442 return EXEC_INPUT_NULL;
443
08f3be7a
LP
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
03fd9c49 447 return std_input;
4f2d528d
LP
448}
449
7966a916 450static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 451
7966a916 452 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
453 return EXEC_OUTPUT_INHERIT;
454
7966a916 455 return output;
4f2d528d
LP
456}
457
a34ceba6
LP
458static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
52c239d7 461 int socket_fd,
2caa38e9 462 const int named_iofds[static 3]) {
a34ceba6 463
4f2d528d 464 ExecInput i;
51462135 465 int r;
4f2d528d
LP
466
467 assert(context);
a34ceba6 468 assert(params);
2caa38e9 469 assert(named_iofds);
a34ceba6
LP
470
471 if (params->stdin_fd >= 0) {
472 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
473 return -errno;
474
475 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
476 if (isatty(STDIN_FILENO)) {
477 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
478 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 479 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 480 }
a34ceba6
LP
481
482 return STDIN_FILENO;
483 }
4f2d528d 484
08f3be7a 485 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
486
487 switch (i) {
071830ff 488
80876c20
LP
489 case EXEC_INPUT_NULL:
490 return open_null_as(O_RDONLY, STDIN_FILENO);
491
492 case EXEC_INPUT_TTY:
493 case EXEC_INPUT_TTY_FORCE:
494 case EXEC_INPUT_TTY_FAIL: {
046a82c1 495 int fd;
071830ff 496
1e22b5cd 497 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
498 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
499 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
500 ACQUIRE_TERMINAL_WAIT,
3a43da28 501 USEC_INFINITY);
970edce6 502 if (fd < 0)
80876c20
LP
503 return fd;
504
51462135
DDM
505 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
506 if (r < 0)
507 return r;
508
046a82c1 509 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
510 }
511
4f2d528d 512 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
513 assert(socket_fd >= 0);
514
7c248223 515 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 516
52c239d7 517 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
518 assert(named_iofds[STDIN_FILENO] >= 0);
519
52c239d7 520 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 521 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 522
08f3be7a
LP
523 case EXEC_INPUT_DATA: {
524 int fd;
525
526 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
527 if (fd < 0)
528 return fd;
529
530 return move_fd(fd, STDIN_FILENO, false);
531 }
532
2038c3f5
LP
533 case EXEC_INPUT_FILE: {
534 bool rw;
535 int fd;
536
537 assert(context->stdio_file[STDIN_FILENO]);
538
539 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
540 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
541
542 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
543 if (fd < 0)
544 return fd;
545
546 return move_fd(fd, STDIN_FILENO, false);
547 }
548
80876c20 549 default:
04499a70 550 assert_not_reached();
80876c20
LP
551 }
552}
553
41fc585a
LP
554static bool can_inherit_stderr_from_stdout(
555 const ExecContext *context,
556 ExecOutput o,
557 ExecOutput e) {
558
559 assert(context);
560
561 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
562 * stderr fd */
563
564 if (e == EXEC_OUTPUT_INHERIT)
565 return true;
566 if (e != o)
567 return false;
568
569 if (e == EXEC_OUTPUT_NAMED_FD)
570 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
571
8d7dab1f 572 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
573 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
574
575 return true;
576}
577
a34ceba6 578static int setup_output(
34cf6c43 579 const Unit *unit,
a34ceba6
LP
580 const ExecContext *context,
581 const ExecParameters *params,
582 int fileno,
583 int socket_fd,
2caa38e9 584 const int named_iofds[static 3],
a34ceba6 585 const char *ident,
7bce046b
LP
586 uid_t uid,
587 gid_t gid,
588 dev_t *journal_stream_dev,
589 ino_t *journal_stream_ino) {
a34ceba6 590
4f2d528d
LP
591 ExecOutput o;
592 ExecInput i;
47c1d80d 593 int r;
4f2d528d 594
f2341e0a 595 assert(unit);
80876c20 596 assert(context);
a34ceba6 597 assert(params);
80876c20 598 assert(ident);
7bce046b
LP
599 assert(journal_stream_dev);
600 assert(journal_stream_ino);
80876c20 601
a34ceba6
LP
602 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
603
604 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
605 return -errno;
606
607 return STDOUT_FILENO;
608 }
609
610 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
611 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
612 return -errno;
613
614 return STDERR_FILENO;
615 }
616
08f3be7a 617 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 618 o = fixup_output(context->std_output, socket_fd);
4f2d528d 619
eb17e935
MS
620 if (fileno == STDERR_FILENO) {
621 ExecOutput e;
622 e = fixup_output(context->std_error, socket_fd);
80876c20 623
eb17e935
MS
624 /* This expects the input and output are already set up */
625
626 /* Don't change the stderr file descriptor if we inherit all
627 * the way and are not on a tty */
628 if (e == EXEC_OUTPUT_INHERIT &&
629 o == EXEC_OUTPUT_INHERIT &&
630 i == EXEC_INPUT_NULL &&
631 !is_terminal_input(context->std_input) &&
7966a916 632 getppid() != 1)
eb17e935
MS
633 return fileno;
634
635 /* Duplicate from stdout if possible */
41fc585a 636 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 637 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 638
eb17e935 639 o = e;
80876c20 640
eb17e935 641 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
642 /* If input got downgraded, inherit the original value */
643 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 644 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 645
08f3be7a
LP
646 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
647 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 648 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 649
acb591e4
LP
650 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
651 if (getppid() != 1)
eb17e935 652 return fileno;
94f04347 653
eb17e935
MS
654 /* We need to open /dev/null here anew, to get the right access mode. */
655 return open_null_as(O_WRONLY, fileno);
071830ff 656 }
94f04347 657
eb17e935 658 switch (o) {
80876c20
LP
659
660 case EXEC_OUTPUT_NULL:
eb17e935 661 return open_null_as(O_WRONLY, fileno);
80876c20
LP
662
663 case EXEC_OUTPUT_TTY:
4f2d528d 664 if (is_terminal_input(i))
7c248223 665 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
666
667 /* We don't reset the terminal if this is just about output */
1e22b5cd 668 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 669
9a6bca7a 670 case EXEC_OUTPUT_KMSG:
28dbc1e8 671 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
672 case EXEC_OUTPUT_JOURNAL:
673 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 674 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 675 if (r < 0) {
7966a916
ZJS
676 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
677 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 678 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
679 } else {
680 struct stat st;
681
682 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
683 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
684 * services to detect whether they are connected to the journal or not.
685 *
686 * If both stdout and stderr are connected to a stream then let's make sure to store the data
687 * about STDERR as that's usually the best way to do logging. */
7bce046b 688
ab2116b1
LP
689 if (fstat(fileno, &st) >= 0 &&
690 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
691 *journal_stream_dev = st.st_dev;
692 *journal_stream_ino = st.st_ino;
693 }
47c1d80d
MS
694 }
695 return r;
4f2d528d
LP
696
697 case EXEC_OUTPUT_SOCKET:
698 assert(socket_fd >= 0);
e75a9ed1 699
7c248223 700 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 701
52c239d7 702 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
703 assert(named_iofds[fileno] >= 0);
704
52c239d7 705 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 706 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 707
566b7d23 708 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
709 case EXEC_OUTPUT_FILE_APPEND:
710 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 711 bool rw;
566b7d23 712 int fd, flags;
2038c3f5
LP
713
714 assert(context->stdio_file[fileno]);
715
716 rw = context->std_input == EXEC_INPUT_FILE &&
717 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
718
719 if (rw)
7c248223 720 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 721
566b7d23
ZD
722 flags = O_WRONLY;
723 if (o == EXEC_OUTPUT_FILE_APPEND)
724 flags |= O_APPEND;
8d7dab1f
LW
725 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
726 flags |= O_TRUNC;
566b7d23
ZD
727
728 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
729 if (fd < 0)
730 return fd;
731
566b7d23 732 return move_fd(fd, fileno, 0);
2038c3f5
LP
733 }
734
94f04347 735 default:
04499a70 736 assert_not_reached();
94f04347 737 }
071830ff
LP
738}
739
02a51aba 740static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 741 int r;
02a51aba
LP
742
743 assert(fd >= 0);
02a51aba 744
1ff74fb6 745 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
746 if (isatty(fd) < 1) {
747 if (IN_SET(errno, EINVAL, ENOTTY))
748 return 0; /* not a tty */
1ff74fb6 749
02a51aba 750 return -errno;
4b3b5bc7 751 }
02a51aba 752
4b3b5bc7 753 /* This might fail. What matters are the results. */
f2df231f 754 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
755 if (r < 0)
756 return r;
02a51aba 757
4b3b5bc7 758 return 1;
02a51aba
LP
759}
760
aedec452 761static int setup_confirm_stdio(
51462135 762 const ExecContext *context,
aedec452
LP
763 const char *vc,
764 int *ret_saved_stdin,
765 int *ret_saved_stdout) {
766
3d18b167
LP
767 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
768 int r;
80876c20 769
aedec452
LP
770 assert(ret_saved_stdin);
771 assert(ret_saved_stdout);
80876c20 772
af6da548
LP
773 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
774 if (saved_stdin < 0)
775 return -errno;
80876c20 776
af6da548 777 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
778 if (saved_stdout < 0)
779 return -errno;
80876c20 780
8854d795 781 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
782 if (fd < 0)
783 return fd;
80876c20 784
af6da548
LP
785 r = chown_terminal(fd, getuid());
786 if (r < 0)
3d18b167 787 return r;
02a51aba 788
3d18b167
LP
789 r = reset_terminal_fd(fd, true);
790 if (r < 0)
791 return r;
80876c20 792
51462135
DDM
793 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
794 if (r < 0)
795 return r;
796
aedec452
LP
797 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
798 TAKE_FD(fd);
2b33ab09
LP
799 if (r < 0)
800 return r;
80876c20 801
aedec452
LP
802 *ret_saved_stdin = TAKE_FD(saved_stdin);
803 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 804 return 0;
80876c20
LP
805}
806
63d77c92 807static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
808 assert(err < 0);
809
810 if (err == -ETIMEDOUT)
63d77c92 811 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
812 else {
813 errno = -err;
63d77c92 814 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
815 }
816}
817
63d77c92 818static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 819 _cleanup_close_ int fd = -1;
80876c20 820
3b20f877 821 assert(vc);
80876c20 822
7d5ceb64 823 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 824 if (fd < 0)
3b20f877 825 return;
80876c20 826
63d77c92 827 write_confirm_error_fd(err, fd, u);
af6da548 828}
80876c20 829
/* Releases the terminal and reinstalls the previously saved stdin/stdout fds (where valid). The saved
 * fds are closed and invalidated in any case. Returns 0 on success, or the negative errno of the last
 * dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
851
3b20f877
FB
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,  /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,  /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1,  /* execute the command */
};
857
51462135 858static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 859 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 860 _cleanup_free_ char *e = NULL;
3b20f877 861 char c;
af6da548 862
3b20f877 863 /* For any internal errors, assume a positive response. */
51462135 864 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 865 if (r < 0) {
63d77c92 866 write_confirm_error(r, vc, u);
3b20f877
FB
867 return CONFIRM_EXECUTE;
868 }
af6da548 869
b0eb2944
FB
870 /* confirm_spawn might have been disabled while we were sleeping. */
871 if (manager_is_confirm_spawn_disabled(u->manager)) {
872 r = 1;
873 goto restore_stdio;
874 }
af6da548 875
2bcd3c26
FB
876 e = ellipsize(cmdline, 60, 100);
877 if (!e) {
878 log_oom();
879 r = CONFIRM_EXECUTE;
880 goto restore_stdio;
881 }
af6da548 882
d172b175 883 for (;;) {
539622bd 884 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 885 if (r < 0) {
63d77c92 886 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
887 r = CONFIRM_EXECUTE;
888 goto restore_stdio;
889 }
af6da548 890
d172b175 891 switch (c) {
b0eb2944
FB
892 case 'c':
893 printf("Resuming normal execution.\n");
894 manager_disable_confirm_spawn();
895 r = 1;
896 break;
dd6f9ac0
FB
897 case 'D':
898 unit_dump(u, stdout, " ");
899 continue; /* ask again */
d172b175
FB
900 case 'f':
901 printf("Failing execution.\n");
902 r = CONFIRM_PRETEND_FAILURE;
903 break;
904 case 'h':
b0eb2944
FB
905 printf(" c - continue, proceed without asking anymore\n"
906 " D - dump, show the state of the unit\n"
dd6f9ac0 907 " f - fail, don't execute the command and pretend it failed\n"
d172b175 908 " h - help\n"
eedf223a 909 " i - info, show a short summary of the unit\n"
56fde33a 910 " j - jobs, show jobs that are in progress\n"
d172b175
FB
911 " s - skip, don't execute the command and pretend it succeeded\n"
912 " y - yes, execute the command\n");
dd6f9ac0 913 continue; /* ask again */
eedf223a
FB
914 case 'i':
915 printf(" Description: %s\n"
916 " Unit: %s\n"
917 " Command: %s\n",
918 u->id, u->description, cmdline);
919 continue; /* ask again */
56fde33a
FB
920 case 'j':
921 manager_dump_jobs(u->manager, stdout, " ");
922 continue; /* ask again */
539622bd
FB
923 case 'n':
924 /* 'n' was removed in favor of 'f'. */
925 printf("Didn't understand 'n', did you mean 'f'?\n");
926 continue; /* ask again */
d172b175
FB
927 case 's':
928 printf("Skipping execution.\n");
929 r = CONFIRM_PRETEND_SUCCESS;
930 break;
931 case 'y':
932 r = CONFIRM_EXECUTE;
933 break;
934 default:
04499a70 935 assert_not_reached();
d172b175 936 }
3b20f877 937 break;
3b20f877 938 }
af6da548 939
3b20f877 940restore_stdio:
af6da548 941 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 942 return r;
80876c20
LP
943}
944
4d885bd3
DH
945static int get_fixed_user(const ExecContext *c, const char **user,
946 uid_t *uid, gid_t *gid,
947 const char **home, const char **shell) {
81a2b7ce 948 int r;
4d885bd3 949 const char *name;
81a2b7ce 950
4d885bd3 951 assert(c);
81a2b7ce 952
23deef88
LP
953 if (!c->user)
954 return 0;
955
4d885bd3
DH
956 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
957 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 958
23deef88 959 name = c->user;
fafff8f1 960 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
961 if (r < 0)
962 return r;
81a2b7ce 963
4d885bd3
DH
964 *user = name;
965 return 0;
966}
967
968static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
969 int r;
970 const char *name;
971
972 assert(c);
973
974 if (!c->group)
975 return 0;
976
977 name = c->group;
fafff8f1 978 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
979 if (r < 0)
980 return r;
981
982 *group = name;
983 return 0;
984}
985
cdc5d5c5
DH
986static int get_supplementary_groups(const ExecContext *c, const char *user,
987 const char *group, gid_t gid,
988 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
989 int r, k = 0;
990 int ngroups_max;
991 bool keep_groups = false;
992 gid_t *groups = NULL;
993 _cleanup_free_ gid_t *l_gids = NULL;
994
995 assert(c);
996
bbeea271
DH
997 /*
998 * If user is given, then lookup GID and supplementary groups list.
999 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1000 * here and as early as possible so we keep the list of supplementary
1001 * groups of the caller.
bbeea271
DH
1002 */
1003 if (user && gid_is_valid(gid) && gid != 0) {
1004 /* First step, initialize groups from /etc/groups */
1005 if (initgroups(user, gid) < 0)
1006 return -errno;
1007
1008 keep_groups = true;
1009 }
1010
ac6e8be6 1011 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1012 return 0;
1013
366ddd25
DH
1014 /*
1015 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1016 * be positive, otherwise fail.
1017 */
1018 errno = 0;
1019 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1020 if (ngroups_max <= 0)
1021 return errno_or_else(EOPNOTSUPP);
366ddd25 1022
4d885bd3
DH
1023 l_gids = new(gid_t, ngroups_max);
1024 if (!l_gids)
1025 return -ENOMEM;
81a2b7ce 1026
4d885bd3
DH
1027 if (keep_groups) {
1028 /*
1029 * Lookup the list of groups that the user belongs to, we
1030 * avoid NSS lookups here too for gid=0.
1031 */
1032 k = ngroups_max;
1033 if (getgrouplist(user, gid, l_gids, &k) < 0)
1034 return -EINVAL;
1035 } else
1036 k = 0;
81a2b7ce 1037
4d885bd3
DH
1038 STRV_FOREACH(i, c->supplementary_groups) {
1039 const char *g;
81a2b7ce 1040
4d885bd3
DH
1041 if (k >= ngroups_max)
1042 return -E2BIG;
81a2b7ce 1043
4d885bd3 1044 g = *i;
fafff8f1 1045 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1046 if (r < 0)
1047 return r;
81a2b7ce 1048
4d885bd3
DH
1049 k++;
1050 }
81a2b7ce 1051
4d885bd3
DH
1052 /*
1053 * Sets ngids to zero to drop all supplementary groups, happens
1054 * when we are under root and SupplementaryGroups= is empty.
1055 */
1056 if (k == 0) {
1057 *ngids = 0;
1058 return 0;
1059 }
81a2b7ce 1060
4d885bd3
DH
1061 /* Otherwise get the final list of supplementary groups */
1062 groups = memdup(l_gids, sizeof(gid_t) * k);
1063 if (!groups)
1064 return -ENOMEM;
1065
1066 *supplementary_gids = groups;
1067 *ngids = k;
1068
1069 groups = NULL;
1070
1071 return 0;
1072}
1073
/* Applies the supplementary group list (if ngids > 0) and then, if gid is valid, sets the real,
 * effective and saved GID to it. Returns 0 on success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Only touch the supplementary groups when SupplementaryGroups= yielded something. */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Now switch our own GIDs. */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1092
dbdc4098
TK
/* Updates the process securebits: the bits in 'mask' are cleared and the bits in 'bits' are set.
 * Returns 0 if the securebits already had the requested value, 1 if they were changed, and a
 * negative errno on failure. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        after = (before & ~mask) | bits;
        if (after == before)
                return 0; /* nothing to do */

        if (prctl(PR_SET_SECUREBITS, after) < 0)
                return -errno;

        return 1;
}
1106
81a2b7ce 1107static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1108 assert(context);
dbdc4098 1109 int r;
81a2b7ce 1110
4d885bd3
DH
1111 if (!uid_is_valid(uid))
1112 return 0;
1113
479050b3 1114 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1115 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1116 * required, so we also need keep-caps in this case.
1117 */
81a2b7ce 1118
dbdc4098 1119 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1120
1121 /* First step: If we need to keep capabilities but
1122 * drop privileges we need to make sure we keep our
cbb21cca 1123 * caps, while we drop privileges. */
693ced48 1124 if (uid != 0) {
dbdc4098
TK
1125 /* Add KEEP_CAPS to the securebits */
1126 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1127 if (r < 0)
1128 return r;
693ced48 1129 }
81a2b7ce
LP
1130 }
1131
479050b3 1132 /* Second step: actually set the uids */
81a2b7ce
LP
1133 if (setresuid(uid, uid, uid) < 0)
1134 return -errno;
1135
1136 /* At this point we should have all necessary capabilities but
1137 are otherwise a normal user. However, the caps might got
1138 corrupted due to the setresuid() so we need clean them up
1139 later. This is done outside of this call. */
1140
1141 return 0;
1142}
1143
#if HAVE_PAM

/* PAM conversation callback for non-interactive use: conversations are not supported, so all
 * arguments are ignored and PAM_CONV_ERR is returned unconditionally. */
static int null_conv(
                int num_msg,
                const struct pam_message **msg,
                struct pam_response **resp,
                void *appdata_ptr) {

        return PAM_CONV_ERR;
}

#endif
1158
5b6319dc
LP
/* Opens a PAM session for 'user' under PAM service 'name' and forks off a helper process ("(sd-pam)")
 * that tears the session down again later.
 *
 * On success *env is replaced by the environment list reported by PAM and 0 is returned. On failure a
 * negative errno-style value is returned (-EPERM for PAM-level errors). If built without PAM support
 * this is a no-op that returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **pam_env = NULL;
        pam_handle_t *handle = NULL;
        sigset_t saved_mask;
        int pam_code = PAM_SUCCESS, r;
        bool session_open = false;
        pid_t helper_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* PAM is set up in the parent process, which then forks. The child stays around until it is
         * killed via PR_SET_PDEATHSIG or by systemd through the cgroup logic, and then removes the
         * PAM session again. The parent goes on to exec() the actual daemon, so that the daemon's
         * main PID is the one that was initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *stdin_tty = NULL;

                /* No TTY was passed explicitly, but stdin might be one. If so, use its name. */

                if (getttyname_malloc(STDIN_FILENO, &stdin_tty) >= 0)
                        tty = strjoina("/dev/", stdin_tty);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Pass the current environment on to the PAM modules. */
        STRV_FOREACH(assignment, *env) {
                pam_code = pam_putenv(handle, *assignment);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_open = true;

        pam_env = pam_getenvlist(handle);
        if (!pam_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM before forking, so that it cannot get lost in the child. */

        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &helper_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* Child: its only purpose is to reset the PAM session on termination. */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Don't keep the passed fds open here. The assumption is that the only other fds
                 * open at this point were opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges: none are needed for pam_close_session(), and this makes
                 * PR_SET_PDEATHSIG work in most cases. Failures are only logged, but then sd-pam
                 * should be expected not to exit normally. */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Ask for SIGTERM when the parent dies. This only works if the setresuid() above
                 * succeeded, as the kernel won't let unprivileged parents kill privileged children
                 * this way. The cgroup kill logic takes care of the rest. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent our setup is complete, in particular the privilege dropping;
                 * otherwise unit setup could race against our setresuid(2) call.
                 *
                 * A parent that aborted is detected below, hence the result is ignored here. */
                (void) barrier_place(&barrier);

                /* Only wait for SIGTERM if the parent is still around. */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        while (sigwait(&ss, &sig) < 0)
                                if (errno != EINTR)
                                        goto child_finish;

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* End the session if our parent is gone. */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: in child processes pam_end() should be called with PAM_DATA_SILENT so that
                 * modules know about this. See pam_end(3). */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* The child was forked off successfully and owns all cleanup now, so drop our reference to
         * the handle. */
        handle = NULL;

        /* Restore the original signal mask in the parent, unblocking SIGTERM again. */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* PAM modules might have opened the log; close it explicitly as we don't want that fd
         * around. */
        closelog();

        /* Wait synchronously until the child is initialized. Errors cannot be recovered from, but
         * are reported loudly. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (session_open)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1375
5d6b1584
LP
/* Renames the current process to "(<name>)", where <name> is the last (up to) 8 characters of the
 * basename of 'path'. If the basename is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        enum { TAIL_MAX = 8 };
        char buf[1 + TAIL_MAX + 1 + 1]; /* '(' + tail + ')' + NUL */
        const char *base;
        size_t n;

        /* The result has to fit in 10 characters (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        n = strlen(base);
        if (n > TAIL_MAX) {
                /* Keep the end of the name: it is usually the more interesting part, since the
                 * beginning might just be "systemd-". */
                base += n - TAIL_MAX;
                n = TAIL_MAX;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1406
469830d1
LP
1407static bool context_has_address_families(const ExecContext *c) {
1408 assert(c);
1409
6b000af4 1410 return c->address_families_allow_list ||
469830d1
LP
1411 !set_isempty(c->address_families);
1412}
1413
1414static bool context_has_syscall_filters(const ExecContext *c) {
1415 assert(c);
1416
6b000af4 1417 return c->syscall_allow_list ||
8cfa775f 1418 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1419}
1420
9df2cdd8
TM
1421static bool context_has_syscall_logs(const ExecContext *c) {
1422 assert(c);
1423
1424 return c->syscall_log_allow_list ||
1425 !hashmap_isempty(c->syscall_log);
1426}
1427
469830d1
LP
1428static bool context_has_no_new_privileges(const ExecContext *c) {
1429 assert(c);
1430
1431 if (c->no_new_privileges)
1432 return true;
1433
1434 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435 return false;
1436
1437 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1438 return c->lock_personality ||
469830d1 1439 c->memory_deny_write_execute ||
0538d2a8 1440 c->private_devices ||
fc64760d 1441 c->protect_clock ||
0538d2a8 1442 c->protect_hostname ||
469830d1
LP
1443 c->protect_kernel_tunables ||
1444 c->protect_kernel_modules ||
84703040 1445 c->protect_kernel_logs ||
0538d2a8
YW
1446 context_has_address_families(c) ||
1447 exec_context_restrict_namespaces_set(c) ||
1448 c->restrict_realtime ||
1449 c->restrict_suid_sgid ||
78e864e5 1450 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1451 context_has_syscall_filters(c) ||
1452 context_has_syscall_logs(c);
469830d1
LP
1453}
1454
bb0c0d6f
LP
1455static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457 assert(context);
1458
1459 return !hashmap_isempty(context->set_credentials) ||
43144be4 1460 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1461}
1462
349cc4a5 1463#if HAVE_SECCOMP
17df7223 1464
83f12b27 1465static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1466
1467 if (is_seccomp_available())
1468 return false;
1469
f673b62d 1470 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1471 return true;
83f12b27
FS
1472}
1473
165a31c0 1474static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1475 uint32_t negative_action, default_action, action;
165a31c0 1476 int r;
8351ceae 1477
469830d1 1478 assert(u);
c0467cf3 1479 assert(c);
8351ceae 1480
469830d1 1481 if (!context_has_syscall_filters(c))
83f12b27
FS
1482 return 0;
1483
469830d1
LP
1484 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485 return 0;
e9642be2 1486
005bfaf1 1487 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1488
6b000af4 1489 if (c->syscall_allow_list) {
469830d1
LP
1490 default_action = negative_action;
1491 action = SCMP_ACT_ALLOW;
7c66bae2 1492 } else {
469830d1
LP
1493 default_action = SCMP_ACT_ALLOW;
1494 action = negative_action;
57183d11 1495 }
8351ceae 1496
165a31c0 1497 if (needs_ambient_hack) {
6b000af4 1498 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1499 if (r < 0)
1500 return r;
1501 }
1502
b54f36c6 1503 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1504}
1505
9df2cdd8
TM
1506static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507#ifdef SCMP_ACT_LOG
1508 uint32_t default_action, action;
1509#endif
1510
1511 assert(u);
1512 assert(c);
1513
1514 if (!context_has_syscall_logs(c))
1515 return 0;
1516
1517#ifdef SCMP_ACT_LOG
1518 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519 return 0;
1520
1521 if (c->syscall_log_allow_list) {
1522 /* Log nothing but the ones listed */
1523 default_action = SCMP_ACT_ALLOW;
1524 action = SCMP_ACT_LOG;
1525 } else {
1526 /* Log everything but the ones listed */
1527 default_action = SCMP_ACT_LOG;
1528 action = SCMP_ACT_ALLOW;
1529 }
1530
1531 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532#else
1533 /* old libseccomp */
1534 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535 return 0;
1536#endif
1537}
1538
469830d1
LP
1539static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540 assert(u);
4298d0b5
LP
1541 assert(c);
1542
469830d1 1543 if (set_isempty(c->syscall_archs))
83f12b27
FS
1544 return 0;
1545
469830d1
LP
1546 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547 return 0;
4298d0b5 1548
469830d1
LP
1549 return seccomp_restrict_archs(c->syscall_archs);
1550}
4298d0b5 1551
469830d1
LP
1552static int apply_address_families(const Unit* u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
4298d0b5 1555
469830d1
LP
1556 if (!context_has_address_families(c))
1557 return 0;
4298d0b5 1558
469830d1
LP
1559 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560 return 0;
4298d0b5 1561
6b000af4 1562 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1563}
4298d0b5 1564
83f12b27 1565static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1566 assert(u);
f3e43635
TM
1567 assert(c);
1568
469830d1 1569 if (!c->memory_deny_write_execute)
83f12b27
FS
1570 return 0;
1571
469830d1
LP
1572 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573 return 0;
f3e43635 1574
469830d1 1575 return seccomp_memory_deny_write_execute();
f3e43635
TM
1576}
1577
83f12b27 1578static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1579 assert(u);
f4170c67
LP
1580 assert(c);
1581
469830d1 1582 if (!c->restrict_realtime)
83f12b27
FS
1583 return 0;
1584
469830d1
LP
1585 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586 return 0;
f4170c67 1587
469830d1 1588 return seccomp_restrict_realtime();
f4170c67
LP
1589}
1590
f69567cb
LP
1591static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592 assert(u);
1593 assert(c);
1594
1595 if (!c->restrict_suid_sgid)
1596 return 0;
1597
1598 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599 return 0;
1600
1601 return seccomp_restrict_suid_sgid();
1602}
1603
59e856c7 1604static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1605 assert(u);
59eeb84b
LP
1606 assert(c);
1607
1608 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609 * let's protect even those systems where this is left on in the kernel. */
1610
469830d1 1611 if (!c->protect_kernel_tunables)
59eeb84b
LP
1612 return 0;
1613
469830d1
LP
1614 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615 return 0;
59eeb84b 1616
469830d1 1617 return seccomp_protect_sysctl();
59eeb84b
LP
1618}
1619
59e856c7 1620static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1621 assert(u);
502d704e
DH
1622 assert(c);
1623
25a8d8a0 1624 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1625
469830d1
LP
1626 if (!c->protect_kernel_modules)
1627 return 0;
1628
502d704e
DH
1629 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630 return 0;
1631
b54f36c6 1632 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1633}
1634
84703040
KK
1635static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636 assert(u);
1637 assert(c);
1638
1639 if (!c->protect_kernel_logs)
1640 return 0;
1641
1642 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643 return 0;
1644
1645 return seccomp_protect_syslog();
1646}
1647
daf8f72b 1648static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1649 assert(u);
1650 assert(c);
1651
1652 if (!c->protect_clock)
1653 return 0;
1654
1655 if (skip_seccomp_unavailable(u, "ProtectClock="))
1656 return 0;
1657
1658 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659}
1660
59e856c7 1661static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1662 assert(u);
ba128bb8
LP
1663 assert(c);
1664
8f81a5f6 1665 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1666
469830d1
LP
1667 if (!c->private_devices)
1668 return 0;
1669
ba128bb8
LP
1670 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671 return 0;
1672
b54f36c6 1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1674}
1675
34cf6c43 1676static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1677 assert(u);
add00535
LP
1678 assert(c);
1679
1680 if (!exec_context_restrict_namespaces_set(c))
1681 return 0;
1682
1683 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684 return 0;
1685
1686 return seccomp_restrict_namespaces(c->restrict_namespaces);
1687}
1688
78e864e5 1689static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1690 unsigned long personality;
1691 int r;
78e864e5
TM
1692
1693 assert(u);
1694 assert(c);
1695
1696 if (!c->lock_personality)
1697 return 0;
1698
1699 if (skip_seccomp_unavailable(u, "LockPersonality="))
1700 return 0;
1701
e8132d63
LP
1702 personality = c->personality;
1703
1704 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1705 if (personality == PERSONALITY_INVALID) {
1706
1707 r = opinionated_personality(&personality);
1708 if (r < 0)
1709 return r;
1710 }
78e864e5
TM
1711
1712 return seccomp_lock_personality(personality);
1713}
1714
c0467cf3 1715#endif
8351ceae 1716
#if HAVE_LIBBPF
/* Applies RestrictFileSystems=. Returns 0 if the setting is not set or the manager has no
 * restrict_fs object, otherwise the result of lsm_bpf_unit_restrict_filesystems(). */
static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
        assert(u);
        assert(c);

        if (!exec_context_restrict_filesystems_set(c))
                return 0;

        if (u->manager->restrict_fs)
                return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);

        /* Either LSM BPF is unsupported or lsm_bpf_setup() failed */
        log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
        return 0;
}
#endif
1734
daf8f72b 1735static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1736 assert(u);
1737 assert(c);
1738
1739 if (!c->protect_hostname)
1740 return 0;
1741
1742 if (ns_type_supported(NAMESPACE_UTS)) {
1743 if (unshare(CLONE_NEWUTS) < 0) {
1744 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1745 *ret_exit_status = EXIT_NAMESPACE;
1746 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1747 }
1748
1749 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1750 }
1751 } else
1752 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1753
1754#if HAVE_SECCOMP
8f3e342f
ZJS
1755 int r;
1756
daf8f72b
LP
1757 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1758 return 0;
1759
1760 r = seccomp_protect_hostname();
1761 if (r < 0) {
1762 *ret_exit_status = EXIT_SECCOMP;
1763 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1764 }
1765#endif
1766
1767 return 0;
1768}
1769
3042bbeb 1770static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1771 assert(idle_pipe);
1772
54eb2300
LP
1773 idle_pipe[1] = safe_close(idle_pipe[1]);
1774 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1775
1776 if (idle_pipe[0] >= 0) {
1777 int r;
1778
1779 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1780
1781 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1782 ssize_t n;
1783
31a7eb86 1784 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1785 n = write(idle_pipe[3], "x", 1);
1786 if (n > 0)
cd972d69 1787 /* Wait for systemd to react to the signal above. */
54756dce 1788 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1789 }
1790
54eb2300 1791 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1792
1793 }
1794
54eb2300 1795 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1796}
1797
fb2042dd
YW
1798static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1799
7cae38c4 1800static int build_environment(
34cf6c43 1801 const Unit *u,
9fa95f85 1802 const ExecContext *c,
1e22b5cd 1803 const ExecParameters *p,
da6053d0 1804 size_t n_fds,
7cae38c4
LP
1805 const char *home,
1806 const char *username,
1807 const char *shell,
7bce046b
LP
1808 dev_t journal_stream_dev,
1809 ino_t journal_stream_ino,
7cae38c4
LP
1810 char ***ret) {
1811
1812 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1813 size_t n_env = 0;
7cae38c4
LP
1814 char *x;
1815
4b58153d 1816 assert(u);
7cae38c4 1817 assert(c);
7c1cb6f1 1818 assert(p);
7cae38c4
LP
1819 assert(ret);
1820
dc4e2940 1821#define N_ENV_VARS 17
8d5bb13d 1822 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1823 if (!our_env)
1824 return -ENOMEM;
1825
1826 if (n_fds > 0) {
8dd4c05b
LP
1827 _cleanup_free_ char *joined = NULL;
1828
df0ff127 1829 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1830 return -ENOMEM;
1831 our_env[n_env++] = x;
1832
da6053d0 1833 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1834 return -ENOMEM;
1835 our_env[n_env++] = x;
8dd4c05b 1836
1e22b5cd 1837 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1838 if (!joined)
1839 return -ENOMEM;
1840
605405c6 1841 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1842 if (!x)
1843 return -ENOMEM;
1844 our_env[n_env++] = x;
7cae38c4
LP
1845 }
1846
b08af3b1 1847 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1848 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1849 return -ENOMEM;
1850 our_env[n_env++] = x;
1851
1e22b5cd 1852 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1853 return -ENOMEM;
1854 our_env[n_env++] = x;
1855 }
1856
de90700f
LP
1857 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1858 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1859 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1860 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1861 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1862 if (!x)
1863 return -ENOMEM;
1864 our_env[n_env++] = x;
1865 }
1866
7cae38c4 1867 if (home) {
b910cc72 1868 x = strjoin("HOME=", home);
7cae38c4
LP
1869 if (!x)
1870 return -ENOMEM;
7bbead1d 1871
4ff361cc 1872 path_simplify(x + 5);
7cae38c4
LP
1873 our_env[n_env++] = x;
1874 }
1875
1876 if (username) {
b910cc72 1877 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1878 if (!x)
1879 return -ENOMEM;
1880 our_env[n_env++] = x;
1881
b910cc72 1882 x = strjoin("USER=", username);
7cae38c4
LP
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886 }
1887
1888 if (shell) {
b910cc72 1889 x = strjoin("SHELL=", shell);
7cae38c4
LP
1890 if (!x)
1891 return -ENOMEM;
7bbead1d 1892
4ff361cc 1893 path_simplify(x + 6);
7cae38c4
LP
1894 our_env[n_env++] = x;
1895 }
1896
4b58153d
LP
1897 if (!sd_id128_is_null(u->invocation_id)) {
1898 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1899 return -ENOMEM;
1900
1901 our_env[n_env++] = x;
1902 }
1903
6af760f3
LP
1904 if (exec_context_needs_term(c)) {
1905 const char *tty_path, *term = NULL;
1906
1907 tty_path = exec_context_tty_path(c);
1908
e8cf09b2
LP
1909 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1910 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1911 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1912
e8cf09b2 1913 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1914 term = getenv("TERM");
e8cf09b2 1915
6af760f3
LP
1916 if (!term)
1917 term = default_term_for_tty(tty_path);
7cae38c4 1918
b910cc72 1919 x = strjoin("TERM=", term);
7cae38c4
LP
1920 if (!x)
1921 return -ENOMEM;
1922 our_env[n_env++] = x;
1923 }
1924
7bce046b
LP
1925 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1926 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1927 return -ENOMEM;
1928
1929 our_env[n_env++] = x;
1930 }
1931
91dd5f7c
LP
1932 if (c->log_namespace) {
1933 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1934 if (!x)
1935 return -ENOMEM;
1936
1937 our_env[n_env++] = x;
1938 }
1939
5b10116e 1940 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1941 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1942 const char *n;
1943
1944 if (!p->prefix[t])
1945 continue;
1946
211a3d87 1947 if (c->directories[t].n_items == 0)
fb2042dd
YW
1948 continue;
1949
1950 n = exec_directory_env_name_to_string(t);
1951 if (!n)
1952 continue;
1953
211a3d87
LB
1954 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1955 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1956
211a3d87
LB
1957 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1958 if (!prefixed)
1959 return -ENOMEM;
1960
1961 if (!strextend_with_separator(&joined, ":", prefixed))
1962 return -ENOMEM;
1963 }
fb2042dd
YW
1964
1965 x = strjoin(n, "=", joined);
1966 if (!x)
1967 return -ENOMEM;
1968
1969 our_env[n_env++] = x;
1970 }
1971
bb0c0d6f
LP
1972 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1973 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1974 if (!x)
1975 return -ENOMEM;
1976
1977 our_env[n_env++] = x;
1978 }
1979
dc4e2940
YW
1980 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1981 return -ENOMEM;
1982
1983 our_env[n_env++] = x;
1984
7cae38c4 1985 our_env[n_env++] = NULL;
8d5bb13d
LP
1986 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1987#undef N_ENV_VARS
7cae38c4 1988
ae2a15bc 1989 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1990
1991 return 0;
1992}
1993
b4c14404
FB
1994static int build_pass_environment(const ExecContext *c, char ***ret) {
1995 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1996 size_t n_env = 0;
b4c14404
FB
1997
1998 STRV_FOREACH(i, c->pass_environment) {
1999 _cleanup_free_ char *x = NULL;
2000 char *v;
2001
2002 v = getenv(*i);
2003 if (!v)
2004 continue;
605405c6 2005 x = strjoin(*i, "=", v);
b4c14404
FB
2006 if (!x)
2007 return -ENOMEM;
00819cc1 2008
319a4f4b 2009 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2010 return -ENOMEM;
00819cc1 2011
1cc6c93a 2012 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2013 pass_env[n_env] = NULL;
b4c14404
FB
2014 }
2015
ae2a15bc 2016 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2017
2018 return 0;
2019}
2020
5e8deb94 2021bool exec_needs_mount_namespace(
8b44a3d2
LP
2022 const ExecContext *context,
2023 const ExecParameters *params,
4657abb5 2024 const ExecRuntime *runtime) {
8b44a3d2
LP
2025
2026 assert(context);
8b44a3d2 2027
915e6d16
LP
2028 if (context->root_image)
2029 return true;
2030
2a624c36
AP
2031 if (!strv_isempty(context->read_write_paths) ||
2032 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2033 !strv_isempty(context->inaccessible_paths) ||
2034 !strv_isempty(context->exec_paths) ||
2035 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2036 return true;
2037
42b1d8e0 2038 if (context->n_bind_mounts > 0)
d2d6c096
LP
2039 return true;
2040
2abd4e38
YW
2041 if (context->n_temporary_filesystems > 0)
2042 return true;
2043
b3d13314
LB
2044 if (context->n_mount_images > 0)
2045 return true;
2046
93f59701
LB
2047 if (context->n_extension_images > 0)
2048 return true;
2049
a07b9926
LB
2050 if (!strv_isempty(context->extension_directories))
2051 return true;
2052
37ed15d7 2053 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2054 return true;
2055
2056 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2057 return true;
2058
8b44a3d2 2059 if (context->private_devices ||
228af36f 2060 context->private_mounts ||
8b44a3d2 2061 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2062 context->protect_home != PROTECT_HOME_NO ||
2063 context->protect_kernel_tunables ||
c575770b 2064 context->protect_kernel_modules ||
94a7b275 2065 context->protect_kernel_logs ||
4e399953
LP
2066 context->protect_control_groups ||
2067 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2068 context->proc_subset != PROC_SUBSET_ALL ||
2069 context->private_ipc ||
2070 context->ipc_namespace_path)
8b44a3d2
LP
2071 return true;
2072
37c56f89 2073 if (context->root_directory) {
5e98086d 2074 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2075 return true;
2076
5b10116e 2077 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2078 if (params && !params->prefix[t])
37c56f89
YW
2079 continue;
2080
211a3d87 2081 if (context->directories[t].n_items > 0)
37c56f89
YW
2082 return true;
2083 }
2084 }
5d997827 2085
42b1d8e0 2086 if (context->dynamic_user &&
211a3d87
LB
2087 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2088 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2089 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2090 return true;
2091
91dd5f7c
LP
2092 if (context->log_namespace)
2093 return true;
2094
8b44a3d2
LP
2095 return false;
2096}
2097
5749f855 2098static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2099 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2100 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2101 _cleanup_close_ int unshare_ready_fd = -1;
2102 _cleanup_(sigkill_waitp) pid_t pid = 0;
2103 uint64_t c = 1;
d251207d
LP
2104 ssize_t n;
2105 int r;
2106
5749f855
AZ
2107 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2108 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2109 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2110 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2111 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2112 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2113 * continues execution normally.
2114 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2115 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2116
5749f855
AZ
2117 /* Can only set up multiple mappings with CAP_SETUID. */
2118 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2119 r = asprintf(&uid_map,
5749f855 2120 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2121 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2122 ouid, ouid, uid, uid);
2123 else
2124 r = asprintf(&uid_map,
2125 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2126 ouid, ouid);
d251207d 2127
5749f855
AZ
2128 if (r < 0)
2129 return -ENOMEM;
2130
2131 /* Can only set up multiple mappings with CAP_SETGID. */
2132 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2133 r = asprintf(&gid_map,
5749f855 2134 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2135 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2136 ogid, ogid, gid, gid);
2137 else
2138 r = asprintf(&gid_map,
2139 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2140 ogid, ogid);
2141
2142 if (r < 0)
2143 return -ENOMEM;
d251207d
LP
2144
2145 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2146 * namespace. */
2147 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2148 if (unshare_ready_fd < 0)
2149 return -errno;
2150
2151 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2152 * failed. */
2153 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2154 return -errno;
2155
4c253ed1
LP
2156 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2157 if (r < 0)
2158 return r;
2159 if (r == 0) {
d251207d
LP
2160 _cleanup_close_ int fd = -1;
2161 const char *a;
2162 pid_t ppid;
2163
2164 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2165 * here, after the parent opened its own user namespace. */
2166
2167 ppid = getppid();
2168 errno_pipe[0] = safe_close(errno_pipe[0]);
2169
2170 /* Wait until the parent unshared the user namespace */
2171 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2172 r = -errno;
2173 goto child_fail;
2174 }
2175
2176 /* Disable the setgroups() system call in the child user namespace, for good. */
2177 a = procfs_file_alloca(ppid, "setgroups");
2178 fd = open(a, O_WRONLY|O_CLOEXEC);
2179 if (fd < 0) {
2180 if (errno != ENOENT) {
2181 r = -errno;
2182 goto child_fail;
2183 }
2184
2185 /* If the file is missing the kernel is too old, let's continue anyway. */
2186 } else {
2187 if (write(fd, "deny\n", 5) < 0) {
2188 r = -errno;
2189 goto child_fail;
2190 }
2191
2192 fd = safe_close(fd);
2193 }
2194
2195 /* First write the GID map */
2196 a = procfs_file_alloca(ppid, "gid_map");
2197 fd = open(a, O_WRONLY|O_CLOEXEC);
2198 if (fd < 0) {
2199 r = -errno;
2200 goto child_fail;
2201 }
2202 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2203 r = -errno;
2204 goto child_fail;
2205 }
2206 fd = safe_close(fd);
2207
2208 /* The write the UID map */
2209 a = procfs_file_alloca(ppid, "uid_map");
2210 fd = open(a, O_WRONLY|O_CLOEXEC);
2211 if (fd < 0) {
2212 r = -errno;
2213 goto child_fail;
2214 }
2215 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2216 r = -errno;
2217 goto child_fail;
2218 }
2219
2220 _exit(EXIT_SUCCESS);
2221
2222 child_fail:
2223 (void) write(errno_pipe[1], &r, sizeof(r));
2224 _exit(EXIT_FAILURE);
2225 }
2226
2227 errno_pipe[1] = safe_close(errno_pipe[1]);
2228
2229 if (unshare(CLONE_NEWUSER) < 0)
2230 return -errno;
2231
2232 /* Let the child know that the namespace is ready now */
2233 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2234 return -errno;
2235
2236 /* Try to read an error code from the child */
2237 n = read(errno_pipe[0], &r, sizeof(r));
2238 if (n < 0)
2239 return -errno;
2240 if (n == sizeof(r)) { /* an error code was sent to us */
2241 if (r < 0)
2242 return r;
2243 return -EIO;
2244 }
2245 if (n != 0) /* on success we should have read 0 bytes */
2246 return -EIO;
2247
8f03de53 2248 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2249 if (r < 0)
2250 return r;
2e87a1fd 2251 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2252 return -EIO;
2253
2254 return 0;
2255}
2256
494d0247
YW
2257static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2258 if (!context->dynamic_user)
2259 return false;
2260
2261 if (type == EXEC_DIRECTORY_CONFIGURATION)
2262 return false;
2263
2264 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2265 return false;
2266
2267 return true;
2268}
2269
211a3d87
LB
2270static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2271 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2272 int r;
2273
2274 assert(source);
2275
2276 src_abs = path_join(root, source);
2277 if (!src_abs)
2278 return -ENOMEM;
2279
2280 STRV_FOREACH(dst, symlinks) {
2281 _cleanup_free_ char *dst_abs = NULL;
2282
2283 dst_abs = path_join(root, *dst);
2284 if (!dst_abs)
2285 return -ENOMEM;
2286
2287 r = mkdir_parents_label(dst_abs, 0755);
2288 if (r < 0)
2289 return r;
2290
2291 r = symlink_idempotent(src_abs, dst_abs, true);
2292 if (r < 0)
2293 return r;
2294 }
2295
2296 return 0;
2297}
2298
3536f49e 2299static int setup_exec_directory(
07689d5d
LP
2300 const ExecContext *context,
2301 const ExecParameters *params,
2302 uid_t uid,
3536f49e 2303 gid_t gid,
3536f49e 2304 ExecDirectoryType type,
211a3d87 2305 bool needs_mount_namespace,
3536f49e 2306 int *exit_status) {
07689d5d 2307
72fd1768 2308 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2309 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2310 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2311 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2312 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2313 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2314 };
07689d5d
LP
2315 int r;
2316
2317 assert(context);
2318 assert(params);
72fd1768 2319 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2320 assert(exit_status);
07689d5d 2321
3536f49e
YW
2322 if (!params->prefix[type])
2323 return 0;
2324
8679efde 2325 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2326 if (!uid_is_valid(uid))
2327 uid = 0;
2328 if (!gid_is_valid(gid))
2329 gid = 0;
2330 }
2331
211a3d87 2332 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2333 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2334
211a3d87 2335 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2336 if (!p) {
2337 r = -ENOMEM;
2338 goto fail;
2339 }
07689d5d 2340
23a7448e
YW
2341 r = mkdir_parents_label(p, 0755);
2342 if (r < 0)
3536f49e 2343 goto fail;
23a7448e 2344
494d0247 2345 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2346 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2347 * case we want to avoid leaving a directory around fully accessible that is owned by
2348 * a dynamic user whose UID is later on reused. To lock this down we use the same
2349 * trick used by container managers to prohibit host users to get access to files of
2350 * the same UID in containers: we place everything inside a directory that has an
2351 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2352 * for unprivileged host code. We then use fs namespacing to make this directory
2353 * permeable for the service itself.
6c47cd7d 2354 *
3f5b1508
LP
2355 * Specifically: for a service which wants a special directory "foo/" we first create
2356 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2357 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2358 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2359 * unprivileged host users can't look into it. Inside of the namespace of the unit
2360 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2361 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2362 * for the service and making sure it only gets access to the dirs it needs but no
2363 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2364 *
3f5b1508
LP
2365 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2366 * to be owned by the service itself.
2367 *
2368 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2369 * for sharing files or sockets with other services. */
6c47cd7d 2370
4ede9802
LP
2371 pp = path_join(params->prefix[type], "private");
2372 if (!pp) {
6c47cd7d
LP
2373 r = -ENOMEM;
2374 goto fail;
2375 }
2376
2377 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2378 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2379 if (r < 0)
2380 goto fail;
2381
211a3d87 2382 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2383 r = -ENOMEM;
2384 goto fail;
2385 }
2386
2387 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2388 r = mkdir_parents_label(pp, 0755);
2389 if (r < 0)
2390 goto fail;
2391
949befd3
LP
2392 if (is_dir(p, false) > 0 &&
2393 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2394
2395 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2396 * it over. Most likely the service has been upgraded from one that didn't use
2397 * DynamicUser=1, to one that does. */
2398
cf52c45d
LP
2399 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2400 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2401 exec_directory_type_to_string(type), p, pp);
2402
949befd3
LP
2403 if (rename(p, pp) < 0) {
2404 r = -errno;
2405 goto fail;
2406 }
2407 } else {
2408 /* Otherwise, create the actual directory for the service */
2409
2410 r = mkdir_label(pp, context->directories[type].mode);
2411 if (r < 0 && r != -EEXIST)
2412 goto fail;
2413 }
6c47cd7d 2414
df61e79a
LB
2415 /* And link it up from the original place. Note that if a mount namespace is going to be
2416 * used, then this symlink remains on the host, and a new one for the child namespace will
2417 * be created later. */
6c9c51e5 2418 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2419 if (r < 0)
2420 goto fail;
2421
6c47cd7d 2422 } else {
5c6d40d1
LP
2423 _cleanup_free_ char *target = NULL;
2424
2425 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2426 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2427 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2428
2429 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2430 * by DynamicUser=1 (see above)?
2431 *
2432 * We do this for all directory types except for ConfigurationDirectory=,
2433 * since they all support the private/ symlink logic at least in some
2434 * configurations, see above. */
5c6d40d1 2435
578dc69f
YW
2436 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2437 if (r < 0)
2438 goto fail;
2439
211a3d87 2440 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2441 if (!q) {
2442 r = -ENOMEM;
2443 goto fail;
2444 }
2445
578dc69f
YW
2446 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2447 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2448 if (r < 0)
2449 goto fail;
2450
2451 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2452
2453 /* Hmm, apparently DynamicUser= was once turned on for this service,
2454 * but is no longer. Let's move the directory back up. */
2455
cf52c45d
LP
2456 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2457 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2458 exec_directory_type_to_string(type), q, p);
2459
5c6d40d1
LP
2460 if (unlink(p) < 0) {
2461 r = -errno;
2462 goto fail;
2463 }
2464
2465 if (rename(q, p) < 0) {
2466 r = -errno;
2467 goto fail;
2468 }
2469 }
2470 }
2471
6c47cd7d 2472 r = mkdir_label(p, context->directories[type].mode);
d484580c 2473 if (r < 0) {
d484580c
LP
2474 if (r != -EEXIST)
2475 goto fail;
2476
206e9864
LP
2477 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2478 struct stat st;
2479
2480 /* Don't change the owner/access mode of the configuration directory,
2481 * as in the common case it is not written to by a service, and shall
2482 * not be writable. */
2483
2484 if (stat(p, &st) < 0) {
2485 r = -errno;
2486 goto fail;
2487 }
2488
2489 /* Still complain if the access mode doesn't match */
2490 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2491 log_warning("%s \'%s\' already exists but the mode is different. "
2492 "(File system: %o %sMode: %o)",
211a3d87 2493 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2494 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2495
6cff72eb 2496 continue;
206e9864 2497 }
6cff72eb 2498 }
a1164ae3 2499 }
07689d5d 2500
206e9864 2501 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2502 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2503 * current UID/GID ownership.) */
2504 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2505 if (r < 0)
2506 goto fail;
c71b2eb7 2507
607b358e
LP
2508 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2509 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2510 * assignments to exist. */
607b358e 2511 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2512 if (r < 0)
3536f49e 2513 goto fail;
07689d5d
LP
2514 }
2515
211a3d87
LB
2516 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2517 * they are set up later, to allow configuring empty var/run/etc. */
2518 if (!needs_mount_namespace)
2519 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2520 r = create_many_symlinks(params->prefix[type],
2521 context->directories[type].items[i].path,
2522 context->directories[type].items[i].symlinks);
2523 if (r < 0)
2524 goto fail;
2525 }
2526
07689d5d 2527 return 0;
3536f49e
YW
2528
2529fail:
2530 *exit_status = exit_status_table[type];
3536f49e 2531 return r;
07689d5d
LP
2532}
2533
bb0c0d6f
LP
2534static int write_credential(
2535 int dfd,
2536 const char *id,
2537 const void *data,
2538 size_t size,
2539 uid_t uid,
2540 bool ownership_ok) {
2541
2542 _cleanup_(unlink_and_freep) char *tmp = NULL;
2543 _cleanup_close_ int fd = -1;
2544 int r;
2545
2546 r = tempfn_random_child("", "cred", &tmp);
2547 if (r < 0)
2548 return r;
2549
2550 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2551 if (fd < 0) {
2552 tmp = mfree(tmp);
2553 return -errno;
2554 }
2555
43144be4 2556 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2557 if (r < 0)
2558 return r;
2559
2560 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2561 return -errno;
2562
2563 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2564 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2565 if (r < 0) {
2566 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2567 return r;
2568
2569 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2570 * to express: that the user gets read access and nothing
2571 * else. But if the backing fs can't support that (e.g. ramfs)
2572 * then we can use file ownership instead. But that's only safe if
2573 * we can then re-mount the whole thing read-only, so that the
2574 * user can no longer chmod() the file to gain write access. */
2575 return r;
2576
f5fbe71d 2577 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2578 return -errno;
2579 }
2580 }
2581
2582 if (renameat(dfd, tmp, dfd, id) < 0)
2583 return -errno;
2584
2585 tmp = mfree(tmp);
2586 return 0;
2587}
2588
2ad591a3
LP
2589static char **credential_search_path(
2590 const ExecParameters *params,
2591 bool encrypted) {
2592
2593 _cleanup_strv_free_ char **l = NULL;
2594
2595 assert(params);
2596
2597 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2598 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2599 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2600
2601 if (encrypted) {
2602 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2603 return NULL;
2604
2605 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2606 return NULL;
2607 }
2608
2609 if (params->received_credentials_directory)
2610 if (strv_extend(&l, params->received_credentials_directory) < 0)
2611 return NULL;
2612
2613 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2614 return NULL;
2615
2616 if (DEBUG_LOGGING) {
2617 _cleanup_free_ char *t = strv_join(l, ":");
2618
2619 log_debug("Credential search path is: %s", t);
2620 }
2621
2622 return TAKE_PTR(l);
2623}
2624
3989bdc1
AB
2625static int load_credential(
2626 const ExecContext *context,
2627 const ExecParameters *params,
10b44e1d
LP
2628 const char *id,
2629 const char *path,
2630 bool encrypted,
3989bdc1
AB
2631 const char *unit,
2632 int read_dfd,
2633 int write_dfd,
2634 uid_t uid,
2635 bool ownership_ok,
2636 uint64_t *left) {
2637
3989bdc1 2638 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2ad591a3 2639 _cleanup_strv_free_ char **search_path = NULL;
3989bdc1 2640 _cleanup_(erase_and_freep) char *data = NULL;
2ad591a3
LP
2641 _cleanup_free_ char *bindname = NULL;
2642 const char *source = NULL;
3989bdc1 2643 bool missing_ok = true;
2ad591a3 2644 size_t size, add, maxsz;
3989bdc1
AB
2645 int r;
2646
10b44e1d
LP
2647 assert(context);
2648 assert(params);
2649 assert(id);
2650 assert(path);
2651 assert(unit);
2652 assert(write_dfd >= 0);
2653 assert(left);
2654
2ad591a3
LP
2655 if (read_dfd >= 0) {
2656 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2657 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2658 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2659 * open it. */
2660
2661 if (!filename_is_valid(path)) /* safety check */
2662 return -EINVAL;
2663
2664 missing_ok = true;
10b44e1d 2665 source = path;
2ad591a3
LP
2666
2667 } else if (path_is_absolute(path)) {
2668 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2669 * sockets */
2670
2671 if (!path_is_valid(path)) /* safety check */
2672 return -EINVAL;
2673
3989bdc1
AB
2674 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2675
2676 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2677 * via the source socket address in case we read off an AF_UNIX socket. */
10b44e1d 2678 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3989bdc1
AB
2679 return -ENOMEM;
2680
2681 missing_ok = false;
2ad591a3 2682 source = path;
3989bdc1 2683
2ad591a3
LP
2684 } else if (credential_name_valid(path)) {
2685 /* If this is a relative path, take it as credential name relative to the credentials
2686 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2687 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2688
2689 search_path = credential_search_path(params, encrypted);
2690 if (!search_path)
3989bdc1
AB
2691 return -ENOMEM;
2692
2ad591a3 2693 missing_ok = true;
3989bdc1
AB
2694 } else
2695 source = NULL;
2696
2ad591a3
LP
2697 if (encrypted)
2698 flags |= READ_FULL_FILE_UNBASE64;
2699
2700 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2701
2702 if (search_path) {
2703 STRV_FOREACH(d, search_path) {
2704 _cleanup_free_ char *j = NULL;
2705
2706 j = path_join(*d, path);
2707 if (!j)
2708 return -ENOMEM;
2709
2710 r = read_full_file_full(
2711 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2712 UINT64_MAX,
2713 maxsz,
2714 flags,
2715 NULL,
2716 &data, &size);
2717 if (r != -ENOENT)
2718 break;
2719 }
2720 } else if (source)
3989bdc1
AB
2721 r = read_full_file_full(
2722 read_dfd, source,
2723 UINT64_MAX,
2ad591a3
LP
2724 maxsz,
2725 flags,
3989bdc1
AB
2726 bindname,
2727 &data, &size);
2728 else
2729 r = -ENOENT;
2730
10b44e1d 2731 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3989bdc1
AB
2732 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2733 * will get clear errors if we don't pass such a missing credential on as they
2734 * themselves will get ENOENT when trying to read them, which should not be much
2735 * worse than when we handle the error here and make it fatal.
2736 *
2737 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2738 * we are fine, too. */
10b44e1d 2739 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3989bdc1
AB
2740 return 0;
2741 }
2742 if (r < 0)
10b44e1d 2743 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3989bdc1 2744
10b44e1d 2745 if (encrypted) {
3989bdc1
AB
2746 _cleanup_free_ void *plaintext = NULL;
2747 size_t plaintext_size = 0;
2748
10b44e1d 2749 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
3989bdc1
AB
2750 if (r < 0)
2751 return r;
2752
2753 free_and_replace(data, plaintext);
2754 size = plaintext_size;
2755 }
2756
10b44e1d 2757 add = strlen(id) + size;
3989bdc1
AB
2758 if (add > *left)
2759 return -E2BIG;
2760
10b44e1d 2761 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
3989bdc1 2762 if (r < 0)
94602bff 2763 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
3989bdc1
AB
2764
2765 *left -= add;
2766 return 0;
2767}
2768
2769struct load_cred_args {
3989bdc1
AB
2770 const ExecContext *context;
2771 const ExecParameters *params;
461345a1 2772 bool encrypted;
3989bdc1
AB
2773 const char *unit;
2774 int dfd;
2775 uid_t uid;
2776 bool ownership_ok;
2777 uint64_t *left;
2778};
2779
2780static int load_cred_recurse_dir_cb(
2781 RecurseDirEvent event,
2782 const char *path,
2783 int dir_fd,
2784 int inode_fd,
2785 const struct dirent *de,
2786 const struct statx *sx,
2787 void *userdata) {
2788
6394e5cd 2789 struct load_cred_args *args = ASSERT_PTR(userdata);
11348386 2790 _cleanup_free_ char *sub_id = NULL;
3989bdc1
AB
2791 int r;
2792
2793 if (event != RECURSE_DIR_ENTRY)
2794 return RECURSE_DIR_CONTINUE;
2795
2796 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2797 return RECURSE_DIR_CONTINUE;
2798
11348386 2799 sub_id = strreplace(path, "/", "_");
3989bdc1
AB
2800 if (!sub_id)
2801 return -ENOMEM;
2802
2803 if (!credential_name_valid(sub_id))
1451435c 2804 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3989bdc1 2805
5bec447a 2806 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3989bdc1
AB
2807 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2808 return RECURSE_DIR_CONTINUE;
2809 }
5bec447a
LP
2810 if (errno != ENOENT)
2811 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3989bdc1 2812
10b44e1d
LP
2813 r = load_credential(
2814 args->context,
2815 args->params,
2816 sub_id,
2817 de->d_name,
461345a1 2818 args->encrypted,
10b44e1d
LP
2819 args->unit,
2820 dir_fd,
2821 args->dfd,
2822 args->uid,
2823 args->ownership_ok,
2824 args->left);
3989bdc1
AB
2825 if (r < 0)
2826 return r;
2827
2828 return RECURSE_DIR_CONTINUE;
2829}
2830
bb0c0d6f
LP
2831static int acquire_credentials(
2832 const ExecContext *context,
2833 const ExecParameters *params,
d3dcf4e3 2834 const char *unit,
bb0c0d6f
LP
2835 const char *p,
2836 uid_t uid,
2837 bool ownership_ok) {
2838
43144be4 2839 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2840 _cleanup_close_ int dfd = -1;
43144be4 2841 ExecLoadCredential *lc;
bb0c0d6f 2842 ExecSetCredential *sc;
bb0c0d6f
LP
2843 int r;
2844
2845 assert(context);
2846 assert(p);
2847
2848 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2849 if (dfd < 0)
2850 return -errno;
2851
43144be4
LP
2852 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2853 HASHMAP_FOREACH(lc, context->load_credentials) {
3989bdc1 2854 _cleanup_close_ int sub_fd = -1;
d3dcf4e3 2855
f344f7fd
LP
2856 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2857 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2858 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2859 * propagate a credential passed to us from further up. */
43144be4 2860
f344f7fd
LP
2861 if (path_is_absolute(lc->path)) {
2862 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
1d68a2e1
LP
2863 if (sub_fd < 0 && !IN_SET(errno,
2864 ENOTDIR, /* Not a directory */
2865 ENOENT)) /* Doesn't exist? */
2866 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
f344f7fd 2867 }
43144be4 2868
61c5a49e 2869 if (sub_fd < 0)
f344f7fd 2870 /* Regular file (incl. a credential passed in from higher up) */
10b44e1d
LP
2871 r = load_credential(
2872 context,
2873 params,
2874 lc->id,
2875 lc->path,
2876 lc->encrypted,
2877 unit,
2878 -1,
2879 dfd,
2880 uid,
2881 ownership_ok,
2882 &left);
61c5a49e 2883 else
10b44e1d 2884 /* Directory */
3989bdc1
AB
2885 r = recurse_dir(
2886 sub_fd,
11348386 2887 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3989bdc1
AB
2888 /* statx_mask= */ 0,
2889 /* n_depth_max= */ UINT_MAX,
9883cbb2 2890 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3989bdc1
AB
2891 load_cred_recurse_dir_cb,
2892 &(struct load_cred_args) {
3989bdc1
AB
2893 .context = context,
2894 .params = params,
461345a1 2895 .encrypted = lc->encrypted,
3989bdc1
AB
2896 .unit = unit,
2897 .dfd = dfd,
2898 .uid = uid,
2899 .ownership_ok = ownership_ok,
2900 .left = &left,
2901 });
61c5a49e
LP
2902 if (r < 0)
2903 return r;
bb0c0d6f
LP
2904 }
2905
9e6e9d61
LP
2906 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2907 * them, so that they can act as a "default" if the same credential is specified multiple times. */
43144be4
LP
2908 HASHMAP_FOREACH(sc, context->set_credentials) {
2909 _cleanup_(erase_and_freep) void *plaintext = NULL;
2910 const char *data;
2911 size_t size, add;
2912
9e6e9d61
LP
2913 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2914 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2915 * slow and involved, hence it's nice to be able to skip that if the credential already
2916 * exists anyway. */
43144be4
LP
2917 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2918 continue;
2919 if (errno != ENOENT)
2920 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2921
2922 if (sc->encrypted) {
2923 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2924 if (r < 0)
2925 return r;
2926
2927 data = plaintext;
2928 } else {
2929 data = sc->data;
2930 size = sc->size;
2931 }
2932
2933 add = strlen(sc->id) + size;
2934 if (add > left)
2935 return -E2BIG;
2936
2937 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2938 if (r < 0)
2939 return r;
2940
43144be4
LP
2941 left -= add;
2942 }
2943
bb0c0d6f
LP
2944 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2945 return -errno;
2946
2947 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2948 * accessible */
2949
2950 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2951 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2952 if (r < 0) {
2953 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2954 return r;
2955
2956 if (!ownership_ok)
2957 return r;
2958
f5fbe71d 2959 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2960 return -errno;
2961 }
2962 }
2963
2964 return 0;
2965}
2966
2967static int setup_credentials_internal(
2968 const ExecContext *context,
2969 const ExecParameters *params,
d3dcf4e3 2970 const char *unit,
bb0c0d6f
LP
2971 const char *final, /* This is where the credential store shall eventually end up at */
2972 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2973 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2974 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2975 uid_t uid) {
2976
2977 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2978 * if we mounted something; false if we definitely can't mount anything */
2979 bool final_mounted;
2980 const char *where;
2981
2982 assert(context);
2983 assert(final);
2984 assert(workspace);
2985
2986 if (reuse_workspace) {
2987 r = path_is_mount_point(workspace, NULL, 0);
2988 if (r < 0)
2989 return r;
2990 if (r > 0)
2991 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2992 else
2993 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2994 } else
2995 workspace_mounted = -1; /* ditto */
2996
2997 r = path_is_mount_point(final, NULL, 0);
2998 if (r < 0)
2999 return r;
3000 if (r > 0) {
3001 /* If the final place already has something mounted, we use that. If the workspace also has
3002 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3003 * different). */
3004 final_mounted = true;
3005
3006 if (workspace_mounted < 0) {
3007 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
3008 * the final version to the workspace, and make it writable, so that we can make
3009 * changes */
3010
21935150
LP
3011 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3012 if (r < 0)
3013 return r;
bb0c0d6f 3014
21935150
LP
3015 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3016 if (r < 0)
3017 return r;
bb0c0d6f
LP
3018
3019 workspace_mounted = true;
3020 }
3021 } else
3022 final_mounted = false;
3023
3024 if (workspace_mounted < 0) {
3025 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3026 for (int try = 0;; try++) {
3027
3028 if (try == 0) {
3029 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
3030 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3031 if (r >= 0) {
bb0c0d6f
LP
3032 workspace_mounted = true;
3033 break;
3034 }
3035
3036 } else if (try == 1) {
3037 _cleanup_free_ char *opts = NULL;
3038
43144be4 3039 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
3040 return -ENOMEM;
3041
3042 /* Fall back to "tmpfs" otherwise */
21935150
LP
3043 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3044 if (r >= 0) {
bb0c0d6f
LP
3045 workspace_mounted = true;
3046 break;
3047 }
3048
3049 } else {
3050 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
3051 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3052 if (r < 0) {
3053 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3054 return r;
bb0c0d6f
LP
3055
3056 if (must_mount) /* If we it's not OK to use the plain directory
3057 * fallback, propagate all errors too */
21935150 3058 return r;
bb0c0d6f
LP
3059
3060 /* If we lack privileges to bind mount stuff, then let's gracefully
3061 * proceed for compat with container envs, and just use the final dir
3062 * as is. */
3063
3064 workspace_mounted = false;
3065 break;
3066 }
3067
3068 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
3069 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3070 if (r < 0)
3071 return r;
bb0c0d6f
LP
3072
3073 workspace_mounted = true;
3074 break;
3075 }
3076 }
3077 }
3078
3079 assert(!must_mount || workspace_mounted > 0);
3080 where = workspace_mounted ? workspace : final;
3081
e3a0a862
CG
3082 (void) label_fix_container(where, final, 0);
3083
d3dcf4e3 3084 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
3085 if (r < 0)
3086 return r;
3087
3088 if (workspace_mounted) {
3089 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
3090 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3091 if (r < 0)
3092 return r;
bb0c0d6f
LP
3093
3094 /* And mount it to the final place, read-only */
21935150
LP
3095 if (final_mounted)
3096 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3097 else
3098 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3099 if (r < 0)
3100 return r;
bb0c0d6f
LP
3101 } else {
3102 _cleanup_free_ char *parent = NULL;
3103
3104 /* If we do not have our own mount put used the plain directory fallback, then we need to
3105 * open access to the top-level credential directory and the per-service directory now */
3106
3107 parent = dirname_malloc(final);
3108 if (!parent)
3109 return -ENOMEM;
3110 if (chmod(parent, 0755) < 0)
3111 return -errno;
3112 }
3113
3114 return 0;
3115}
3116
3117static int setup_credentials(
3118 const ExecContext *context,
3119 const ExecParameters *params,
3120 const char *unit,
3121 uid_t uid) {
3122
3123 _cleanup_free_ char *p = NULL, *q = NULL;
bb0c0d6f
LP
3124 int r;
3125
3126 assert(context);
3127 assert(params);
3128
3129 if (!exec_context_has_credentials(context))
3130 return 0;
3131
3132 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3133 return -EINVAL;
3134
3135 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3136 * and the subdir we mount over with a read-only file system readable by the service's user */
3137 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3138 if (!q)
3139 return -ENOMEM;
3140
3141 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3142 if (r < 0 && r != -EEXIST)
3143 return r;
3144
3145 p = path_join(q, unit);
3146 if (!p)
3147 return -ENOMEM;
3148
3149 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3150 if (r < 0 && r != -EEXIST)
3151 return r;
3152
3153 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3154 if (r < 0) {
3155 _cleanup_free_ char *t = NULL, *u = NULL;
3156
3157 /* If this is not a privilege or support issue then propagate the error */
3158 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3159 return r;
3160
3161 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3162 * it into place, so that users can't access half-initialized credential stores. */
3163 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3164 if (!t)
3165 return -ENOMEM;
3166
3167 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3168 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3169 * after it is fully set up */
3170 u = path_join(t, unit);
3171 if (!u)
3172 return -ENOMEM;
3173
3174 FOREACH_STRING(i, t, u) {
3175 r = mkdir_label(i, 0700);
3176 if (r < 0 && r != -EEXIST)
3177 return r;
3178 }
3179
3180 r = setup_credentials_internal(
3181 context,
3182 params,
d3dcf4e3 3183 unit,
bb0c0d6f
LP
3184 p, /* final mount point */
3185 u, /* temporary workspace to overmount */
3186 true, /* reuse the workspace if it is already a mount */
3187 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3188 uid);
3189
3190 (void) rmdir(u); /* remove the workspace again if we can. */
3191
3192 if (r < 0)
3193 return r;
3194
3195 } else if (r == 0) {
3196
3197 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3198 * we can use the same directory for all cases, after turning off propagation. Question
3199 * though is: where do we turn off propagation exactly, and where do we place the workspace
3200 * directory? We need some place that is guaranteed to be a mount point in the host, and
3201 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3202 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3203 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3204 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3205 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3206 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3207 * propagation on the former, and then overmount the latter.
3208 *
3209 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3210 * for this purpose, but there are few other candidates that work equally well for us, and
3211 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3212 * that no one else sees this should be OK to do. */
bb0c0d6f 3213
21935150
LP
3214 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3215 if (r < 0)
bb0c0d6f
LP
3216 goto child_fail;
3217
3218 r = setup_credentials_internal(
3219 context,
3220 params,
d3dcf4e3 3221 unit,
bb0c0d6f
LP
3222 p, /* final mount point */
3223 "/dev/shm", /* temporary workspace to overmount */
3224 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3225 true, /* insist that something is mounted, do not allow fallback to plain directory */
3226 uid);
3227 if (r < 0)
3228 goto child_fail;
3229
3230 _exit(EXIT_SUCCESS);
3231
3232 child_fail:
3233 _exit(EXIT_FAILURE);
3234 }
3235
3236 return 0;
3237}
3238
#if ENABLE_SMACK
/* Applies the SMACK process label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context configures a label, that one is applied. Otherwise, and only when
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label read from
 * 'executable_fd' is applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if none was read.
 * Returns 0 on success, negative errno-style code on failure. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;
        }
#ifdef SMACK_DEFAULT_PROCESS_LABEL
        else {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or missing support is not an error; we then use the default label */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
cefc33ae 3270
6c47cd7d
LP
3271static int compile_bind_mounts(
3272 const ExecContext *context,
3273 const ExecParameters *params,
3274 BindMount **ret_bind_mounts,
da6053d0 3275 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3276 char ***ret_empty_directories) {
3277
3278 _cleanup_strv_free_ char **empty_directories = NULL;
3279 BindMount *bind_mounts;
5b10116e 3280 size_t n, h = 0;
6c47cd7d
LP
3281 int r;
3282
3283 assert(context);
3284 assert(params);
3285 assert(ret_bind_mounts);
3286 assert(ret_n_bind_mounts);
3287 assert(ret_empty_directories);
3288
3289 n = context->n_bind_mounts;
5b10116e 3290 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3291 if (!params->prefix[t])
3292 continue;
3293
211a3d87 3294 n += context->directories[t].n_items;
6c47cd7d
LP
3295 }
3296
3297 if (n <= 0) {
3298 *ret_bind_mounts = NULL;
3299 *ret_n_bind_mounts = 0;
3300 *ret_empty_directories = NULL;
3301 return 0;
3302 }
3303
3304 bind_mounts = new(BindMount, n);
3305 if (!bind_mounts)
3306 return -ENOMEM;
3307
5b10116e 3308 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3309 BindMount *item = context->bind_mounts + i;
3310 char *s, *d;
3311
3312 s = strdup(item->source);
3313 if (!s) {
3314 r = -ENOMEM;
3315 goto finish;
3316 }
3317
3318 d = strdup(item->destination);
3319 if (!d) {
3320 free(s);
3321 r = -ENOMEM;
3322 goto finish;
3323 }
3324
3325 bind_mounts[h++] = (BindMount) {
3326 .source = s,
3327 .destination = d,
3328 .read_only = item->read_only,
3329 .recursive = item->recursive,
3330 .ignore_enoent = item->ignore_enoent,
3331 };
3332 }
3333
5b10116e 3334 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3335 if (!params->prefix[t])
3336 continue;
3337
211a3d87 3338 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3339 continue;
3340
494d0247 3341 if (exec_directory_is_private(context, t) &&
74e12520 3342 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3343 char *private_root;
3344
3345 /* So this is for a dynamic user, and we need to make sure the process can access its own
3346 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3347 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3348
657ee2d8 3349 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3350 if (!private_root) {
3351 r = -ENOMEM;
3352 goto finish;
3353 }
3354
3355 r = strv_consume(&empty_directories, private_root);
a635a7ae 3356 if (r < 0)
6c47cd7d 3357 goto finish;
6c47cd7d
LP
3358 }
3359
211a3d87 3360 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3361 char *s, *d;
3362
494d0247 3363 if (exec_directory_is_private(context, t))
211a3d87 3364 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3365 else
211a3d87 3366 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3367 if (!s) {
3368 r = -ENOMEM;
3369 goto finish;
3370 }
3371
494d0247 3372 if (exec_directory_is_private(context, t) &&
74e12520 3373 exec_context_with_rootfs(context))
5609f688
YW
3374 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3375 * directory is not created on the root directory. So, let's bind-mount the directory
3376 * on the 'non-private' place. */
211a3d87 3377 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3378 else
3379 d = strdup(s);
6c47cd7d
LP
3380 if (!d) {
3381 free(s);
3382 r = -ENOMEM;
3383 goto finish;
3384 }
3385
3386 bind_mounts[h++] = (BindMount) {
3387 .source = s,
3388 .destination = d,
3389 .read_only = false,
9ce4e4b0 3390 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3391 .recursive = true,
3392 .ignore_enoent = false,
3393 };
3394 }
3395 }
3396
3397 assert(h == n);
3398
3399 *ret_bind_mounts = bind_mounts;
3400 *ret_n_bind_mounts = n;
ae2a15bc 3401 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3402
3403 return (int) n;
3404
3405finish:
3406 bind_mount_free_many(bind_mounts, h);
3407 return r;
3408}
3409
df61e79a
LB
3410/* ret_symlinks will contain a list of pairs src:dest that describes
3411 * the symlinks to create later on. For example, the symlinks needed
3412 * to safely give private directories to DynamicUser=1 users. */
3413static int compile_symlinks(
3414 const ExecContext *context,
3415 const ExecParameters *params,
3416 char ***ret_symlinks) {
3417
3418 _cleanup_strv_free_ char **symlinks = NULL;
3419 int r;
3420
3421 assert(context);
3422 assert(params);
3423 assert(ret_symlinks);
3424
3425 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3426 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3427 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 3428
211a3d87
LB
3429 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3430 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3431
211a3d87
LB
3432 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3433 dst_abs = path_join(params->prefix[dt], *symlink);
3434 if (!src_abs || !dst_abs)
3435 return -ENOMEM;
df61e79a 3436
211a3d87
LB
3437 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3438 if (r < 0)
3439 return r;
3440 }
3441
3fa80e5e 3442 if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context))
211a3d87
LB
3443 continue;
3444
3445 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3446 if (!private_path)
3447 return -ENOMEM;
3448
211a3d87 3449 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3450 if (!path)
3451 return -ENOMEM;
3452
3453 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3454 if (r < 0)
3455 return r;
3456 }
3457 }
3458
3459 *ret_symlinks = TAKE_PTR(symlinks);
3460
3461 return 0;
3462}
3463
4e677599
LP
3464static bool insist_on_sandboxing(
3465 const ExecContext *context,
3466 const char *root_dir,
3467 const char *root_image,
3468 const BindMount *bind_mounts,
3469 size_t n_bind_mounts) {
3470
4e677599
LP
3471 assert(context);
3472 assert(n_bind_mounts == 0 || bind_mounts);
3473
3474 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3475 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3476 * rearrange stuff in a way we cannot ignore gracefully. */
3477
3478 if (context->n_temporary_filesystems > 0)
3479 return true;
3480
3481 if (root_dir || root_image)
3482 return true;
3483
b3d13314
LB
3484 if (context->n_mount_images > 0)
3485 return true;
3486
4e677599
LP
3487 if (context->dynamic_user)
3488 return true;
3489
4355c04f
LB
3490 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3491 return true;
3492
4e677599
LP
3493 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3494 * essential. */
5b10116e 3495 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3496 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3497 return true;
3498
91dd5f7c
LP
3499 if (context->log_namespace)
3500 return true;
3501
4e677599
LP
3502 return false;
3503}
3504
6818c54c 3505static int apply_mount_namespace(
34cf6c43 3506 const Unit *u,
9f71ba8d 3507 ExecCommandFlags command_flags,
6818c54c
LP
3508 const ExecContext *context,
3509 const ExecParameters *params,
7cc5ef5f
ZJS
3510 const ExecRuntime *runtime,
3511 char **error_path) {
6818c54c 3512
df61e79a 3513 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3514 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3515 const char *root_dir = NULL, *root_image = NULL;
24759d8f
LB
3516 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3517 *extension_dir = NULL;
228af36f 3518 NamespaceInfo ns_info;
165a31c0 3519 bool needs_sandboxing;
6c47cd7d 3520 BindMount *bind_mounts = NULL;
da6053d0 3521 size_t n_bind_mounts = 0;
6818c54c 3522 int r;
93c6bb51 3523
2b3c1b9e
DH
3524 assert(context);
3525
915e6d16
LP
3526 if (params->flags & EXEC_APPLY_CHROOT) {
3527 root_image = context->root_image;
3528
3529 if (!root_image)
3530 root_dir = context->root_directory;
3531 }
93c6bb51 3532
6c47cd7d
LP
3533 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3534 if (r < 0)
3535 return r;
3536
211a3d87 3537 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3538 r = compile_symlinks(context, params, &symlinks);
3539 if (r < 0)
3540 return r;
3541
9f71ba8d 3542 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3543 if (needs_sandboxing) {
3544 /* The runtime struct only contains the parent of the private /tmp,
3545 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3546 * that is sticky, and that's the one we want to use here.
3547 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3548
3549 if (context->private_tmp && runtime) {
56a13a49
ZJS
3550 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3551 tmp_dir = runtime->tmp_dir;
3552 else if (runtime->tmp_dir)
3553 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3554
3555 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3556 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3557 else if (runtime->var_tmp_dir)
56a13a49 3558 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3559 }
3560
b5a33299
YW
3561 ns_info = (NamespaceInfo) {
3562 .ignore_protect_paths = false,
3563 .private_dev = context->private_devices,
3564 .protect_control_groups = context->protect_control_groups,
3565 .protect_kernel_tunables = context->protect_kernel_tunables,
3566 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3567 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3568 .protect_hostname = context->protect_hostname,
5e98086d 3569 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3570 .private_mounts = context->private_mounts,
52b3d652
LP
3571 .protect_home = context->protect_home,
3572 .protect_system = context->protect_system,
4e399953
LP
3573 .protect_proc = context->protect_proc,
3574 .proc_subset = context->proc_subset,
80271a44 3575 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3576 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3577 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3578 };
ecf63c91 3579 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3580 /*
3581 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3582 * sandbox info, otherwise enforce it, don't ignore protected paths and
3583 * fail if we are enable to apply the sandbox inside the mount namespace.
3584 */
3585 ns_info = (NamespaceInfo) {
3586 .ignore_protect_paths = true,
3587 };
3588 else
3589 ns_info = (NamespaceInfo) {};
b5a33299 3590
37ed15d7
FB
3591 if (context->mount_flags == MS_SHARED)
3592 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3593
a631cbfa
LP
3594 if (exec_context_has_credentials(context) &&
3595 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3596 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3597 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3598 if (!creds_path) {
3599 r = -ENOMEM;
3600 goto finalize;
3601 }
bbb4e7f3
LP
3602 }
3603
5e8deb94
LB
3604 if (MANAGER_IS_SYSTEM(u->manager)) {
3605 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3606 if (!propagate_dir) {
3607 r = -ENOMEM;
3608 goto finalize;
3609 }
3610
5e8deb94 3611 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3612 if (!incoming_dir) {
3613 r = -ENOMEM;
3614 goto finalize;
3615 }
24759d8f
LB
3616
3617 extension_dir = strdup("/run/systemd/unit-extensions");
3618 if (!extension_dir) {
3619 r = -ENOMEM;
3620 goto finalize;
3621 }
3622 } else
3623 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3624 r = -ENOMEM;
3625 goto finalize;
3626 }
5e8deb94 3627
18d73705 3628 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3629 &ns_info, context->read_write_paths,
165a31c0
LP
3630 needs_sandboxing ? context->read_only_paths : NULL,
3631 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3632 needs_sandboxing ? context->exec_paths : NULL,
3633 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3634 empty_directories,
df61e79a 3635 symlinks,
6c47cd7d
LP
3636 bind_mounts,
3637 n_bind_mounts,
2abd4e38
YW
3638 context->temporary_filesystems,
3639 context->n_temporary_filesystems,
b3d13314
LB
3640 context->mount_images,
3641 context->n_mount_images,
56a13a49
ZJS
3642 tmp_dir,
3643 var_tmp_dir,
bbb4e7f3 3644 creds_path,
91dd5f7c 3645 context->log_namespace,
915e6d16 3646 context->mount_flags,
d4d55b0d
LB
3647 context->root_hash, context->root_hash_size, context->root_hash_path,
3648 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3649 context->root_verity,
93f59701
LB
3650 context->extension_images,
3651 context->n_extension_images,
a07b9926 3652 context->extension_directories,
5e8deb94
LB
3653 propagate_dir,
3654 incoming_dir,
24759d8f 3655 extension_dir,
3bdc25a4 3656 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3657 error_path);
93c6bb51 3658
1beab8b0 3659 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3660 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3661 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3662 * completely different execution environment. */
aca835ed 3663 if (r == -ENOANO) {
4e677599
LP
3664 if (insist_on_sandboxing(
3665 context,
3666 root_dir, root_image,
3667 bind_mounts,
3668 n_bind_mounts)) {
3669 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3670 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3671 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3672
3673 r = -EOPNOTSUPP;
3674 } else {
aca835ed 3675 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3676 r = 0;
aca835ed 3677 }
93c6bb51
DH
3678 }
3679
8062e643 3680finalize:
4e677599 3681 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3682 return r;
3683}
3684
915e6d16
LP
3685static int apply_working_directory(
3686 const ExecContext *context,
3687 const ExecParameters *params,
3688 const char *home,
376fecf6 3689 int *exit_status) {
915e6d16 3690
6732edab 3691 const char *d, *wd;
2b3c1b9e
DH
3692
3693 assert(context);
376fecf6 3694 assert(exit_status);
2b3c1b9e 3695
6732edab
LP
3696 if (context->working_directory_home) {
3697
376fecf6
LP
3698 if (!home) {
3699 *exit_status = EXIT_CHDIR;
6732edab 3700 return -ENXIO;
376fecf6 3701 }
6732edab 3702
2b3c1b9e 3703 wd = home;
6732edab 3704
14eb3285
LP
3705 } else
3706 wd = empty_to_root(context->working_directory);
e7f1e7c6 3707
fa97f630 3708 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3709 d = wd;
fa97f630 3710 else
3b0e5bb5 3711 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3712
376fecf6
LP
3713 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3714 *exit_status = EXIT_CHDIR;
2b3c1b9e 3715 return -errno;
376fecf6 3716 }
e7f1e7c6
DH
3717
3718 return 0;
3719}
3720
fa97f630
JB
3721static int apply_root_directory(
3722 const ExecContext *context,
3723 const ExecParameters *params,
3724 const bool needs_mount_ns,
3725 int *exit_status) {
3726
3727 assert(context);
3728 assert(exit_status);
3729
5b10116e 3730 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3731 if (!needs_mount_ns && context->root_directory)
3732 if (chroot(context->root_directory) < 0) {
3733 *exit_status = EXIT_CHROOT;
3734 return -errno;
3735 }
fa97f630
JB
3736
3737 return 0;
3738}
3739
b1edf445 3740static int setup_keyring(
34cf6c43 3741 const Unit *u,
b1edf445
LP
3742 const ExecContext *context,
3743 const ExecParameters *p,
3744 uid_t uid, gid_t gid) {
3745
74dd6b51 3746 key_serial_t keyring;
e64c2d0b
DJL
3747 int r = 0;
3748 uid_t saved_uid;
3749 gid_t saved_gid;
74dd6b51
LP
3750
3751 assert(u);
b1edf445 3752 assert(context);
74dd6b51
LP
3753 assert(p);
3754
3755 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3756 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3757 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3758 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3759 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3760 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3761
b1edf445
LP
3762 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3763 return 0;
3764
e64c2d0b
DJL
3765 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3766 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3767 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3768 * & group is just as nasty as acquiring a reference to the user keyring. */
3769
3770 saved_uid = getuid();
3771 saved_gid = getgid();
3772
3773 if (gid_is_valid(gid) && gid != saved_gid) {
3774 if (setregid(gid, -1) < 0)
3775 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3776 }
3777
3778 if (uid_is_valid(uid) && uid != saved_uid) {
3779 if (setreuid(uid, -1) < 0) {
3780 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3781 goto out;
3782 }
3783 }
3784
74dd6b51
LP
3785 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3786 if (keyring == -1) {
3787 if (errno == ENOSYS)
8002fb97 3788 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3789 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3790 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3791 else if (errno == EDQUOT)
8002fb97 3792 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3793 else
e64c2d0b 3794 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3795
e64c2d0b 3796 goto out;
74dd6b51
LP
3797 }
3798
e64c2d0b
DJL
3799 /* When requested link the user keyring into the session keyring. */
3800 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3801
3802 if (keyctl(KEYCTL_LINK,
3803 KEY_SPEC_USER_KEYRING,
3804 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3805 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3806 goto out;
3807 }
3808 }
3809
3810 /* Restore uid/gid back */
3811 if (uid_is_valid(uid) && uid != saved_uid) {
3812 if (setreuid(saved_uid, -1) < 0) {
3813 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3814 goto out;
3815 }
3816 }
3817
3818 if (gid_is_valid(gid) && gid != saved_gid) {
3819 if (setregid(saved_gid, -1) < 0)
3820 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3821 }
3822
3823 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3824 if (!sd_id128_is_null(u->invocation_id)) {
3825 key_serial_t key;
3826
3827 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3828 if (key == -1)
8002fb97 3829 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3830 else {
3831 if (keyctl(KEYCTL_SETPERM, key,
3832 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3833 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3834 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3835 }
3836 }
3837
e64c2d0b 3838out:
37b22b3b 3839 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3840 /* no extra logging, as only the first already reported error matters */
3841 if (getuid() != saved_uid)
3842 (void) setreuid(saved_uid, -1);
b1edf445 3843
e64c2d0b
DJL
3844 if (getgid() != saved_gid)
3845 (void) setregid(saved_gid, -1);
b1edf445 3846
e64c2d0b 3847 return r;
74dd6b51
LP
3848}
3849
/* Appends every valid (i.e. non-negative) fd of the given socket pair to the array, bumping the element
 * counter *n for each fd stored. Invalid (negative) entries are skipped. The caller has to make sure the
 * array has room for up to two additional entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3860
a34ceba6
LP
3861static int close_remaining_fds(
3862 const ExecParameters *params,
34cf6c43
YW
3863 const ExecRuntime *runtime,
3864 const DynamicCreds *dcreds,
00d9ef85 3865 int user_lookup_fd,
a34ceba6 3866 int socket_fd,
5b8d1f6b 3867 const int *fds, size_t n_fds) {
a34ceba6 3868
da6053d0 3869 size_t n_dont_close = 0;
00d9ef85 3870 int dont_close[n_fds + 12];
a34ceba6
LP
3871
3872 assert(params);
3873
3874 if (params->stdin_fd >= 0)
3875 dont_close[n_dont_close++] = params->stdin_fd;
3876 if (params->stdout_fd >= 0)
3877 dont_close[n_dont_close++] = params->stdout_fd;
3878 if (params->stderr_fd >= 0)
3879 dont_close[n_dont_close++] = params->stderr_fd;
3880
3881 if (socket_fd >= 0)
3882 dont_close[n_dont_close++] = socket_fd;
3883 if (n_fds > 0) {
3884 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3885 n_dont_close += n_fds;
3886 }
3887
a70581ff 3888 if (runtime) {
29206d46 3889 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3890 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3891 }
29206d46
LP
3892
3893 if (dcreds) {
3894 if (dcreds->user)
3895 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3896 if (dcreds->group)
3897 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3898 }
3899
00d9ef85
LP
3900 if (user_lookup_fd >= 0)
3901 dont_close[n_dont_close++] = user_lookup_fd;
3902
a34ceba6
LP
3903 return close_all_fds(dont_close, n_dont_close);
3904}
3905
00d9ef85
LP
3906static int send_user_lookup(
3907 Unit *unit,
3908 int user_lookup_fd,
3909 uid_t uid,
3910 gid_t gid) {
3911
3912 assert(unit);
3913
3914 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3915 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3916 * specified. */
3917
3918 if (user_lookup_fd < 0)
3919 return 0;
3920
3921 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3922 return 0;
3923
3924 if (writev(user_lookup_fd,
3925 (struct iovec[]) {
e6a7ec4b
LP
3926 IOVEC_INIT(&uid, sizeof(uid)),
3927 IOVEC_INIT(&gid, sizeof(gid)),
3928 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3929 return -errno;
3930
3931 return 0;
3932}
3933
6732edab
LP
3934static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3935 int r;
3936
3937 assert(c);
3938 assert(home);
3939 assert(buf);
3940
3941 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3942
3943 if (*home)
3944 return 0;
3945
3946 if (!c->working_directory_home)
3947 return 0;
3948
6732edab
LP
3949 r = get_home_dir(buf);
3950 if (r < 0)
3951 return r;
3952
3953 *home = *buf;
3954 return 1;
3955}
3956
da50b85a
LP
3957static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3958 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3959 int r;
3960
3961 assert(c);
3962 assert(p);
3963 assert(ret);
3964
3965 assert(c->dynamic_user);
3966
3967 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3968 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3969 * directories. */
3970
5b10116e 3971 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3972 if (t == EXEC_DIRECTORY_CONFIGURATION)
3973 continue;
3974
3975 if (!p->prefix[t])
3976 continue;
3977
211a3d87 3978 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3979 char *e;
3980
494d0247 3981 if (exec_directory_is_private(c, t))
211a3d87 3982 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3983 else
211a3d87 3984 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3985 if (!e)
3986 return -ENOMEM;
3987
3988 r = strv_consume(&list, e);
3989 if (r < 0)
3990 return r;
3991 }
3992 }
3993
ae2a15bc 3994 *ret = TAKE_PTR(list);
da50b85a
LP
3995
3996 return 0;
3997}
3998
78f93209
LP
3999static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4000 bool using_subcgroup;
4001 char *p;
4002
4003 assert(params);
4004 assert(ret);
4005
4006 if (!params->cgroup_path)
4007 return -EINVAL;
4008
4009 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4010 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4011 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4012 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4013 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4014 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4015 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4016 * flag, which is only passed for the former statements, not for the latter. */
4017
4018 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4019 if (using_subcgroup)
657ee2d8 4020 p = path_join(params->cgroup_path, ".control");
78f93209
LP
4021 else
4022 p = strdup(params->cgroup_path);
4023 if (!p)
4024 return -ENOMEM;
4025
4026 *ret = p;
4027 return using_subcgroup;
4028}
4029
e2b2fb7f
MS
4030static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4031 _cleanup_(cpu_set_reset) CPUSet s = {};
4032 int r;
4033
4034 assert(c);
4035 assert(ret);
4036
4037 if (!c->numa_policy.nodes.set) {
4038 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4039 return 0;
4040 }
4041
4042 r = numa_to_cpu_set(&c->numa_policy, &s);
4043 if (r < 0)
4044 return r;
4045
4046 cpu_set_reset(ret);
4047
4048 return cpu_set_add_all(ret, &s);
4049}
4050
4051bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4052 assert(c);
4053
4054 return c->cpu_affinity_from_numa;
4055}
4056
1da37e58
ZJS
4057static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4058 int r;
4059
4060 assert(fds);
4061 assert(n_fds);
4062 assert(*n_fds < fds_size);
4063 assert(ret_fd);
4064
4065 if (fd < 0) {
4066 *ret_fd = -1;
4067 return 0;
4068 }
4069
4070 if (fd < 3 + (int) *n_fds) {
4071 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4072 * the fds we pass to the process (or which are closed only during execve). */
4073
4074 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4075 if (r < 0)
4076 return -errno;
4077
4078 CLOSE_AND_REPLACE(fd, r);
4079 }
4080
4081 *ret_fd = fds[*n_fds] = fd;
4082 (*n_fds) ++;
4083 return 1;
4084}
4085
ff0af2a1 4086static int exec_child(
f2341e0a 4087 Unit *unit,
34cf6c43 4088 const ExecCommand *command,
ff0af2a1
LP
4089 const ExecContext *context,
4090 const ExecParameters *params,
4091 ExecRuntime *runtime,
29206d46 4092 DynamicCreds *dcreds,
ff0af2a1 4093 int socket_fd,
2caa38e9 4094 const int named_iofds[static 3],
4c47affc 4095 int *fds,
da6053d0 4096 size_t n_socket_fds,
25b583d7 4097 size_t n_storage_fds,
ff0af2a1 4098 char **files_env,
00d9ef85 4099 int user_lookup_fd,
12145637 4100 int *exit_status) {
d35fbf6b 4101
8c35c10d 4102 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 4103 int r, ngids = 0, exec_fd;
4d885bd3
DH
4104 _cleanup_free_ gid_t *supplementary_gids = NULL;
4105 const char *username = NULL, *groupname = NULL;
5686391b 4106 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 4107 const char *home = NULL, *shell = NULL;
7ca69792 4108 char **final_argv = NULL;
7bce046b
LP
4109 dev_t journal_stream_dev = 0;
4110 ino_t journal_stream_ino = 0;
5749f855 4111 bool userns_set_up = false;
165a31c0
LP
4112 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4113 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4114 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4115 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 4116#if HAVE_SELINUX
7f59dd35 4117 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 4118 bool use_selinux = false;
ecfbc84f 4119#endif
f9fa32f0 4120#if ENABLE_SMACK
43b1f709 4121 bool use_smack = false;
ecfbc84f 4122#endif
349cc4a5 4123#if HAVE_APPARMOR
43b1f709 4124 bool use_apparmor = false;
ecfbc84f 4125#endif
5749f855
AZ
4126 uid_t saved_uid = getuid();
4127 gid_t saved_gid = getgid();
fed1e721
LP
4128 uid_t uid = UID_INVALID;
4129 gid_t gid = GID_INVALID;
1da37e58
ZJS
4130 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4131 n_keep_fds; /* total number of fds not to close */
165a31c0 4132 int secure_bits;
afb11bf1
DG
4133 _cleanup_free_ gid_t *gids_after_pam = NULL;
4134 int ngids_after_pam = 0;
034c6ed7 4135
f2341e0a 4136 assert(unit);
5cb5a6ff
LP
4137 assert(command);
4138 assert(context);
d35fbf6b 4139 assert(params);
ff0af2a1 4140 assert(exit_status);
d35fbf6b 4141
69339ae9
LP
4142 /* Explicitly test for CVE-2021-4034 inspired invocations */
4143 assert(command->path);
4144 assert(!strv_isempty(command->argv));
4145
d35fbf6b
DM
4146 rename_process_from_path(command->path);
4147
9c274488
LP
4148 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4149 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4150 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4151 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4152 SIGNALS_IGNORE);
d35fbf6b
DM
4153
4154 if (context->ignore_sigpipe)
9c274488 4155 (void) ignore_signals(SIGPIPE);
d35fbf6b 4156
ff0af2a1
LP
4157 r = reset_signal_mask();
4158 if (r < 0) {
4159 *exit_status = EXIT_SIGNAL_MASK;
12145637 4160 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4161 }
034c6ed7 4162
d35fbf6b
DM
4163 if (params->idle_pipe)
4164 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4165
2c027c62
LP
4166 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4167 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4168 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4169 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4170
d35fbf6b 4171 log_forget_fds();
2c027c62 4172 log_set_open_when_needed(true);
4f2d528d 4173
40a80078
LP
4174 /* In case anything used libc syslog(), close this here, too */
4175 closelog();
4176
b1994387 4177 int keep_fds[n_fds + 3];
1da37e58
ZJS
4178 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4179 n_keep_fds = n_fds;
4180
4181 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4182 if (r < 0) {
4183 *exit_status = EXIT_FDS;
4184 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4185 }
4186
b1994387 4187#if HAVE_LIBBPF
46004616
ZJS
4188 if (unit->manager->restrict_fs) {
4189 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4190 if (bpf_map_fd < 0) {
4191 *exit_status = EXIT_FDS;
46004616 4192 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4193 }
4194
4195 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4196 if (r < 0) {
4197 *exit_status = EXIT_FDS;
4198 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4199 }
4200 }
4201#endif
4202
1da37e58 4203 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4204 if (r < 0) {
4205 *exit_status = EXIT_FDS;
12145637 4206 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4207 }
4208
0af07108
ZJS
4209 if (!context->same_pgrp &&
4210 setsid() < 0) {
4211 *exit_status = EXIT_SETSID;
4212 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4213 }
9e2f7c11 4214
1e22b5cd 4215 exec_context_tty_reset(context, params);
d35fbf6b 4216
c891efaf 4217 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4218 _cleanup_free_ char *cmdline = NULL;
4219
4ef15008 4220 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4221 if (!cmdline) {
0460aa5c 4222 *exit_status = EXIT_MEMORY;
12145637 4223 return log_oom();
3b20f877 4224 }
d35fbf6b 4225
4ef15008 4226 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4227 if (r != CONFIRM_EXECUTE) {
4228 if (r == CONFIRM_PRETEND_SUCCESS) {
4229 *exit_status = EXIT_SUCCESS;
4230 return 0;
4231 }
ff0af2a1 4232 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4233 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4234 "Execution cancelled by the user");
d35fbf6b
DM
4235 }
4236 }
1a63a750 4237
d521916d
LP
4238 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4239 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4240 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4241 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4242 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4243 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4244 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4245 *exit_status = EXIT_MEMORY;
4246 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4247 }
4248
29206d46 4249 if (context->dynamic_user && dcreds) {
da50b85a 4250 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4251
d521916d 4252 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4253 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4254 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4255 *exit_status = EXIT_USER;
12145637 4256 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4257 }
4258
da50b85a
LP
4259 r = compile_suggested_paths(context, params, &suggested_paths);
4260 if (r < 0) {
4261 *exit_status = EXIT_MEMORY;
4262 return log_oom();
4263 }
4264
4265 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4266 if (r < 0) {
4267 *exit_status = EXIT_USER;
d85ff944
YW
4268 if (r == -EILSEQ)
4269 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4270 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4271 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4272 }
524daa8c 4273
70dd455c 4274 if (!uid_is_valid(uid)) {
29206d46 4275 *exit_status = EXIT_USER;
d85ff944 4276 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4277 }
4278
4279 if (!gid_is_valid(gid)) {
4280 *exit_status = EXIT_USER;
d85ff944 4281 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4282 }
5bc7452b 4283
29206d46
LP
4284 if (dcreds->user)
4285 username = dcreds->user->name;
4286
4287 } else {
4d885bd3
DH
4288 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4289 if (r < 0) {
4290 *exit_status = EXIT_USER;
12145637 4291 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4292 }
5bc7452b 4293
4d885bd3
DH
4294 r = get_fixed_group(context, &groupname, &gid);
4295 if (r < 0) {
4296 *exit_status = EXIT_GROUP;
12145637 4297 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4298 }
cdc5d5c5 4299 }
29206d46 4300
cdc5d5c5
DH
4301 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4302 r = get_supplementary_groups(context, username, groupname, gid,
4303 &supplementary_gids, &ngids);
4304 if (r < 0) {
4305 *exit_status = EXIT_GROUP;
12145637 4306 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4307 }
5bc7452b 4308
00d9ef85
LP
4309 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4310 if (r < 0) {
4311 *exit_status = EXIT_USER;
12145637 4312 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4313 }
4314
4315 user_lookup_fd = safe_close(user_lookup_fd);
4316
6732edab
LP
4317 r = acquire_home(context, uid, &home, &home_buffer);
4318 if (r < 0) {
4319 *exit_status = EXIT_CHDIR;
12145637 4320 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4321 }
4322
d35fbf6b
DM
4323 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4324 * must be sure to drop O_NONBLOCK */
4325 if (socket_fd >= 0)
a34ceba6 4326 (void) fd_nonblock(socket_fd, false);
acbb0225 4327
4c70a4a7
MS
4328 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4329 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4330 if (params->cgroup_path) {
4331 _cleanup_free_ char *p = NULL;
4332
4333 r = exec_parameters_get_cgroup_path(params, &p);
4334 if (r < 0) {
4335 *exit_status = EXIT_CGROUP;
4336 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4337 }
4338
4339 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4340 if (r == -EUCLEAN) {
4341 *exit_status = EXIT_CGROUP;
4342 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4343 "because the cgroup or one of its parents or "
4344 "siblings is in the threaded mode: %m", p);
4345 }
4c70a4a7
MS
4346 if (r < 0) {
4347 *exit_status = EXIT_CGROUP;
4348 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4349 }
4350 }
4351
a8d08f39 4352 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4353 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4354 if (r < 0) {
4355 *exit_status = EXIT_NETWORK;
4356 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4357 }
4358 }
4359
a70581ff
XR
4360 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4361 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4362 if (r < 0) {
4363 *exit_status = EXIT_NAMESPACE;
4364 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4365 }
4366 }
4367
52c239d7 4368 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4369 if (r < 0) {
4370 *exit_status = EXIT_STDIN;
12145637 4371 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4372 }
034c6ed7 4373
52c239d7 4374 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4375 if (r < 0) {
4376 *exit_status = EXIT_STDOUT;
12145637 4377 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4378 }
4379
52c239d7 4380 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4381 if (r < 0) {
4382 *exit_status = EXIT_STDERR;
12145637 4383 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4384 }
4385
d35fbf6b 4386 if (context->oom_score_adjust_set) {
9f8168eb
LP
4387 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4388 * prohibit write access to this file, and we shouldn't trip up over that. */
4389 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4390 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4391 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4392 else if (r < 0) {
ff0af2a1 4393 *exit_status = EXIT_OOM_ADJUST;
12145637 4394 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4395 }
d35fbf6b
DM
4396 }
4397
ad21e542
ZJS
4398 if (context->coredump_filter_set) {
4399 r = set_coredump_filter(context->coredump_filter);
4400 if (ERRNO_IS_PRIVILEGE(r))
4401 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4402 else if (r < 0)
4403 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4404 }
4405
39090201
DJL
4406 if (context->nice_set) {
4407 r = setpriority_closest(context->nice);
4408 if (r < 0)
4409 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4410 }
613b411c 4411
d35fbf6b
DM
4412 if (context->cpu_sched_set) {
4413 struct sched_param param = {
4414 .sched_priority = context->cpu_sched_priority,
4415 };
4416
ff0af2a1
LP
4417 r = sched_setscheduler(0,
4418 context->cpu_sched_policy |
4419 (context->cpu_sched_reset_on_fork ?
4420 SCHED_RESET_ON_FORK : 0),
4421 &param);
4422 if (r < 0) {
4423 *exit_status = EXIT_SETSCHEDULER;
12145637 4424 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4425 }
d35fbf6b 4426 }
fc9b2a84 4427
e2b2fb7f
MS
4428 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4429 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4430 const CPUSet *cpu_set;
4431
4432 if (context->cpu_affinity_from_numa) {
4433 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4434 if (r < 0) {
4435 *exit_status = EXIT_CPUAFFINITY;
4436 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4437 }
4438
4439 cpu_set = &converted_cpu_set;
4440 } else
4441 cpu_set = &context->cpu_set;
4442
4443 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4444 *exit_status = EXIT_CPUAFFINITY;
12145637 4445 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4446 }
e2b2fb7f 4447 }
034c6ed7 4448
b070c7c0
MS
4449 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4450 r = apply_numa_policy(&context->numa_policy);
4451 if (r == -EOPNOTSUPP)
33fe9e3f 4452 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4453 else if (r < 0) {
4454 *exit_status = EXIT_NUMA_POLICY;
4455 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4456 }
4457 }
4458
d35fbf6b
DM
4459 if (context->ioprio_set)
4460 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4461 *exit_status = EXIT_IOPRIO;
12145637 4462 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4463 }
da726a4d 4464
d35fbf6b
DM
4465 if (context->timer_slack_nsec != NSEC_INFINITY)
4466 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4467 *exit_status = EXIT_TIMERSLACK;
12145637 4468 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4469 }
9eba9da4 4470
21022b9d
LP
4471 if (context->personality != PERSONALITY_INVALID) {
4472 r = safe_personality(context->personality);
4473 if (r < 0) {
ff0af2a1 4474 *exit_status = EXIT_PERSONALITY;
12145637 4475 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4476 }
21022b9d 4477 }
94f04347 4478
33331d11
VB
4479 if (context->utmp_id) {
4480 const char *line = context->tty_path ?
4481 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4482 NULL;
df0ff127 4483 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4484 line,
023a4f67
LP
4485 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4486 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4487 USER_PROCESS,
6a93917d 4488 username);
33331d11 4489 }
d35fbf6b 4490
08f67696 4491 if (uid_is_valid(uid)) {
ff0af2a1
LP
4492 r = chown_terminal(STDIN_FILENO, uid);
4493 if (r < 0) {
4494 *exit_status = EXIT_STDIN;
12145637 4495 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4496 }
d35fbf6b 4497 }
8e274523 4498
4e1dfa45 4499 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4500 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4501 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4502 * touch a single hierarchy too. */
584b8688 4503 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4504 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4505 if (r < 0) {
4506 *exit_status = EXIT_CGROUP;
12145637 4507 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4508 }
d35fbf6b 4509 }
034c6ed7 4510
211a3d87
LB
4511 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4512
5b10116e 4513 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4514 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4515 if (r < 0)
4516 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4517 }
94f04347 4518
bb0c0d6f
LP
4519 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4520 r = setup_credentials(context, params, unit->id, uid);
4521 if (r < 0) {
4522 *exit_status = EXIT_CREDENTIALS;
4523 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4524 }
4525 }
4526
7bce046b 4527 r = build_environment(
fd63e712 4528 unit,
7bce046b
LP
4529 context,
4530 params,
4531 n_fds,
4532 home,
4533 username,
4534 shell,
4535 journal_stream_dev,
4536 journal_stream_ino,
4537 &our_env);
2065ca69
JW
4538 if (r < 0) {
4539 *exit_status = EXIT_MEMORY;
12145637 4540 return log_oom();
2065ca69
JW
4541 }
4542
4543 r = build_pass_environment(context, &pass_env);
4544 if (r < 0) {
4545 *exit_status = EXIT_MEMORY;
12145637 4546 return log_oom();
2065ca69
JW
4547 }
4548
adf769b0
ZJS
4549 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4550 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4551 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4552 if (!strv_isempty(context->exec_search_path)) {
4553 _cleanup_free_ char *joined = NULL;
4554
4555 joined = strv_join(context->exec_search_path, ":");
4556 if (!joined) {
4557 *exit_status = EXIT_MEMORY;
4558 return log_oom();
4559 }
4560
4561 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4562 if (r < 0) {
4563 *exit_status = EXIT_MEMORY;
4564 return log_oom();
4565 }
4566 }
4567
4ab3d29f 4568 accum_env = strv_env_merge(params->environment,
2065ca69 4569 our_env,
8c35c10d 4570 joined_exec_search_path,
2065ca69
JW
4571 pass_env,
4572 context->environment,
44e5d006 4573 files_env);
2065ca69
JW
4574 if (!accum_env) {
4575 *exit_status = EXIT_MEMORY;
12145637 4576 return log_oom();
2065ca69 4577 }
1280503b 4578 accum_env = strv_env_clean(accum_env);
2065ca69 4579
096424d1 4580 (void) umask(context->umask);
b213e1c1 4581
b1edf445 4582 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4583 if (r < 0) {
4584 *exit_status = EXIT_KEYRING;
12145637 4585 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4586 }
4587
adf769b0
ZJS
4588 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4589 * from it. */
1703fa41 4590 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4591
adf769b0
ZJS
4592 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4593 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4594 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4595
adf769b0
ZJS
4596 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4597 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4598 * desired. */
165a31c0
LP
4599 if (needs_ambient_hack)
4600 needs_setuid = false;
4601 else
4602 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4603
4604 if (needs_sandboxing) {
adf769b0
ZJS
4605 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4606 * /sys being present. The actual MAC context application will happen later, as late as
4607 * possible, to avoid impacting our own code paths. */
7f18ef0a 4608
349cc4a5 4609#if HAVE_SELINUX
43b1f709 4610 use_selinux = mac_selinux_use();
7f18ef0a 4611#endif
f9fa32f0 4612#if ENABLE_SMACK
43b1f709 4613 use_smack = mac_smack_use();
7f18ef0a 4614#endif
349cc4a5 4615#if HAVE_APPARMOR
43b1f709 4616 use_apparmor = mac_apparmor_use();
7f18ef0a 4617#endif
165a31c0 4618 }
7f18ef0a 4619
ce932d2d
LP
4620 if (needs_sandboxing) {
4621 int which_failed;
4622
4623 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4624 * is set here. (See below.) */
4625
4626 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4627 if (r < 0) {
4628 *exit_status = EXIT_LIMITS;
4629 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4630 }
4631 }
4632
0af07108 4633 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4634 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4635 * wins here. (See above.) */
4636
1da37e58 4637 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4638 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4639 if (r < 0) {
4640 *exit_status = EXIT_PAM;
4641 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4642 }
ac45f971 4643
0af07108
ZJS
4644 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4645 if (ngids_after_pam < 0) {
4646 *exit_status = EXIT_MEMORY;
4647 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4648 }
b213e1c1 4649 }
5749f855 4650
0af07108 4651 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4652 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4653 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4654 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4655
4656 userns_set_up = true;
4657 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4658 if (r < 0) {
4659 *exit_status = EXIT_USER;
4660 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4661 }
4662 }
4663
a8d08f39
LP
4664 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4665
6e2d7c4f 4666 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4667 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4668 if (r == -EPERM)
4669 log_unit_warning_errno(unit, r,
4670 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4671 else if (r < 0) {
6e2d7c4f
MS
4672 *exit_status = EXIT_NETWORK;
4673 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4674 }
a8d08f39
LP
4675 } else if (context->network_namespace_path) {
4676 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4677 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4678 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4679 } else
4680 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4681 }
169c1bda 4682
a70581ff
XR
4683 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4684
4685 if (ns_type_supported(NAMESPACE_IPC)) {
4686 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4687 if (r == -EPERM)
4688 log_unit_warning_errno(unit, r,
4689 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4690 else if (r < 0) {
4691 *exit_status = EXIT_NAMESPACE;
4692 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4693 }
4694 } else if (context->ipc_namespace_path) {
4695 *exit_status = EXIT_NAMESPACE;
4696 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4697 "IPCNamespacePath= is not supported, refusing.");
4698 } else
4699 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4700 }
4701
ee818b89 4702 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4703 _cleanup_free_ char *error_path = NULL;
4704
9f71ba8d 4705 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4706 if (r < 0) {
4707 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4708 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4709 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4710 }
d35fbf6b 4711 }
81a2b7ce 4712
daf8f72b
LP
4713 if (needs_sandboxing) {
4714 r = apply_protect_hostname(unit, context, exit_status);
4715 if (r < 0)
4716 return r;
aecd5ac6
TM
4717 }
4718
5749f855
AZ
4719 /* Drop groups as early as possible.
4720 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4721 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4722 if (needs_setuid) {
afb11bf1
DG
4723 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4724 int ngids_to_enforce = 0;
4725
4726 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4727 ngids,
4728 gids_after_pam,
4729 ngids_after_pam,
4730 &gids_to_enforce);
4731 if (ngids_to_enforce < 0) {
4732 *exit_status = EXIT_MEMORY;
4733 return log_unit_error_errno(unit,
4734 ngids_to_enforce,
4735 "Failed to merge group lists. Group membership might be incorrect: %m");
4736 }
4737
4738 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4739 if (r < 0) {
4740 *exit_status = EXIT_GROUP;
12145637 4741 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4742 }
165a31c0 4743 }
096424d1 4744
5749f855
AZ
4745 /* If the user namespace was not set up above, try to do it now.
4746 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4747 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4748 * case of mount namespaces being less privileged when the mount point list is copied from a
4749 * different user namespace). */
9008e1ac 4750
5749f855
AZ
4751 if (needs_sandboxing && context->private_users && !userns_set_up) {
4752 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4753 if (r < 0) {
4754 *exit_status = EXIT_USER;
4755 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4756 }
4757 }
4758
9f71ba8d
ZJS
4759 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4760 * shall execute. */
4761
4762 _cleanup_free_ char *executable = NULL;
b83d5050 4763 _cleanup_close_ int executable_fd = -1;
8c35c10d 4764 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4765 if (r < 0) {
4766 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4767 log_unit_struct_errno(unit, LOG_INFO, r,
4768 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4769 LOG_UNIT_INVOCATION_ID(unit),
4770 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4771 command->path),
4772 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4773 return 0;
4774 }
4775
4776 *exit_status = EXIT_EXEC;
c2503e35
RH
4777
4778 return log_unit_struct_errno(unit, LOG_INFO, r,
4779 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4780 LOG_UNIT_INVOCATION_ID(unit),
4781 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4782 command->path),
4783 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4784 }
4785
b83d5050
ZJS
4786 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4787 if (r < 0) {
4788 *exit_status = EXIT_FDS;
4789 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4790 }
4791
9f71ba8d 4792#if HAVE_SELINUX
49590d67
MS
4793 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4794 int fd = -1;
4795
4796 if (socket_fd >= 0)
4797 fd = socket_fd;
4798 else if (params->n_socket_fds == 1)
4799 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4800 * use context from that fd to compute the label. */
4801 fd = params->fds[0];
4802
4803 if (fd >= 0) {
4804 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4805 if (r < 0) {
4806 if (!context->selinux_context_ignore) {
4807 *exit_status = EXIT_SELINUX_CONTEXT;
4808 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4809 }
4810 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4811 }
9f71ba8d
ZJS
4812 }
4813 }
4814#endif
4815
165a31c0 4816 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4817 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4818 * however if we have it as we want to keep it open until the final execve(). */
4819
1da37e58 4820 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4821 if (r >= 0)
4822 r = shift_fds(fds, n_fds);
4823 if (r >= 0)
25b583d7 4824 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4825 if (r < 0) {
4826 *exit_status = EXIT_FDS;
12145637 4827 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4828 }
e66cf1a3 4829
5686391b
LP
4830 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4831 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4832 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4833 * came this far. */
4834
165a31c0 4835 secure_bits = context->secure_bits;
e66cf1a3 4836
165a31c0
LP
4837 if (needs_sandboxing) {
4838 uint64_t bset;
e66cf1a3 4839
ce932d2d
LP
4840 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4841 * requested. (Note this is placed after the general resource limit initialization, see
4842 * above, in order to take precedence.) */
f4170c67
LP
4843 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4844 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4845 *exit_status = EXIT_LIMITS;
12145637 4846 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4847 }
4848 }
4849
37ac2744
JB
4850#if ENABLE_SMACK
4851 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4852 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4853 if (use_smack) {
b83d5050 4854 r = setup_smack(context, executable_fd);
29ff6247 4855 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4856 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4857 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4858 }
4859 }
4860#endif
4861
165a31c0
LP
4862 bset = context->capability_bounding_set;
4863 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4864 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4865 * instead of us doing that */
4866 if (needs_ambient_hack)
4867 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4868 (UINT64_C(1) << CAP_SETUID) |
4869 (UINT64_C(1) << CAP_SETGID);
4870
4871 if (!cap_test_all(bset)) {
4872 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4873 if (r < 0) {
4874 *exit_status = EXIT_CAPABILITIES;
12145637 4875 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4876 }
4c2630eb 4877 }
3b8bddde 4878
16fcb191
TK
4879 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4880 * keep-caps set.
4881 * To be able to raise the ambient capabilities after setresuid() they have to be
4882 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4883 * After setresuid() the ambient capabilities can be raised as they are present in
4884 * the permitted and inheritable set. However it is possible that someone wants to
4885 * set ambient capabilities without changing the user, so we also set the ambient
4886 * capabilities here.
4887 * The requested ambient capabilities are raised in the inheritable set if the
4888 * second argument is true. */
943800f4 4889 if (!needs_ambient_hack) {
755d4b67
IP
4890 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4891 if (r < 0) {
4892 *exit_status = EXIT_CAPABILITIES;
12145637 4893 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4894 }
755d4b67 4895 }
165a31c0 4896 }
755d4b67 4897
fa97f630
JB
4898 /* chroot to root directory first, before we lose the ability to chroot */
4899 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4900 if (r < 0)
4901 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4902
165a31c0 4903 if (needs_setuid) {
08f67696 4904 if (uid_is_valid(uid)) {
ff0af2a1
LP
4905 r = enforce_user(context, uid);
4906 if (r < 0) {
4907 *exit_status = EXIT_USER;
12145637 4908 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4909 }
165a31c0
LP
4910
4911 if (!needs_ambient_hack &&
4912 context->capability_ambient_set != 0) {
755d4b67 4913
16fcb191 4914 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4915 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4916 if (r < 0) {
4917 *exit_status = EXIT_CAPABILITIES;
12145637 4918 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4919 }
755d4b67 4920 }
5b6319dc 4921 }
165a31c0 4922 }
d35fbf6b 4923
56ef8db9
JB
4924 /* Apply working directory here, because the working directory might be on NFS and only the user running
4925 * this service might have the correct privilege to change to the working directory */
fa97f630 4926 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4927 if (r < 0)
4928 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4929
165a31c0 4930 if (needs_sandboxing) {
37ac2744 4931 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4932 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4933 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4934 * are restricted. */
4935
349cc4a5 4936#if HAVE_SELINUX
43b1f709 4937 if (use_selinux) {
5cd9cd35
LP
4938 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4939
4940 if (exec_context) {
4941 r = setexeccon(exec_context);
006d1864
TM
4942 if (r < 0) {
4943 if (!context->selinux_context_ignore) {
4944 *exit_status = EXIT_SELINUX_CONTEXT;
4945 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4946 }
4947 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4948 }
4949 }
4950 }
4951#endif
4952
349cc4a5 4953#if HAVE_APPARMOR
43b1f709 4954 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4955 r = aa_change_onexec(context->apparmor_profile);
4956 if (r < 0 && !context->apparmor_profile_ignore) {
4957 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4958 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4959 }
4960 }
4961#endif
4962
165a31c0 4963 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4964 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4965 * CAP_SETPCAP. */
4966 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4967 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4968 * effective set here.
4969 * The effective set is overwritten during execve with the following values:
4970 * - ambient set (for non-root processes)
4971 * - (inheritable | bounding) set for root processes)
4972 *
4973 * Hence there is no security impact to raise it in the effective set before execve
4974 */
4975 r = capability_gain_cap_setpcap(NULL);
4976 if (r < 0) {
4977 *exit_status = EXIT_CAPABILITIES;
4978 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4979 }
755d4b67 4980 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4981 *exit_status = EXIT_SECUREBITS;
12145637 4982 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4983 }
dbdc4098 4984 }
5b6319dc 4985
59eeb84b 4986 if (context_has_no_new_privileges(context))
d35fbf6b 4987 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4988 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4989 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4990 }
4991
349cc4a5 4992#if HAVE_SECCOMP
469830d1
LP
4993 r = apply_address_families(unit, context);
4994 if (r < 0) {
4995 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4996 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4997 }
04aa0cb9 4998
469830d1
LP
4999 r = apply_memory_deny_write_execute(unit, context);
5000 if (r < 0) {
5001 *exit_status = EXIT_SECCOMP;
12145637 5002 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5003 }
f4170c67 5004
469830d1
LP
5005 r = apply_restrict_realtime(unit, context);
5006 if (r < 0) {
5007 *exit_status = EXIT_SECCOMP;
12145637 5008 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5009 }
5010
f69567cb
LP
5011 r = apply_restrict_suid_sgid(unit, context);
5012 if (r < 0) {
5013 *exit_status = EXIT_SECCOMP;
5014 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5015 }
5016
add00535
LP
5017 r = apply_restrict_namespaces(unit, context);
5018 if (r < 0) {
5019 *exit_status = EXIT_SECCOMP;
12145637 5020 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5021 }
5022
469830d1
LP
5023 r = apply_protect_sysctl(unit, context);
5024 if (r < 0) {
5025 *exit_status = EXIT_SECCOMP;
12145637 5026 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5027 }
5028
469830d1
LP
5029 r = apply_protect_kernel_modules(unit, context);
5030 if (r < 0) {
5031 *exit_status = EXIT_SECCOMP;
12145637 5032 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5033 }
5034
84703040
KK
5035 r = apply_protect_kernel_logs(unit, context);
5036 if (r < 0) {
5037 *exit_status = EXIT_SECCOMP;
5038 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5039 }
5040
fc64760d
KK
5041 r = apply_protect_clock(unit, context);
5042 if (r < 0) {
5043 *exit_status = EXIT_SECCOMP;
5044 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5045 }
5046
469830d1
LP
5047 r = apply_private_devices(unit, context);
5048 if (r < 0) {
5049 *exit_status = EXIT_SECCOMP;
12145637 5050 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5051 }
5052
5053 r = apply_syscall_archs(unit, context);
5054 if (r < 0) {
5055 *exit_status = EXIT_SECCOMP;
12145637 5056 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5057 }
5058
78e864e5
TM
5059 r = apply_lock_personality(unit, context);
5060 if (r < 0) {
5061 *exit_status = EXIT_SECCOMP;
12145637 5062 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5063 }
5064
9df2cdd8
TM
5065 r = apply_syscall_log(unit, context);
5066 if (r < 0) {
5067 *exit_status = EXIT_SECCOMP;
5068 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5069 }
5070
5cd9cd35
LP
5071 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5072 * by the filter as little as possible. */
165a31c0 5073 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5074 if (r < 0) {
5075 *exit_status = EXIT_SECCOMP;
12145637 5076 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5077 }
5078#endif
b1994387
ILG
5079
5080#if HAVE_LIBBPF
5081 r = apply_restrict_filesystems(unit, context);
5082 if (r < 0) {
5083 *exit_status = EXIT_BPF;
5084 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5085 }
5086#endif
5087
d35fbf6b 5088 }
034c6ed7 5089
00819cc1
LP
5090 if (!strv_isempty(context->unset_environment)) {
5091 char **ee = NULL;
5092
5093 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5094 if (!ee) {
5095 *exit_status = EXIT_MEMORY;
12145637 5096 return log_oom();
00819cc1
LP
5097 }
5098
130d3d22 5099 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5100 }
5101
7ca69792
AZ
5102 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5103 replaced_argv = replace_env_argv(command->argv, accum_env);
5104 if (!replaced_argv) {
5105 *exit_status = EXIT_MEMORY;
5106 return log_oom();
5107 }
5108 final_argv = replaced_argv;
5109 } else
5110 final_argv = command->argv;
034c6ed7 5111
f1d34068 5112 if (DEBUG_LOGGING) {
c2b2df60 5113 _cleanup_free_ char *line = NULL;
81a2b7ce 5114
4ef15008 5115 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
5116 if (!line) {
5117 *exit_status = EXIT_MEMORY;
5118 return log_oom();
5119 }
5120
5121 log_unit_struct(unit, LOG_DEBUG,
5122 "EXECUTABLE=%s", executable,
5123 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 5124 }
dd305ec9 5125
5686391b
LP
5126 if (exec_fd >= 0) {
5127 uint8_t hot = 1;
5128
5129 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5130 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5131
5132 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5133 *exit_status = EXIT_EXEC;
5134 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5135 }
5136 }
5137
a6d9111c 5138 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5139
5140 if (exec_fd >= 0) {
5141 uint8_t hot = 0;
5142
5143 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5144 * that POLLHUP on it no longer means execve() succeeded. */
5145
5146 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5147 *exit_status = EXIT_EXEC;
5148 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5149 }
5150 }
12145637 5151
ff0af2a1 5152 *exit_status = EXIT_EXEC;
9f71ba8d 5153 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5154}
81a2b7ce 5155
34cf6c43 5156static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5157static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5158
f2341e0a
LP
5159int exec_spawn(Unit *unit,
5160 ExecCommand *command,
d35fbf6b
DM
5161 const ExecContext *context,
5162 const ExecParameters *params,
5163 ExecRuntime *runtime,
29206d46 5164 DynamicCreds *dcreds,
d35fbf6b 5165 pid_t *ret) {
8351ceae 5166
ee39ca20 5167 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5168 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5169 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5170 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 5171 _cleanup_free_ char *line = NULL;
d35fbf6b 5172 pid_t pid;
8351ceae 5173
f2341e0a 5174 assert(unit);
d35fbf6b
DM
5175 assert(command);
5176 assert(context);
5177 assert(ret);
5178 assert(params);
25b583d7 5179 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5180
d35fbf6b
DM
5181 if (context->std_input == EXEC_INPUT_SOCKET ||
5182 context->std_output == EXEC_OUTPUT_SOCKET ||
5183 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5184
d85ff944
YW
5185 if (params->n_socket_fds > 1)
5186 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5187
d85ff944
YW
5188 if (params->n_socket_fds == 0)
5189 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5190
d35fbf6b
DM
5191 socket_fd = params->fds[0];
5192 } else {
5193 socket_fd = -1;
5194 fds = params->fds;
9b141911 5195 n_socket_fds = params->n_socket_fds;
25b583d7 5196 n_storage_fds = params->n_storage_fds;
d35fbf6b 5197 }
94f04347 5198
34cf6c43 5199 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5200 if (r < 0)
5201 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5202
f2341e0a 5203 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5204 if (r < 0)
f2341e0a 5205 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5206
4ef15008 5207 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
5208 if (!line)
5209 return log_oom();
fab56fc5 5210
9f71ba8d
ZJS
5211 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5212 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5213 mac_selinux_maybe_reload();
5214
c2503e35
RH
5215 log_unit_struct(unit, LOG_DEBUG,
5216 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5217 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5218 the mount namespace in the child, but we want to log
5219 from the parent, so we need to use the (possibly
5220 inaccurate) path here. */
5221 LOG_UNIT_INVOCATION_ID(unit));
12145637 5222
78f93209
LP
5223 if (params->cgroup_path) {
5224 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5225 if (r < 0)
5226 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5227 if (r > 0) { /* We are using a child cgroup */
5228 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5229 if (r < 0)
5230 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
5231
5232 /* Normally we would not propagate the oomd xattrs to children but since we created this
5233 * sub-cgroup internally we should do it. */
5234 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
5235 }
5236 }
5237
d35fbf6b
DM
5238 pid = fork();
5239 if (pid < 0)
74129a12 5240 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5241
5242 if (pid == 0) {
12145637 5243 int exit_status = EXIT_SUCCESS;
ff0af2a1 5244
f2341e0a
LP
5245 r = exec_child(unit,
5246 command,
ff0af2a1
LP
5247 context,
5248 params,
5249 runtime,
29206d46 5250 dcreds,
ff0af2a1 5251 socket_fd,
52c239d7 5252 named_iofds,
4c47affc 5253 fds,
9b141911 5254 n_socket_fds,
25b583d7 5255 n_storage_fds,
ff0af2a1 5256 files_env,
00d9ef85 5257 unit->manager->user_lookup_fds[1],
12145637
LP
5258 &exit_status);
5259
e1714f02
ZJS
5260 if (r < 0) {
5261 const char *status =
5262 exit_status_to_string(exit_status,
e04ed6db 5263 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5264
c2503e35
RH
5265 log_unit_struct_errno(unit, LOG_ERR, r,
5266 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5267 LOG_UNIT_INVOCATION_ID(unit),
5268 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5269 status, command->path),
5270 "EXECUTABLE=%s", command->path);
e1714f02 5271 }
4c2630eb 5272
ff0af2a1 5273 _exit(exit_status);
034c6ed7
LP
5274 }
5275
f2341e0a 5276 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5277
78f93209
LP
5278 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5279 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5280 * process will be killed too). */
5281 if (subcgroup_path)
5282 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5283
b58b4116 5284 exec_status_start(&command->exec_status, pid);
9fb86720 5285
034c6ed7 5286 *ret = pid;
5cb5a6ff
LP
5287 return 0;
5288}
5289
034c6ed7
LP
5290void exec_context_init(ExecContext *c) {
5291 assert(c);
5292
4c12626c 5293 c->umask = 0022;
0692548c 5294 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5295 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5296 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5297 c->syslog_level_prefix = true;
353e12c2 5298 c->ignore_sigpipe = true;
3a43da28 5299 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5300 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5301 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5302 c->directories[t].mode = 0755;
12213aed 5303 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5304 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
5305 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5306 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5307 c->log_level_max = -1;
005bfaf1
TM
5308#if HAVE_SECCOMP
5309 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5310#endif
51462135
DDM
5311 c->tty_rows = UINT_MAX;
5312 c->tty_cols = UINT_MAX;
b070c7c0 5313 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5314}
5315
613b411c 5316void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5317 assert(c);
5318
6796073e
LP
5319 c->environment = strv_free(c->environment);
5320 c->environment_files = strv_free(c->environment_files);
b4c14404 5321 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5322 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5323
31ce987c 5324 rlimit_free_all(c->rlimit);
034c6ed7 5325
5b10116e 5326 for (size_t l = 0; l < 3; l++) {
52c239d7 5327 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5328 c->stdio_file[l] = mfree(c->stdio_file[l]);
5329 }
52c239d7 5330
a1e58e8e
LP
5331 c->working_directory = mfree(c->working_directory);
5332 c->root_directory = mfree(c->root_directory);
915e6d16 5333 c->root_image = mfree(c->root_image);
18d73705 5334 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5335 c->root_hash = mfree(c->root_hash);
5336 c->root_hash_size = 0;
5337 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5338 c->root_hash_sig = mfree(c->root_hash_sig);
5339 c->root_hash_sig_size = 0;
5340 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5341 c->root_verity = mfree(c->root_verity);
93f59701 5342 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5343 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5344 c->tty_path = mfree(c->tty_path);
5345 c->syslog_identifier = mfree(c->syslog_identifier);
5346 c->user = mfree(c->user);
5347 c->group = mfree(c->group);
034c6ed7 5348
6796073e 5349 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5350
a1e58e8e 5351 c->pam_name = mfree(c->pam_name);
5b6319dc 5352
2a624c36
AP
5353 c->read_only_paths = strv_free(c->read_only_paths);
5354 c->read_write_paths = strv_free(c->read_write_paths);
5355 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5356 c->exec_paths = strv_free(c->exec_paths);
5357 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5358 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5359
d2d6c096 5360 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5361 c->bind_mounts = NULL;
5362 c->n_bind_mounts = 0;
2abd4e38
YW
5363 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5364 c->temporary_filesystems = NULL;
5365 c->n_temporary_filesystems = 0;
b3d13314 5366 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5367
0985c7c4 5368 cpu_set_reset(&c->cpu_set);
b070c7c0 5369 numa_policy_reset(&c->numa_policy);
86a3475b 5370
a1e58e8e
LP
5371 c->utmp_id = mfree(c->utmp_id);
5372 c->selinux_context = mfree(c->selinux_context);
5373 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5374 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5375
b1994387
ILG
5376 c->restrict_filesystems = set_free(c->restrict_filesystems);
5377
8cfa775f 5378 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5379 c->syscall_archs = set_free(c->syscall_archs);
5380 c->address_families = set_free(c->address_families);
e66cf1a3 5381
5b10116e 5382 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5383 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5384
5385 c->log_level_max = -1;
5386
5387 exec_context_free_log_extra_fields(c);
08f3be7a 5388
5ac1530e
ZJS
5389 c->log_ratelimit_interval_usec = 0;
5390 c->log_ratelimit_burst = 0;
90fc172e 5391
08f3be7a
LP
5392 c->stdin_data = mfree(c->stdin_data);
5393 c->stdin_data_size = 0;
a8d08f39
LP
5394
5395 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5396 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5397
5398 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5399
43144be4 5400 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5401 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5402}
5403
34cf6c43 5404int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5405 assert(c);
5406
5407 if (!runtime_prefix)
5408 return 0;
5409
211a3d87 5410 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5411 _cleanup_free_ char *p = NULL;
e66cf1a3 5412
494d0247 5413 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5414 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5415 else
211a3d87 5416 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5417 if (!p)
5418 return -ENOMEM;
5419
7bc4bf4a
LP
5420 /* We execute this synchronously, since we need to be sure this is gone when we start the
5421 * service next. */
c6878637 5422 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5423
211a3d87
LB
5424 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5425 _cleanup_free_ char *symlink_abs = NULL;
5426
5427 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5428 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5429 else
5430 symlink_abs = path_join(runtime_prefix, *symlink);
5431 if (!symlink_abs)
5432 return -ENOMEM;
5433
5434 (void) unlink(symlink_abs);
5435 }
5436
e66cf1a3
LP
5437 }
5438
5439 return 0;
5cb5a6ff
LP
5440}
5441
bb0c0d6f
LP
5442int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5443 _cleanup_free_ char *p = NULL;
5444
5445 assert(c);
5446
5447 if (!runtime_prefix || !unit)
5448 return 0;
5449
5450 p = path_join(runtime_prefix, "credentials", unit);
5451 if (!p)
5452 return -ENOMEM;
5453
5454 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5455 * unmount it, and afterwards remove the mount point */
5456 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5457 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5458
5459 return 0;
5460}
5461
34cf6c43 5462static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5463 assert(c);
5464
a1e58e8e 5465 c->path = mfree(c->path);
6796073e 5466 c->argv = strv_free(c->argv);
43d0fcbd
LP
5467}
5468
da6053d0 5469void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5470 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5471 exec_command_done(c+i);
5472}
5473
f1acf85a 5474ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5475 ExecCommand *i;
5476
5477 while ((i = c)) {
71fda00f 5478 LIST_REMOVE(command, c, i);
43d0fcbd 5479 exec_command_done(i);
5cb5a6ff
LP
5480 free(i);
5481 }
f1acf85a
ZJS
5482
5483 return NULL;
5cb5a6ff
LP
5484}
5485
da6053d0 5486void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5487 for (size_t i = 0; i < n; i++)
f1acf85a 5488 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5489}
5490
6a1d4d9f 5491void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5492 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5493 exec_status_reset(&c[i].exec_status);
5494}
5495
5496void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5497 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5498 LIST_FOREACH(command, z, c[i])
5499 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5500}
5501
039f0e70 5502typedef struct InvalidEnvInfo {
34cf6c43 5503 const Unit *unit;
039f0e70
LP
5504 const char *path;
5505} InvalidEnvInfo;
5506
5507static void invalid_env(const char *p, void *userdata) {
5508 InvalidEnvInfo *info = userdata;
5509
f2341e0a 5510 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5511}
5512
52c239d7
LB
5513const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5514 assert(c);
5515
5516 switch (fd_index) {
5073ff6b 5517
52c239d7
LB
5518 case STDIN_FILENO:
5519 if (c->std_input != EXEC_INPUT_NAMED_FD)
5520 return NULL;
5073ff6b 5521
52c239d7 5522 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5523
52c239d7
LB
5524 case STDOUT_FILENO:
5525 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5526 return NULL;
5073ff6b 5527
52c239d7 5528 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5529
52c239d7
LB
5530 case STDERR_FILENO:
5531 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5532 return NULL;
5073ff6b 5533
52c239d7 5534 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5535
52c239d7
LB
5536 default:
5537 return NULL;
5538 }
5539}
5540
2caa38e9
LP
5541static int exec_context_named_iofds(
5542 const ExecContext *c,
5543 const ExecParameters *p,
5544 int named_iofds[static 3]) {
5545
5b10116e 5546 size_t targets;
56fbd561 5547 const char* stdio_fdname[3];
da6053d0 5548 size_t n_fds;
52c239d7
LB
5549
5550 assert(c);
5551 assert(p);
2caa38e9 5552 assert(named_iofds);
52c239d7
LB
5553
5554 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5555 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5556 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5557
5b10116e 5558 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5559 stdio_fdname[i] = exec_context_fdname(c, i);
5560
4c47affc
FB
5561 n_fds = p->n_storage_fds + p->n_socket_fds;
5562
5b10116e 5563 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5564 if (named_iofds[STDIN_FILENO] < 0 &&
5565 c->std_input == EXEC_INPUT_NAMED_FD &&
5566 stdio_fdname[STDIN_FILENO] &&
5567 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5568
52c239d7
LB
5569 named_iofds[STDIN_FILENO] = p->fds[i];
5570 targets--;
56fbd561
ZJS
5571
5572 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5573 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5574 stdio_fdname[STDOUT_FILENO] &&
5575 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5576
52c239d7
LB
5577 named_iofds[STDOUT_FILENO] = p->fds[i];
5578 targets--;
56fbd561
ZJS
5579
5580 } else if (named_iofds[STDERR_FILENO] < 0 &&
5581 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5582 stdio_fdname[STDERR_FILENO] &&
5583 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5584
52c239d7
LB
5585 named_iofds[STDERR_FILENO] = p->fds[i];
5586 targets--;
5587 }
5588
56fbd561 5589 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5590}
5591
398a5009
ZJS
5592static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5593 _cleanup_strv_free_ char **v = NULL;
398a5009 5594 int r;
8c7be95e
LP
5595
5596 assert(c);
398a5009 5597 assert(ret);
8c7be95e
LP
5598
5599 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5600 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5601 bool ignore = false;
5602 char *fn = *i;
8c7be95e
LP
5603
5604 if (fn[0] == '-') {
5605 ignore = true;
313cefa1 5606 fn++;
8c7be95e
LP
5607 }
5608
5609 if (!path_is_absolute(fn)) {
8c7be95e
LP
5610 if (ignore)
5611 continue;
8c7be95e
LP
5612 return -EINVAL;
5613 }
5614
2bef10ab 5615 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5616 r = safe_glob(fn, 0, &pglob);
5617 if (r < 0) {
2bef10ab
PL
5618 if (ignore)
5619 continue;
398a5009 5620 return r;
2bef10ab 5621 }
8c7be95e 5622
d8c92e8b
ZJS
5623 /* When we don't match anything, -ENOENT should be returned */
5624 assert(pglob.gl_pathc > 0);
5625
5b10116e 5626 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5627 _cleanup_strv_free_ char **p = NULL;
5628
5629 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5630 if (r < 0) {
2bef10ab
PL
5631 if (ignore)
5632 continue;
398a5009 5633 return r;
e9c1ea9d 5634 }
398a5009 5635
ebc05a09 5636 /* Log invalid environment variables with filename */
039f0e70
LP
5637 if (p) {
5638 InvalidEnvInfo info = {
f2341e0a 5639 .unit = unit,
039f0e70
LP
5640 .path = pglob.gl_pathv[n]
5641 };
5642
5643 p = strv_env_clean_with_callback(p, invalid_env, &info);
5644 }
8c7be95e 5645
398a5009
ZJS
5646 if (!v)
5647 v = TAKE_PTR(p);
2bef10ab 5648 else {
398a5009 5649 char **m = strv_env_merge(v, p);
c84a9488 5650 if (!m)
2bef10ab 5651 return -ENOMEM;
2bef10ab 5652
398a5009 5653 strv_free_and_replace(v, m);
2bef10ab 5654 }
8c7be95e
LP
5655 }
5656 }
5657
398a5009 5658 *ret = TAKE_PTR(v);
8c7be95e
LP
5659
5660 return 0;
5661}
5662
6ac8fdc9 5663static bool tty_may_match_dev_console(const char *tty) {
7b912648 5664 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5665
1e22b5cd
LP
5666 if (!tty)
5667 return true;
5668
a119ec7c 5669 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5670
5671 /* trivial identity? */
5672 if (streq(tty, "console"))
5673 return true;
5674
7b912648
LP
5675 if (resolve_dev_console(&resolved) < 0)
5676 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5677
5678 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5679 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5680}
5681
6c0ae739
LP
5682static bool exec_context_may_touch_tty(const ExecContext *ec) {
5683 assert(ec);
1e22b5cd 5684
6c0ae739 5685 return ec->tty_reset ||
1e22b5cd
LP
5686 ec->tty_vhangup ||
5687 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5688 is_terminal_input(ec->std_input) ||
5689 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5690 is_terminal_output(ec->std_error);
5691}
5692
5693bool exec_context_may_touch_console(const ExecContext *ec) {
5694
5695 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5696 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5697}
5698
15ae422b 5699static void strv_fprintf(FILE *f, char **l) {
15ae422b
LP
5700 assert(f);
5701
5702 STRV_FOREACH(g, l)
5703 fprintf(f, " %s", *g);
5704}
5705
ddc155b2
TM
/* Writes one line of the form "<prefix><name>: item item ..." to the stream. Nothing at all is written
 * if the string vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5717
34cf6c43 5718void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5719 int r;
9eba9da4 5720
5cb5a6ff
LP
5721 assert(c);
5722 assert(f);
5723
4ad49000 5724 prefix = strempty(prefix);
5cb5a6ff
LP
5725
5726 fprintf(f,
94f04347
LP
5727 "%sUMask: %04o\n"
5728 "%sWorkingDirectory: %s\n"
451a074f 5729 "%sRootDirectory: %s\n"
15ae422b 5730 "%sNonBlocking: %s\n"
64747e2d 5731 "%sPrivateTmp: %s\n"
7f112f50 5732 "%sPrivateDevices: %s\n"
59eeb84b 5733 "%sProtectKernelTunables: %s\n"
e66a2f65 5734 "%sProtectKernelModules: %s\n"
84703040 5735 "%sProtectKernelLogs: %s\n"
fc64760d 5736 "%sProtectClock: %s\n"
59eeb84b 5737 "%sProtectControlGroups: %s\n"
d251207d
LP
5738 "%sPrivateNetwork: %s\n"
5739 "%sPrivateUsers: %s\n"
1b8689f9
LP
5740 "%sProtectHome: %s\n"
5741 "%sProtectSystem: %s\n"
5d997827 5742 "%sMountAPIVFS: %s\n"
f3e43635 5743 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5744 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5745 "%sRestrictRealtime: %s\n"
f69567cb 5746 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5747 "%sKeyringMode: %s\n"
4e399953
LP
5748 "%sProtectHostname: %s\n"
5749 "%sProtectProc: %s\n"
5750 "%sProcSubset: %s\n",
5cb5a6ff 5751 prefix, c->umask,
14eb3285
LP
5752 prefix, empty_to_root(c->working_directory),
5753 prefix, empty_to_root(c->root_directory),
15ae422b 5754 prefix, yes_no(c->non_blocking),
64747e2d 5755 prefix, yes_no(c->private_tmp),
7f112f50 5756 prefix, yes_no(c->private_devices),
59eeb84b 5757 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5758 prefix, yes_no(c->protect_kernel_modules),
84703040 5759 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5760 prefix, yes_no(c->protect_clock),
59eeb84b 5761 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5762 prefix, yes_no(c->private_network),
5763 prefix, yes_no(c->private_users),
1b8689f9
LP
5764 prefix, protect_home_to_string(c->protect_home),
5765 prefix, protect_system_to_string(c->protect_system),
5e98086d 5766 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5767 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5768 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5769 prefix, yes_no(c->restrict_realtime),
f69567cb 5770 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5771 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5772 prefix, yes_no(c->protect_hostname),
5773 prefix, protect_proc_to_string(c->protect_proc),
5774 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5775
915e6d16
LP
5776 if (c->root_image)
5777 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5778
18d73705 5779 if (c->root_image_options) {
18d73705
LB
5780 fprintf(f, "%sRootImageOptions:", prefix);
5781 LIST_FOREACH(mount_options, o, c->root_image_options)
5782 if (!isempty(o->options))
9ece6444
LB
5783 fprintf(f, " %s:%s",
5784 partition_designator_to_string(o->partition_designator),
5785 o->options);
18d73705
LB
5786 fprintf(f, "\n");
5787 }
5788
0389f4fa
LB
5789 if (c->root_hash) {
5790 _cleanup_free_ char *encoded = NULL;
5791 encoded = hexmem(c->root_hash, c->root_hash_size);
5792 if (encoded)
5793 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5794 }
5795
5796 if (c->root_hash_path)
5797 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5798
d4d55b0d
LB
5799 if (c->root_hash_sig) {
5800 _cleanup_free_ char *encoded = NULL;
5801 ssize_t len;
5802 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5803 if (len)
5804 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5805 }
5806
5807 if (c->root_hash_sig_path)
5808 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5809
0389f4fa
LB
5810 if (c->root_verity)
5811 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5812
8c7be95e
LP
5813 STRV_FOREACH(e, c->environment)
5814 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5815
5816 STRV_FOREACH(e, c->environment_files)
5817 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5818
b4c14404
FB
5819 STRV_FOREACH(e, c->pass_environment)
5820 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5821
00819cc1
LP
5822 STRV_FOREACH(e, c->unset_environment)
5823 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5824
53f47dfc
YW
5825 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5826
5b10116e 5827 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5828 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5829
211a3d87
LB
5830 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5831 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5832
5833 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5834 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5835 }
3536f49e 5836 }
c2bbd90b 5837
5291f26d 5838 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5839
fb33a393 5840 if (c->nice_set)
5291f26d 5841 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5842
dd6c17b1 5843 if (c->oom_score_adjust_set)
5291f26d 5844 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5845
ad21e542 5846 if (c->coredump_filter_set)
5291f26d 5847 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5848
5b10116e 5849 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5850 if (c->rlimit[i]) {
4c3a2b84 5851 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5852 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5853 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5854 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5855 }
94f04347 5856
f8b69d1d 5857 if (c->ioprio_set) {
1756a011 5858 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5859
5bead76e 5860 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5861 if (r >= 0)
5862 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5863
5bead76e 5864 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5865 }
94f04347 5866
f8b69d1d 5867 if (c->cpu_sched_set) {
1756a011 5868 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5869
837df140
YW
5870 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5871 if (r >= 0)
5872 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5873
94f04347 5874 fprintf(f,
38b48754
LP
5875 "%sCPUSchedulingPriority: %i\n"
5876 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5877 prefix, c->cpu_sched_priority,
5878 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5879 }
94f04347 5880
0985c7c4 5881 if (c->cpu_set.set) {
e7fca352
MS
5882 _cleanup_free_ char *affinity = NULL;
5883
5884 affinity = cpu_set_to_range_string(&c->cpu_set);
5885 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5886 }
5887
b070c7c0
MS
5888 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5889 _cleanup_free_ char *nodes = NULL;
5890
5891 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5892 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5893 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5894 }
5895
3a43da28 5896 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5897 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5898
5899 fprintf(f,
80876c20
LP
5900 "%sStandardInput: %s\n"
5901 "%sStandardOutput: %s\n"
5902 "%sStandardError: %s\n",
5903 prefix, exec_input_to_string(c->std_input),
5904 prefix, exec_output_to_string(c->std_output),
5905 prefix, exec_output_to_string(c->std_error));
5906
befc4a80
LP
5907 if (c->std_input == EXEC_INPUT_NAMED_FD)
5908 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5909 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5910 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5911 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5912 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5913
5914 if (c->std_input == EXEC_INPUT_FILE)
5915 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5916 if (c->std_output == EXEC_OUTPUT_FILE)
5917 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5918 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5919 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5920 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5921 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5922 if (c->std_error == EXEC_OUTPUT_FILE)
5923 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5924 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5925 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5926 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5927 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5928
80876c20
LP
5929 if (c->tty_path)
5930 fprintf(f,
6ea832a2
LP
5931 "%sTTYPath: %s\n"
5932 "%sTTYReset: %s\n"
5933 "%sTTYVHangup: %s\n"
51462135
DDM
5934 "%sTTYVTDisallocate: %s\n"
5935 "%sTTYRows: %u\n"
5936 "%sTTYColumns: %u\n",
6ea832a2
LP
5937 prefix, c->tty_path,
5938 prefix, yes_no(c->tty_reset),
5939 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5940 prefix, yes_no(c->tty_vt_disallocate),
5941 prefix, c->tty_rows,
5942 prefix, c->tty_cols);
94f04347 5943
9f6444eb 5944 if (IN_SET(c->std_output,
9f6444eb
LP
5945 EXEC_OUTPUT_KMSG,
5946 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5947 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5948 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5949 IN_SET(c->std_error,
9f6444eb
LP
5950 EXEC_OUTPUT_KMSG,
5951 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5952 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5953 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5954
5ce70e5b 5955 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5956
837df140
YW
5957 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5958 if (r >= 0)
5959 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5960
837df140
YW
5961 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5962 if (r >= 0)
5963 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5964 }
94f04347 5965
d3070fbd
LP
5966 if (c->log_level_max >= 0) {
5967 _cleanup_free_ char *t = NULL;
5968
5969 (void) log_level_to_string_alloc(c->log_level_max, &t);
5970
5971 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5972 }
5973
5291f26d 5974 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5975 fprintf(f,
5976 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5977 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5978
5ac1530e
ZJS
5979 if (c->log_ratelimit_burst > 0)
5980 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5981
5b10116e
ZJS
5982 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5983 fprintf(f, "%sLogExtraFields: ", prefix);
5984 fwrite(c->log_extra_fields[j].iov_base,
5985 1, c->log_extra_fields[j].iov_len,
5986 f);
5987 fputc('\n', f);
d3070fbd
LP
5988 }
5989
91dd5f7c
LP
5990 if (c->log_namespace)
5991 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5992
07d46372
YW
5993 if (c->secure_bits) {
5994 _cleanup_free_ char *str = NULL;
5995
5996 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5997 if (r >= 0)
5998 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5999 }
94f04347 6000
a103496c 6001 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 6002 _cleanup_free_ char *str = NULL;
94f04347 6003
dd1f5bd0
YW
6004 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6005 if (r >= 0)
6006 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6007 }
6008
6009 if (c->capability_ambient_set != 0) {
dd1f5bd0 6010 _cleanup_free_ char *str = NULL;
755d4b67 6011
dd1f5bd0
YW
6012 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6013 if (r >= 0)
6014 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6015 }
6016
6017 if (c->user)
f2d3769a 6018 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6019 if (c->group)
f2d3769a 6020 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6021
29206d46
LP
6022 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6023
ddc155b2 6024 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6025
5b6319dc 6026 if (c->pam_name)
f2d3769a 6027 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6028
ddc155b2
TM
6029 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6030 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6031 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6032 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6033 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6034 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6035
5b10116e
ZJS
6036 for (size_t i = 0; i < c->n_bind_mounts; i++)
6037 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6038 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6039 c->bind_mounts[i].ignore_enoent ? "-": "",
6040 c->bind_mounts[i].source,
6041 c->bind_mounts[i].destination,
6042 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6043
5b10116e
ZJS
6044 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6045 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6046
5b10116e
ZJS
6047 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6048 t->path,
6049 isempty(t->options) ? "" : ":",
6050 strempty(t->options));
6051 }
2abd4e38 6052
169c1bda
LP
6053 if (c->utmp_id)
6054 fprintf(f,
6055 "%sUtmpIdentifier: %s\n",
6056 prefix, c->utmp_id);
7b52a628
MS
6057
6058 if (c->selinux_context)
6059 fprintf(f,
5f8640fb
LP
6060 "%sSELinuxContext: %s%s\n",
6061 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6062
80c21aea
WC
6063 if (c->apparmor_profile)
6064 fprintf(f,
6065 "%sAppArmorProfile: %s%s\n",
6066 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6067
6068 if (c->smack_process_label)
6069 fprintf(f,
6070 "%sSmackProcessLabel: %s%s\n",
6071 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6072
050f7277 6073 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6074 fprintf(f,
6075 "%sPersonality: %s\n",
6076 prefix, strna(personality_to_string(c->personality)));
6077
78e864e5
TM
6078 fprintf(f,
6079 "%sLockPersonality: %s\n",
6080 prefix, yes_no(c->lock_personality));
6081
17df7223 6082 if (c->syscall_filter) {
17df7223 6083 fprintf(f,
57183d11 6084 "%sSystemCallFilter: ",
17df7223
LP
6085 prefix);
6086
6b000af4 6087 if (!c->syscall_allow_list)
17df7223
LP
6088 fputc('~', f);
6089
349cc4a5 6090#if HAVE_SECCOMP
d5a99b7c
JJ
6091 void *id, *val;
6092 bool first = true;
90e74a66 6093 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6094 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6095 const char *errno_name = NULL;
6096 int num = PTR_TO_INT(val);
17df7223
LP
6097
6098 if (first)
6099 first = false;
6100 else
6101 fputc(' ', f);
6102
57183d11 6103 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6104 fputs(strna(name), f);
8cfa775f
YW
6105
6106 if (num >= 0) {
005bfaf1 6107 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6108 if (errno_name)
6109 fprintf(f, ":%s", errno_name);
6110 else
6111 fprintf(f, ":%d", num);
6112 }
17df7223 6113 }
351a19b1 6114#endif
17df7223
LP
6115
6116 fputc('\n', f);
6117 }
6118
57183d11 6119 if (c->syscall_archs) {
57183d11
LP
6120 fprintf(f,
6121 "%sSystemCallArchitectures:",
6122 prefix);
6123
349cc4a5 6124#if HAVE_SECCOMP
d5a99b7c 6125 void *id;
90e74a66 6126 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6127 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6128#endif
6129 fputc('\n', f);
6130 }
6131
add00535
LP
6132 if (exec_context_restrict_namespaces_set(c)) {
6133 _cleanup_free_ char *s = NULL;
6134
86c2a9f1 6135 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6136 if (r >= 0)
6137 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6138 prefix, strna(s));
add00535
LP
6139 }
6140
b1994387 6141#if HAVE_LIBBPF
8fe84dc8
YW
6142 if (exec_context_restrict_filesystems_set(c)) {
6143 char *fs;
6144 SET_FOREACH(fs, c->restrict_filesystems)
6145 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6146 }
b1994387
ILG
6147#endif
6148
a8d08f39
LP
6149 if (c->network_namespace_path)
6150 fprintf(f,
6151 "%sNetworkNamespacePath: %s\n",
6152 prefix, c->network_namespace_path);
6153
3df90f24 6154 if (c->syscall_errno > 0) {
3df90f24
YW
6155 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6156
005bfaf1 6157#if HAVE_SECCOMP
d5a99b7c 6158 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6159 if (errno_name)
005bfaf1 6160 fputs(errno_name, f);
3df90f24 6161 else
005bfaf1
TM
6162 fprintf(f, "%d", c->syscall_errno);
6163#endif
6164 fputc('\n', f);
3df90f24 6165 }
b3d13314 6166
5b10116e 6167 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6168 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6169 c->mount_images[i].ignore_enoent ? "-": "",
6170 c->mount_images[i].source,
79e20ceb 6171 c->mount_images[i].destination);
427353f6 6172 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6173 fprintf(f, ":%s:%s",
427353f6 6174 partition_designator_to_string(o->partition_designator),
79e20ceb 6175 strempty(o->options));
427353f6
LB
6176 fprintf(f, "\n");
6177 }
93f59701
LB
6178
6179 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6180 fprintf(f, "%sExtensionImages: %s%s", prefix,
6181 c->extension_images[i].ignore_enoent ? "-": "",
6182 c->extension_images[i].source);
6183 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6184 fprintf(f, ":%s:%s",
6185 partition_designator_to_string(o->partition_designator),
6186 strempty(o->options));
6187 fprintf(f, "\n");
6188 }
a07b9926
LB
6189
6190 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6191}
6192
34cf6c43 6193bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6194 assert(c);
6195
61233823 6196 /* Returns true if the process forked off would run under
a931ad47
LP
6197 * an unchanged UID or as root. */
6198
6199 if (!c->user)
6200 return true;
6201
6202 if (streq(c->user, "root") || streq(c->user, "0"))
6203 return true;
6204
6205 return false;
6206}
6207
34cf6c43 6208int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6209 int p;
6210
6211 assert(c);
6212
6213 if (c->ioprio_set)
6214 return c->ioprio;
6215
6216 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6217 if (p < 0)
0692548c 6218 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6219
8b330d7d 6220 return ioprio_normalize(p);
7f452159
LP
6221}
6222
5e98086d
ZJS
6223bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6224 assert(c);
6225
61198784 6226 /* Explicit setting wins */
5e98086d
ZJS
6227 if (c->mount_apivfs_set)
6228 return c->mount_apivfs;
6229
61198784 6230 /* Default to "yes" if root directory or image are specified */
74e12520 6231 if (exec_context_with_rootfs(c))
61198784
ZJS
6232 return true;
6233
5e98086d
ZJS
6234 return false;
6235}
6236
d3070fbd 6237void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6238 assert(c);
6239
5b10116e 6240 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6241 free(c->log_extra_fields[l].iov_base);
6242 c->log_extra_fields = mfree(c->log_extra_fields);
6243 c->n_log_extra_fields = 0;
6244}
6245
6f765baf 6246void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
6247 _cleanup_close_ int fd = -1;
6248 const char *path;
6249 struct stat st;
6f765baf
LP
6250 int r;
6251
6252 assert(c);
6253
6254 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6255 exec_context_tty_reset(c, NULL);
6256
6257 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6258 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6259 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6260 if (!exec_context_may_touch_tty(c))
6261 return;
6f765baf 6262
0ba976e8
LP
6263 path = exec_context_tty_path(c);
6264 if (!path)
6265 return;
6f765baf 6266
0ba976e8
LP
6267 fd = open(path, O_PATH|O_CLOEXEC);
6268 if (fd < 0)
6269 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6270 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6271 path);
6272
6273 if (fstat(fd, &st) < 0)
6274 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6275
6276 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6277 * if things are a character device, since a proper check either means we'd have to open the TTY and
6278 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6279 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6280 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6281 if (!S_ISCHR(st.st_mode))
6282 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6283
6284 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6285 if (r < 0)
6286 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6287}
6288
4c2f5842
LP
6289int exec_context_get_clean_directories(
6290 ExecContext *c,
6291 char **prefix,
6292 ExecCleanMask mask,
6293 char ***ret) {
6294
6295 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6296 int r;
6297
6298 assert(c);
6299 assert(prefix);
6300 assert(ret);
6301
5b10116e 6302 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6303 if (!FLAGS_SET(mask, 1U << t))
6304 continue;
6305
6306 if (!prefix[t])
6307 continue;
6308
211a3d87 6309 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6310 char *j;
6311
211a3d87 6312 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6313 if (!j)
6314 return -ENOMEM;
6315
6316 r = strv_consume(&l, j);
6317 if (r < 0)
6318 return r;
7f622a19
YW
6319
6320 /* Also remove private directories unconditionally. */
6321 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6322 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6323 if (!j)
6324 return -ENOMEM;
6325
6326 r = strv_consume(&l, j);
6327 if (r < 0)
6328 return r;
6329 }
6330
211a3d87
LB
6331 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6332 j = path_join(prefix[t], *symlink);
7f622a19
YW
6333 if (!j)
6334 return -ENOMEM;
6335
6336 r = strv_consume(&l, j);
6337 if (r < 0)
6338 return r;
6339 }
4c2f5842
LP
6340 }
6341 }
6342
6343 *ret = TAKE_PTR(l);
6344 return 0;
6345}
6346
6347int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6348 ExecCleanMask mask = 0;
6349
6350 assert(c);
6351 assert(ret);
6352
6353 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6354 if (c->directories[t].n_items > 0)
4c2f5842
LP
6355 mask |= 1U << t;
6356
6357 *ret = mask;
6358 return 0;
6359}
6360
b58b4116 6361void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6362 assert(s);
5cb5a6ff 6363
2ed26ed0
LP
6364 *s = (ExecStatus) {
6365 .pid = pid,
6366 };
6367
b58b4116
LP
6368 dual_timestamp_get(&s->start_timestamp);
6369}
6370
34cf6c43 6371void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6372 assert(s);
6373
d46b79bb 6374 if (s->pid != pid)
2ed26ed0
LP
6375 *s = (ExecStatus) {
6376 .pid = pid,
6377 };
b58b4116 6378
63983207 6379 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6380
034c6ed7
LP
6381 s->code = code;
6382 s->status = status;
169c1bda 6383
6f765baf
LP
6384 if (context && context->utmp_id)
6385 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6386}
6387
6a1d4d9f
LP
6388void exec_status_reset(ExecStatus *s) {
6389 assert(s);
6390
6391 *s = (ExecStatus) {};
6392}
6393
34cf6c43 6394void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6395 assert(s);
6396 assert(f);
6397
9fb86720
LP
6398 if (s->pid <= 0)
6399 return;
6400
4c940960
LP
6401 prefix = strempty(prefix);
6402
9fb86720 6403 fprintf(f,
ccd06097
ZJS
6404 "%sPID: "PID_FMT"\n",
6405 prefix, s->pid);
9fb86720 6406
af9d16e1 6407 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6408 fprintf(f,
6409 "%sStart Timestamp: %s\n",
04f5c018 6410 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6411
af9d16e1 6412 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6413 fprintf(f,
6414 "%sExit Timestamp: %s\n"
6415 "%sExit Code: %s\n"
6416 "%sExit Status: %i\n",
04f5c018 6417 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6418 prefix, sigchld_code_to_string(s->code),
6419 prefix, s->status);
5cb5a6ff 6420}
44d8db9e 6421
34cf6c43 6422static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6423 _cleanup_free_ char *cmd = NULL;
4c940960 6424 const char *prefix2;
44d8db9e
LP
6425
6426 assert(c);
6427 assert(f);
6428
4c940960 6429 prefix = strempty(prefix);
63c372cb 6430 prefix2 = strjoina(prefix, "\t");
44d8db9e 6431
4ef15008 6432 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
44d8db9e
LP
6433 fprintf(f,
6434 "%sCommand Line: %s\n",
7c248223 6435 prefix, cmd ?: strerror_safe(ENOMEM));
44d8db9e 6436
9fb86720 6437 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6438}
6439
6440void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6441 assert(f);
6442
4c940960 6443 prefix = strempty(prefix);
44d8db9e 6444
03677889
YW
6445 LIST_FOREACH(command, i, c)
6446 exec_command_dump(i, f, prefix);
44d8db9e 6447}
94f04347 6448
a6a80b4f
LP
6449void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6450 ExecCommand *end;
6451
6452 assert(l);
6453 assert(e);
6454
6455 if (*l) {
35b8ca3a 6456 /* It's kind of important, that we keep the order here */
71fda00f
LP
6457 LIST_FIND_TAIL(command, *l, end);
6458 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6459 } else
6460 *l = e;
6461}
6462
26fd040d
LP
6463int exec_command_set(ExecCommand *c, const char *path, ...) {
6464 va_list ap;
6465 char **l, *p;
6466
6467 assert(c);
6468 assert(path);
6469
6470 va_start(ap, path);
6471 l = strv_new_ap(path, ap);
6472 va_end(ap);
6473
6474 if (!l)
6475 return -ENOMEM;
6476
250a918d
LP
6477 p = strdup(path);
6478 if (!p) {
26fd040d
LP
6479 strv_free(l);
6480 return -ENOMEM;
6481 }
6482
6897dfe8 6483 free_and_replace(c->path, p);
26fd040d 6484
130d3d22 6485 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6486}
6487
86b23b07 6488int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6489 _cleanup_strv_free_ char **l = NULL;
86b23b07 6490 va_list ap;
86b23b07
JS
6491 int r;
6492
6493 assert(c);
6494 assert(path);
6495
6496 va_start(ap, path);
6497 l = strv_new_ap(path, ap);
6498 va_end(ap);
6499
6500 if (!l)
6501 return -ENOMEM;
6502
e287086b 6503 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6504 if (r < 0)
86b23b07 6505 return r;
86b23b07
JS
6506
6507 return 0;
6508}
6509
e8a565cb
YW
6510static void *remove_tmpdir_thread(void *p) {
6511 _cleanup_free_ char *path = p;
86b23b07 6512
e8a565cb
YW
6513 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6514 return NULL;
6515}
6516
6517static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6518 int r;
6519
6520 if (!rt)
6521 return NULL;
6522
6523 if (rt->manager)
6524 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6525
6526 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6527
6528 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6529 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6530
6531 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6532 if (r < 0)
e8a565cb 6533 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6534 else
6535 rt->tmp_dir = NULL;
e8a565cb 6536 }
613b411c 6537
56a13a49 6538 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6539 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6540
6541 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6542 if (r < 0)
e8a565cb 6543 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6544 else
6545 rt->var_tmp_dir = NULL;
e8a565cb
YW
6546 }
6547
6548 rt->id = mfree(rt->id);
6549 rt->tmp_dir = mfree(rt->tmp_dir);
6550 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6551 safe_close_pair(rt->netns_storage_socket);
a70581ff 6552 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6553 return mfree(rt);
6554}
6555
6556static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6557 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6558}
6559
56a13a49
ZJS
6560static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6561 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6562 ExecRuntime *n;
613b411c 6563
8e8009dc 6564 assert(ret);
613b411c 6565
56a13a49
ZJS
6566 id_copy = strdup(id);
6567 if (!id_copy)
6568 return -ENOMEM;
6569
8e8009dc
LP
6570 n = new(ExecRuntime, 1);
6571 if (!n)
613b411c
LP
6572 return -ENOMEM;
6573
8e8009dc 6574 *n = (ExecRuntime) {
56a13a49 6575 .id = TAKE_PTR(id_copy),
8e8009dc 6576 .netns_storage_socket = { -1, -1 },
a70581ff 6577 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6578 };
6579
6580 *ret = n;
613b411c
LP
6581 return 0;
6582}
6583
e8a565cb
YW
6584static int exec_runtime_add(
6585 Manager *m,
6586 const char *id,
56a13a49
ZJS
6587 char **tmp_dir,
6588 char **var_tmp_dir,
6589 int netns_storage_socket[2],
a70581ff 6590 int ipcns_storage_socket[2],
e8a565cb
YW
6591 ExecRuntime **ret) {
6592
6593 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6594 int r;
6595
e8a565cb 6596 assert(m);
613b411c
LP
6597 assert(id);
6598
a70581ff 6599 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6600
56a13a49 6601 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6602 if (r < 0)
6603 return r;
6604
63083706 6605 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6606 if (r < 0)
6607 return r;
e8a565cb 6608
56a13a49
ZJS
6609 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6610 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6611 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6612
6613 if (netns_storage_socket) {
56a13a49
ZJS
6614 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6615 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6616 }
6617
a70581ff
XR
6618 if (ipcns_storage_socket) {
6619 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6620 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6621 }
6622
e8a565cb
YW
6623 rt->manager = m;
6624
6625 if (ret)
6626 *ret = rt;
e8a565cb 6627 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6628 TAKE_PTR(rt);
e8a565cb
YW
6629 return 0;
6630}
6631
74aaf59b
LP
6632static int exec_runtime_make(
6633 Manager *m,
6634 const ExecContext *c,
6635 const char *id,
6636 ExecRuntime **ret) {
6637
56a13a49 6638 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6639 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6640 int r;
6641
6642 assert(m);
6643 assert(c);
6644 assert(id);
6645
6646 /* It is not necessary to create ExecRuntime object. */
a70581ff 6647 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6648 *ret = NULL;
e8a565cb 6649 return 0;
74aaf59b 6650 }
e8a565cb 6651
efa2f3a1
TM
6652 if (c->private_tmp &&
6653 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6654 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6655 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6656 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6657 if (r < 0)
6658 return r;
6659 }
6660
a8d08f39 6661 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6662 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6663 return -errno;
6664 }
6665
a70581ff
XR
6666 if (c->private_ipc || c->ipc_namespace_path) {
6667 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6668 return -errno;
6669 }
6670
6671 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6672 if (r < 0)
6673 return r;
6674
613b411c
LP
6675 return 1;
6676}
6677
e8a565cb
YW
6678int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6679 ExecRuntime *rt;
6680 int r;
613b411c 6681
e8a565cb
YW
6682 assert(m);
6683 assert(id);
6684 assert(ret);
6685
6686 rt = hashmap_get(m->exec_runtime_by_id, id);
6687 if (rt)
387f6955 6688 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6689 goto ref;
6690
74aaf59b
LP
6691 if (!create) {
6692 *ret = NULL;
e8a565cb 6693 return 0;
74aaf59b 6694 }
e8a565cb
YW
6695
6696 /* If not found, then create a new object. */
6697 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6698 if (r < 0)
e8a565cb 6699 return r;
74aaf59b
LP
6700 if (r == 0) {
6701 /* When r == 0, it is not necessary to create ExecRuntime object. */
6702 *ret = NULL;
6703 return 0;
6704 }
613b411c 6705
e8a565cb
YW
6706ref:
6707 /* increment reference counter. */
6708 rt->n_ref++;
6709 *ret = rt;
6710 return 1;
6711}
613b411c 6712
e8a565cb
YW
6713ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6714 if (!rt)
613b411c
LP
6715 return NULL;
6716
e8a565cb 6717 assert(rt->n_ref > 0);
613b411c 6718
e8a565cb
YW
6719 rt->n_ref--;
6720 if (rt->n_ref > 0)
f2341e0a
LP
6721 return NULL;
6722
e8a565cb 6723 return exec_runtime_free(rt, destroy);
613b411c
LP
6724}
6725
e8a565cb
YW
6726int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6727 ExecRuntime *rt;
e8a565cb
YW
6728
6729 assert(m);
613b411c
LP
6730 assert(f);
6731 assert(fds);
6732
90e74a66 6733 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6734 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6735
e8a565cb
YW
6736 if (rt->tmp_dir)
6737 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6738
e8a565cb
YW
6739 if (rt->var_tmp_dir)
6740 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6741
e8a565cb
YW
6742 if (rt->netns_storage_socket[0] >= 0) {
6743 int copy;
613b411c 6744
e8a565cb
YW
6745 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6746 if (copy < 0)
6747 return copy;
613b411c 6748
e8a565cb
YW
6749 fprintf(f, " netns-socket-0=%i", copy);
6750 }
613b411c 6751
e8a565cb
YW
6752 if (rt->netns_storage_socket[1] >= 0) {
6753 int copy;
613b411c 6754
e8a565cb
YW
6755 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6756 if (copy < 0)
6757 return copy;
613b411c 6758
e8a565cb
YW
6759 fprintf(f, " netns-socket-1=%i", copy);
6760 }
6761
a70581ff
XR
6762 if (rt->ipcns_storage_socket[0] >= 0) {
6763 int copy;
6764
6765 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6766 if (copy < 0)
6767 return copy;
6768
6769 fprintf(f, " ipcns-socket-0=%i", copy);
6770 }
6771
6772 if (rt->ipcns_storage_socket[1] >= 0) {
6773 int copy;
6774
6775 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6776 if (copy < 0)
6777 return copy;
6778
6779 fprintf(f, " ipcns-socket-1=%i", copy);
6780 }
6781
e8a565cb 6782 fputc('\n', f);
613b411c
LP
6783 }
6784
6785 return 0;
6786}
6787
e8a565cb
YW
6788int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6789 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6790 ExecRuntime *rt;
613b411c
LP
6791 int r;
6792
e8a565cb
YW
6793 /* This is for the migration from old (v237 or earlier) deserialization text.
6794 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6795 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6796 * so or not from the serialized text, then we always creates a new object owned by this. */
6797
6798 assert(u);
613b411c
LP
6799 assert(key);
6800 assert(value);
6801
e8a565cb
YW
6802 /* Manager manages ExecRuntime objects by the unit id.
6803 * So, we omit the serialized text when the unit does not have id (yet?)... */
6804 if (isempty(u->id)) {
6805 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6806 return 0;
6807 }
613b411c 6808
cbc165d1
ZJS
6809 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6810 return log_oom();
e8a565cb
YW
6811
6812 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6813 if (!rt) {
cbc165d1 6814 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6815 return log_oom();
613b411c 6816
e8a565cb
YW
6817 rt = rt_create;
6818 }
6819
6820 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6821 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6822 return -ENOMEM;
613b411c
LP
6823
6824 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6825 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6826 return -ENOMEM;
613b411c
LP
6827
6828 } else if (streq(key, "netns-socket-0")) {
6829 int fd;
6830
e8a565cb 6831 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6832 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6833 return 0;
613b411c 6834 }
e8a565cb
YW
6835
6836 safe_close(rt->netns_storage_socket[0]);
6837 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6838
613b411c
LP
6839 } else if (streq(key, "netns-socket-1")) {
6840 int fd;
6841
e8a565cb 6842 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6843 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6844 return 0;
613b411c 6845 }
e8a565cb
YW
6846
6847 safe_close(rt->netns_storage_socket[1]);
6848 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6849
613b411c
LP
6850 } else
6851 return 0;
6852
e8a565cb
YW
6853 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6854 if (rt_create) {
6855 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6856 if (r < 0) {
3fe91079 6857 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6858 return 0;
6859 }
613b411c 6860
e8a565cb 6861 rt_create->manager = u->manager;
613b411c 6862
e8a565cb 6863 /* Avoid cleanup */
56a13a49 6864 TAKE_PTR(rt_create);
e8a565cb 6865 }
98b47d54 6866
e8a565cb
YW
6867 return 1;
6868}
613b411c 6869
56a13a49
ZJS
6870int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6871 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6872 char *id = NULL;
a70581ff 6873 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6874 const char *p, *v = value;
6875 size_t n;
613b411c 6876
e8a565cb
YW
6877 assert(m);
6878 assert(value);
6879 assert(fds);
98b47d54 6880
e8a565cb 6881 n = strcspn(v, " ");
2f82562b 6882 id = strndupa_safe(v, n);
e8a565cb
YW
6883 if (v[n] != ' ')
6884 goto finalize;
6885 p = v + n + 1;
6886
6887 v = startswith(p, "tmp-dir=");
6888 if (v) {
6889 n = strcspn(v, " ");
56a13a49
ZJS
6890 tmp_dir = strndup(v, n);
6891 if (!tmp_dir)
6892 return log_oom();
e8a565cb
YW
6893 if (v[n] != ' ')
6894 goto finalize;
6895 p = v + n + 1;
6896 }
6897
6898 v = startswith(p, "var-tmp-dir=");
6899 if (v) {
6900 n = strcspn(v, " ");
56a13a49
ZJS
6901 var_tmp_dir = strndup(v, n);
6902 if (!var_tmp_dir)
6903 return log_oom();
e8a565cb
YW
6904 if (v[n] != ' ')
6905 goto finalize;
6906 p = v + n + 1;
6907 }
6908
6909 v = startswith(p, "netns-socket-0=");
6910 if (v) {
6911 char *buf;
6912
6913 n = strcspn(v, " ");
2f82562b 6914 buf = strndupa_safe(v, n);
c413bb28 6915
a70581ff 6916 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6917 if (r < 0)
6918 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6919 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6920 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6921 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6922 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6923 if (v[n] != ' ')
6924 goto finalize;
6925 p = v + n + 1;
613b411c
LP
6926 }
6927
e8a565cb
YW
6928 v = startswith(p, "netns-socket-1=");
6929 if (v) {
6930 char *buf;
98b47d54 6931
e8a565cb 6932 n = strcspn(v, " ");
2f82562b 6933 buf = strndupa_safe(v, n);
a70581ff
XR
6934
6935 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6936 if (r < 0)
6937 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6938 if (!fdset_contains(fds, netns_fdpair[1]))
6939 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6940 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6941 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6942 if (v[n] != ' ')
6943 goto finalize;
6944 p = v + n + 1;
6945 }
6946
6947 v = startswith(p, "ipcns-socket-0=");
6948 if (v) {
6949 char *buf;
6950
6951 n = strcspn(v, " ");
2f82562b 6952 buf = strndupa_safe(v, n);
a70581ff
XR
6953
6954 r = safe_atoi(buf, &ipcns_fdpair[0]);
6955 if (r < 0)
6956 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6957 if (!fdset_contains(fds, ipcns_fdpair[0]))
6958 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6959 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6960 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6961 if (v[n] != ' ')
6962 goto finalize;
6963 p = v + n + 1;
6964 }
6965
6966 v = startswith(p, "ipcns-socket-1=");
6967 if (v) {
6968 char *buf;
6969
6970 n = strcspn(v, " ");
2f82562b 6971 buf = strndupa_safe(v, n);
a70581ff
XR
6972
6973 r = safe_atoi(buf, &ipcns_fdpair[1]);
6974 if (r < 0)
6975 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6976 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6977 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6978 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6979 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6980 }
98b47d54 6981
e8a565cb 6982finalize:
a70581ff 6983 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6984 if (r < 0)
56a13a49
ZJS
6985 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6986 return 0;
e8a565cb 6987}
613b411c 6988
e8a565cb
YW
6989void exec_runtime_vacuum(Manager *m) {
6990 ExecRuntime *rt;
e8a565cb
YW
6991
6992 assert(m);
6993
6994 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6995
90e74a66 6996 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6997 if (rt->n_ref > 0)
6998 continue;
6999
7000 (void) exec_runtime_free(rt, false);
7001 }
613b411c
LP
7002}
7003
b9c04eaf
YW
7004void exec_params_clear(ExecParameters *p) {
7005 if (!p)
7006 return;
7007
c3f8a065
LP
7008 p->environment = strv_free(p->environment);
7009 p->fd_names = strv_free(p->fd_names);
7010 p->fds = mfree(p->fds);
7011 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7012}
7013
bb0c0d6f
LP
7014ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7015 if (!sc)
7016 return NULL;
7017
7018 free(sc->id);
7019 free(sc->data);
7020 return mfree(sc);
7021}
7022
43144be4
LP
7023ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7024 if (!lc)
7025 return NULL;
7026
7027 free(lc->id);
7028 free(lc->path);
7029 return mfree(lc);
7030}
7031
211a3d87
LB
7032void exec_directory_done(ExecDirectory *d) {
7033 if (!d)
7034 return;
7035
7036 for (size_t i = 0; i < d->n_items; i++) {
7037 free(d->items[i].path);
7038 strv_free(d->items[i].symlinks);
7039 }
7040
7041 d->items = mfree(d->items);
7042 d->n_items = 0;
7043 d->mode = 0755;
7044}
7045
7046int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
7047 _cleanup_strv_free_ char **s = NULL;
7048 _cleanup_free_ char *p = NULL;
7049
7050 assert(d);
7051 assert(n);
7052 assert(path);
7053
7054 p = strdup(path);
7055 if (!p)
7056 return -ENOMEM;
7057
7058 if (symlinks) {
7059 s = strv_copy(symlinks);
7060 if (!s)
7061 return -ENOMEM;
7062 }
7063
7064 if (!GREEDY_REALLOC(*d, *n + 1))
7065 return -ENOMEM;
7066
7067 (*d)[(*n) ++] = (ExecDirectoryItem) {
7068 .path = TAKE_PTR(p),
7069 .symlinks = TAKE_PTR(s),
7070 };
7071
7072 return 0;
7073}
7074
bb0c0d6f 7075DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 7076DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 7077
80876c20
LP
7078static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7079 [EXEC_INPUT_NULL] = "null",
7080 [EXEC_INPUT_TTY] = "tty",
7081 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7082 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7083 [EXEC_INPUT_SOCKET] = "socket",
7084 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7085 [EXEC_INPUT_DATA] = "data",
2038c3f5 7086 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7087};
7088
8a0867d6
LP
7089DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7090
94f04347 7091static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7092 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7093 [EXEC_OUTPUT_NULL] = "null",
80876c20 7094 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7095 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7096 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7097 [EXEC_OUTPUT_JOURNAL] = "journal",
7098 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7099 [EXEC_OUTPUT_SOCKET] = "socket",
7100 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7101 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7102 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7103 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7104};
7105
7106DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7107
7108static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7109 [EXEC_UTMP_INIT] = "init",
7110 [EXEC_UTMP_LOGIN] = "login",
7111 [EXEC_UTMP_USER] = "user",
7112};
7113
7114DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7115
7116static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7117 [EXEC_PRESERVE_NO] = "no",
7118 [EXEC_PRESERVE_YES] = "yes",
7119 [EXEC_PRESERVE_RESTART] = "restart",
7120};
7121
7122DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7123
6b7b2ed9 7124/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7125static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7126 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7127 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7128 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7129 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7130 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7131};
7132
7133DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7134
211a3d87
LB
7135/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7136static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7137 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7138 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7139 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7140 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7141 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7142};
7143
7144DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7145
6b7b2ed9
LP
7146/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7147 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7148 * directories, specifically .timer units with their timestamp touch file. */
7149static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7150 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7151 [EXEC_DIRECTORY_STATE] = "state",
7152 [EXEC_DIRECTORY_CACHE] = "cache",
7153 [EXEC_DIRECTORY_LOGS] = "logs",
7154 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7155};
7156
7157DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7158
7159/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7160 * the service payload in. */
fb2042dd
YW
7161static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7162 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7163 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7164 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7165 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7166 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7167};
7168
7169DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7170
b1edf445
LP
7171static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7172 [EXEC_KEYRING_INHERIT] = "inherit",
7173 [EXEC_KEYRING_PRIVATE] = "private",
7174 [EXEC_KEYRING_SHARED] = "shared",
7175};
7176
7177DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);