]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #23982 from medhefgo/boot-misc
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
b1994387 44#include "bpf-lsm.h"
8dd4c05b 45#include "cap-list.h"
430f0182 46#include "capability-util.h"
fdb3deca 47#include "cgroup-setup.h"
f4351959 48#include "chase-symlinks.h"
bb0c0d6f 49#include "chown-recursive.h"
da681e1b 50#include "cpu-set-util.h"
43144be4 51#include "creds-util.h"
6a818c3c 52#include "data-fd-util.h"
f6a6225e 53#include "def.h"
686d13b9 54#include "env-file.h"
4d1a6904 55#include "env-util.h"
17df7223 56#include "errno-list.h"
8a62620e 57#include "escape.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
bb0c0d6f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
032b3afb 66#include "ioprio-util.h"
a1164ae3 67#include "label.h"
8dd4c05b
LP
68#include "log.h"
69#include "macro.h"
e8a565cb 70#include "manager.h"
2a341bb9 71#include "manager-dump.h"
0a970718 72#include "memory-util.h"
f5947a5e 73#include "missing_fs.h"
5bead76e 74#include "missing_ioprio.h"
35cd0ba5 75#include "mkdir-label.h"
21935150 76#include "mount-util.h"
bb0c0d6f 77#include "mountpoint-util.h"
8dd4c05b 78#include "namespace.h"
6bedfcbb 79#include "parse-util.h"
8dd4c05b 80#include "path-util.h"
0b452006 81#include "process-util.h"
d3dcf4e3 82#include "random-util.h"
3989bdc1 83#include "recurse-dir.h"
78f22b97 84#include "rlimit-util.h"
8dd4c05b 85#include "rm-rf.h"
349cc4a5 86#if HAVE_SECCOMP
3ffd4af2
LP
87#include "seccomp-util.h"
88#endif
07d46372 89#include "securebits-util.h"
8dd4c05b 90#include "selinux-util.h"
24882e06 91#include "signal-util.h"
8dd4c05b 92#include "smack-util.h"
57b7a260 93#include "socket-util.h"
fd63e712 94#include "special.h"
949befd3 95#include "stat-util.h"
8b43440b 96#include "string-table.h"
07630cea 97#include "string-util.h"
8dd4c05b 98#include "strv.h"
7ccbd1ae 99#include "syslog-util.h"
8dd4c05b 100#include "terminal-util.h"
bb0c0d6f 101#include "tmpfile-util.h"
566b7d23 102#include "umask-util.h"
2d3b784d 103#include "unit-serialize.h"
b1d4f8e1 104#include "user-util.h"
8dd4c05b 105#include "utmp-wtmp.h"
5cb5a6ff 106
e056b01d 107#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 108#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 109
531dca78
LP
110#define SNDBUF_SIZE (8*1024*1024)
111
/* Moves the passed file descriptors so that fds[i] ends up at fd number i+3. The array is updated in
 * place with the new descriptor numbers. Returns 0 on success, or a negative errno if duplicating a
 * descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        /* Keep sweeping until a full pass places every fd on its target slot. */
        while (first >= 0) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int want = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits where it belongs. */
                        if (fds[idx] == want)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, want);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied: remember the first such index so
                         * that the next sweep starts from there. */
                        if (retry_at < 0 && dup_fd != want)
                                retry_at = idx;
                }

                first = retry_at;
        }

        return 0;
}
151
/* Adjusts the flags of the fds that are passed on to the child: FD_CLOEXEC is dropped from all of them,
 * and O_NONBLOCK is set or cleared (according to 'nonblock') on the first n_socket_fds entries only.
 * Returns 0 on success or the negative error of the first helper call that fails. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total == 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                int r;

                /* O_NONBLOCK handling only applies to the socket activation fds, which come first. */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be inherited by the child, hence always clear FD_CLOEXEC. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
1e22b5cd 185static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
186 assert(context);
187
1e22b5cd
LP
188 if (context->stdio_as_fds)
189 return NULL;
190
80876c20
LP
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195}
196
1e22b5cd
LP
197static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
198 const char *path;
199
6ea832a2
LP
200 assert(context);
201
1e22b5cd 202 path = exec_context_tty_path(context);
6ea832a2 203
1e22b5cd
LP
204 if (context->tty_vhangup) {
205 if (p && p->stdin_fd >= 0)
206 (void) terminal_vhangup_fd(p->stdin_fd);
207 else if (path)
208 (void) terminal_vhangup(path);
209 }
6ea832a2 210
1e22b5cd
LP
211 if (context->tty_reset) {
212 if (p && p->stdin_fd >= 0)
213 (void) reset_terminal_fd(p->stdin_fd, true);
214 else if (path)
215 (void) reset_terminal(path);
216 }
217
51462135
DDM
218 if (p && p->stdin_fd >= 0)
219 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
220
1e22b5cd
LP
221 if (context->tty_vt_disallocate && path)
222 (void) vt_disallocate(path);
6ea832a2
LP
223}
224
6af760f3
LP
225static bool is_terminal_input(ExecInput i) {
226 return IN_SET(i,
227 EXEC_INPUT_TTY,
228 EXEC_INPUT_TTY_FORCE,
229 EXEC_INPUT_TTY_FAIL);
230}
231
3a1286b6 232static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
233 return IN_SET(o,
234 EXEC_OUTPUT_TTY,
6af760f3
LP
235 EXEC_OUTPUT_KMSG_AND_CONSOLE,
236 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
237}
238
aac8c0c3
LP
239static bool is_kmsg_output(ExecOutput o) {
240 return IN_SET(o,
241 EXEC_OUTPUT_KMSG,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE);
243}
244
6af760f3
LP
245static bool exec_context_needs_term(const ExecContext *c) {
246 assert(c);
247
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
249
250 if (is_terminal_input(c->std_input))
251 return true;
252
253 if (is_terminal_output(c->std_output))
254 return true;
255
256 if (is_terminal_output(c->std_error))
257 return true;
258
259 return !!c->tty_path;
3a1286b6
MS
260}
261
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and moves the resulting descriptor to
 * fd number 'nfd'. Returns a negative errno if the open fails, otherwise the result of move_fd(). */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
273
91dd5f7c
LP
274static int connect_journal_socket(
275 int fd,
276 const char *log_namespace,
277 uid_t uid,
278 gid_t gid) {
279
524daa8c
ZJS
280 uid_t olduid = UID_INVALID;
281 gid_t oldgid = GID_INVALID;
91dd5f7c 282 const char *j;
524daa8c
ZJS
283 int r;
284
91dd5f7c
LP
285 j = log_namespace ?
286 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
287 "/run/systemd/journal/stdout";
91dd5f7c 288
cad93f29 289 if (gid_is_valid(gid)) {
524daa8c
ZJS
290 oldgid = getgid();
291
92a17af9 292 if (setegid(gid) < 0)
524daa8c
ZJS
293 return -errno;
294 }
295
cad93f29 296 if (uid_is_valid(uid)) {
524daa8c
ZJS
297 olduid = getuid();
298
92a17af9 299 if (seteuid(uid) < 0) {
524daa8c
ZJS
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
1861986a 305 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 306
1861986a
LP
307 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
308 an LSM interferes. */
524daa8c 309
cad93f29 310 if (uid_is_valid(uid))
524daa8c
ZJS
311 (void) seteuid(olduid);
312
313 restore_gid:
cad93f29 314 if (gid_is_valid(gid))
524daa8c
ZJS
315 (void) setegid(oldgid);
316
317 return r;
318}
319
fd1f9c89 320static int connect_logger_as(
34cf6c43 321 const Unit *unit,
fd1f9c89 322 const ExecContext *context,
af635cf3 323 const ExecParameters *params,
fd1f9c89
LP
324 ExecOutput output,
325 const char *ident,
fd1f9c89
LP
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
2ac1ff68
EV
330 _cleanup_close_ int fd = -1;
331 int r;
071830ff
LP
332
333 assert(context);
af635cf3 334 assert(params);
80876c20
LP
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
071830ff 338
54fe0cdb
LP
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
80876c20 341 return -errno;
071830ff 342
91dd5f7c 343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
344 if (r < 0)
345 return r;
071830ff 346
2ac1ff68 347 if (shutdown(fd, SHUT_RD) < 0)
80876c20 348 return -errno;
071830ff 349
fd1f9c89 350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 351
2ac1ff68 352 if (dprintf(fd,
62bca2c6 353 "%s\n"
80876c20
LP
354 "%s\n"
355 "%i\n"
54fe0cdb
LP
356 "%i\n"
357 "%i\n"
358 "%i\n"
4f4a1dbf 359 "%i\n",
c867611e 360 context->syslog_identifier ?: ident,
af635cf3 361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
f3dc6af2 364 false,
aac8c0c3 365 is_kmsg_output(output),
2ac1ff68
EV
366 is_terminal_output(output)) < 0)
367 return -errno;
80876c20 368
2ac1ff68 369 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 370}
2ac1ff68 371
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the descriptor to fd
 * number 'nfd'. Returns the error from open_terminal() if that fails, otherwise the result of
 * move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 384
2038c3f5 385static int acquire_path(const char *path, int flags, mode_t mode) {
15a3e96f 386 _cleanup_close_ int fd = -1;
86fca584 387 int r;
071830ff 388
80876c20 389 assert(path);
071830ff 390
2038c3f5
LP
391 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
392 flags |= O_CREAT;
393
394 fd = open(path, flags|O_NOCTTY, mode);
395 if (fd >= 0)
15a3e96f 396 return TAKE_FD(fd);
071830ff 397
2038c3f5
LP
398 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
399 return -errno;
2038c3f5
LP
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
403 fd = socket(AF_UNIX, SOCK_STREAM, 0);
404 if (fd < 0)
405 return -errno;
406
1861986a
LP
407 r = connect_unix_path(fd, AT_FDCWD, path);
408 if (IN_SET(r, -ENOTSOCK, -EINVAL))
409 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
410 * wasn't an AF_UNIX socket after all */
411 return -ENXIO;
412 if (r < 0)
413 return r;
071830ff 414
2038c3f5
LP
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
86fca584 420 r = 0;
15a3e96f 421 if (r < 0)
2038c3f5 422 return -errno;
2038c3f5 423
15a3e96f 424 return TAKE_FD(fd);
80876c20 425}
071830ff 426
08f3be7a
LP
427static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
1e3ad081
LP
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
071830ff 440
03fd9c49 441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
442 return EXEC_INPUT_NULL;
443
08f3be7a
LP
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
03fd9c49 447 return std_input;
4f2d528d
LP
448}
449
7966a916 450static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 451
7966a916 452 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
453 return EXEC_OUTPUT_INHERIT;
454
7966a916 455 return output;
4f2d528d
LP
456}
457
a34ceba6
LP
458static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
52c239d7 461 int socket_fd,
2caa38e9 462 const int named_iofds[static 3]) {
a34ceba6 463
4f2d528d 464 ExecInput i;
51462135 465 int r;
4f2d528d
LP
466
467 assert(context);
a34ceba6 468 assert(params);
2caa38e9 469 assert(named_iofds);
a34ceba6
LP
470
471 if (params->stdin_fd >= 0) {
472 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
473 return -errno;
474
475 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
476 if (isatty(STDIN_FILENO)) {
477 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
478 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 479 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 480 }
a34ceba6
LP
481
482 return STDIN_FILENO;
483 }
4f2d528d 484
08f3be7a 485 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
486
487 switch (i) {
071830ff 488
80876c20
LP
489 case EXEC_INPUT_NULL:
490 return open_null_as(O_RDONLY, STDIN_FILENO);
491
492 case EXEC_INPUT_TTY:
493 case EXEC_INPUT_TTY_FORCE:
494 case EXEC_INPUT_TTY_FAIL: {
046a82c1 495 int fd;
071830ff 496
1e22b5cd 497 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
498 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
499 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
500 ACQUIRE_TERMINAL_WAIT,
3a43da28 501 USEC_INFINITY);
970edce6 502 if (fd < 0)
80876c20
LP
503 return fd;
504
51462135
DDM
505 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
506 if (r < 0)
507 return r;
508
046a82c1 509 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
510 }
511
4f2d528d 512 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
513 assert(socket_fd >= 0);
514
7c248223 515 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 516
52c239d7 517 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
518 assert(named_iofds[STDIN_FILENO] >= 0);
519
52c239d7 520 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 521 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 522
08f3be7a
LP
523 case EXEC_INPUT_DATA: {
524 int fd;
525
526 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
527 if (fd < 0)
528 return fd;
529
530 return move_fd(fd, STDIN_FILENO, false);
531 }
532
2038c3f5
LP
533 case EXEC_INPUT_FILE: {
534 bool rw;
535 int fd;
536
537 assert(context->stdio_file[STDIN_FILENO]);
538
539 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
540 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
541
542 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
543 if (fd < 0)
544 return fd;
545
546 return move_fd(fd, STDIN_FILENO, false);
547 }
548
80876c20 549 default:
04499a70 550 assert_not_reached();
80876c20
LP
551 }
552}
553
41fc585a
LP
554static bool can_inherit_stderr_from_stdout(
555 const ExecContext *context,
556 ExecOutput o,
557 ExecOutput e) {
558
559 assert(context);
560
561 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
562 * stderr fd */
563
564 if (e == EXEC_OUTPUT_INHERIT)
565 return true;
566 if (e != o)
567 return false;
568
569 if (e == EXEC_OUTPUT_NAMED_FD)
570 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
571
8d7dab1f 572 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
573 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
574
575 return true;
576}
577
a34ceba6 578static int setup_output(
34cf6c43 579 const Unit *unit,
a34ceba6
LP
580 const ExecContext *context,
581 const ExecParameters *params,
582 int fileno,
583 int socket_fd,
2caa38e9 584 const int named_iofds[static 3],
a34ceba6 585 const char *ident,
7bce046b
LP
586 uid_t uid,
587 gid_t gid,
588 dev_t *journal_stream_dev,
589 ino_t *journal_stream_ino) {
a34ceba6 590
4f2d528d
LP
591 ExecOutput o;
592 ExecInput i;
47c1d80d 593 int r;
4f2d528d 594
f2341e0a 595 assert(unit);
80876c20 596 assert(context);
a34ceba6 597 assert(params);
80876c20 598 assert(ident);
7bce046b
LP
599 assert(journal_stream_dev);
600 assert(journal_stream_ino);
80876c20 601
a34ceba6
LP
602 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
603
604 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
605 return -errno;
606
607 return STDOUT_FILENO;
608 }
609
610 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
611 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
612 return -errno;
613
614 return STDERR_FILENO;
615 }
616
08f3be7a 617 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 618 o = fixup_output(context->std_output, socket_fd);
4f2d528d 619
eb17e935
MS
620 if (fileno == STDERR_FILENO) {
621 ExecOutput e;
622 e = fixup_output(context->std_error, socket_fd);
80876c20 623
eb17e935
MS
624 /* This expects the input and output are already set up */
625
626 /* Don't change the stderr file descriptor if we inherit all
627 * the way and are not on a tty */
628 if (e == EXEC_OUTPUT_INHERIT &&
629 o == EXEC_OUTPUT_INHERIT &&
630 i == EXEC_INPUT_NULL &&
631 !is_terminal_input(context->std_input) &&
7966a916 632 getppid() != 1)
eb17e935
MS
633 return fileno;
634
635 /* Duplicate from stdout if possible */
41fc585a 636 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 637 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 638
eb17e935 639 o = e;
80876c20 640
eb17e935 641 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
642 /* If input got downgraded, inherit the original value */
643 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 644 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 645
08f3be7a
LP
646 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
647 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 648 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 649
acb591e4
LP
650 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
651 if (getppid() != 1)
eb17e935 652 return fileno;
94f04347 653
eb17e935
MS
654 /* We need to open /dev/null here anew, to get the right access mode. */
655 return open_null_as(O_WRONLY, fileno);
071830ff 656 }
94f04347 657
eb17e935 658 switch (o) {
80876c20
LP
659
660 case EXEC_OUTPUT_NULL:
eb17e935 661 return open_null_as(O_WRONLY, fileno);
80876c20
LP
662
663 case EXEC_OUTPUT_TTY:
4f2d528d 664 if (is_terminal_input(i))
7c248223 665 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
666
667 /* We don't reset the terminal if this is just about output */
1e22b5cd 668 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 669
9a6bca7a 670 case EXEC_OUTPUT_KMSG:
28dbc1e8 671 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
672 case EXEC_OUTPUT_JOURNAL:
673 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 674 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 675 if (r < 0) {
7966a916
ZJS
676 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
677 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 678 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
679 } else {
680 struct stat st;
681
682 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
683 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
684 * services to detect whether they are connected to the journal or not.
685 *
686 * If both stdout and stderr are connected to a stream then let's make sure to store the data
687 * about STDERR as that's usually the best way to do logging. */
7bce046b 688
ab2116b1
LP
689 if (fstat(fileno, &st) >= 0 &&
690 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
691 *journal_stream_dev = st.st_dev;
692 *journal_stream_ino = st.st_ino;
693 }
47c1d80d
MS
694 }
695 return r;
4f2d528d
LP
696
697 case EXEC_OUTPUT_SOCKET:
698 assert(socket_fd >= 0);
e75a9ed1 699
7c248223 700 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 701
52c239d7 702 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
703 assert(named_iofds[fileno] >= 0);
704
52c239d7 705 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 706 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 707
566b7d23 708 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
709 case EXEC_OUTPUT_FILE_APPEND:
710 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 711 bool rw;
566b7d23 712 int fd, flags;
2038c3f5
LP
713
714 assert(context->stdio_file[fileno]);
715
716 rw = context->std_input == EXEC_INPUT_FILE &&
717 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
718
719 if (rw)
7c248223 720 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 721
566b7d23
ZD
722 flags = O_WRONLY;
723 if (o == EXEC_OUTPUT_FILE_APPEND)
724 flags |= O_APPEND;
8d7dab1f
LW
725 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
726 flags |= O_TRUNC;
566b7d23
ZD
727
728 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
729 if (fd < 0)
730 return fd;
731
566b7d23 732 return move_fd(fd, fileno, 0);
2038c3f5
LP
733 }
734
94f04347 735 default:
04499a70 736 assert_not_reached();
94f04347 737 }
071830ff
LP
738}
739
02a51aba 740static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 741 int r;
02a51aba
LP
742
743 assert(fd >= 0);
02a51aba 744
1ff74fb6 745 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
746 if (isatty(fd) < 1) {
747 if (IN_SET(errno, EINVAL, ENOTTY))
748 return 0; /* not a tty */
1ff74fb6 749
02a51aba 750 return -errno;
4b3b5bc7 751 }
02a51aba 752
4b3b5bc7 753 /* This might fail. What matters are the results. */
f2df231f 754 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
755 if (r < 0)
756 return r;
02a51aba 757
4b3b5bc7 758 return 1;
02a51aba
LP
759}
760
aedec452 761static int setup_confirm_stdio(
51462135 762 const ExecContext *context,
aedec452
LP
763 const char *vc,
764 int *ret_saved_stdin,
765 int *ret_saved_stdout) {
766
3d18b167
LP
767 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
768 int r;
80876c20 769
aedec452
LP
770 assert(ret_saved_stdin);
771 assert(ret_saved_stdout);
80876c20 772
af6da548
LP
773 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
774 if (saved_stdin < 0)
775 return -errno;
80876c20 776
af6da548 777 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
778 if (saved_stdout < 0)
779 return -errno;
80876c20 780
8854d795 781 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
782 if (fd < 0)
783 return fd;
80876c20 784
af6da548
LP
785 r = chown_terminal(fd, getuid());
786 if (r < 0)
3d18b167 787 return r;
02a51aba 788
3d18b167
LP
789 r = reset_terminal_fd(fd, true);
790 if (r < 0)
791 return r;
80876c20 792
51462135
DDM
793 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
794 if (r < 0)
795 return r;
796
aedec452
LP
797 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
798 TAKE_FD(fd);
2b33ab09
LP
799 if (r < 0)
800 return r;
80876c20 801
aedec452
LP
802 *ret_saved_stdin = TAKE_FD(saved_stdin);
803 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 804 return 0;
80876c20
LP
805}
806
63d77c92 807static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
808 assert(err < 0);
809
810 if (err == -ETIMEDOUT)
63d77c92 811 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
812 else {
813 errno = -err;
63d77c92 814 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
815 }
816}
817
63d77c92 818static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 819 _cleanup_close_ int fd = -1;
80876c20 820
3b20f877 821 assert(vc);
80876c20 822
7d5ceb64 823 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 824 if (fd < 0)
3b20f877 825 return;
80876c20 826
63d77c92 827 write_confirm_error_fd(err, fd, u);
af6da548 828}
80876c20 829
/* Releases the terminal and puts the saved stdin/stdout descriptors back in place. Both saved
 * descriptors are closed and invalidated in any case. Returns 0 on success, or the negative errno of
 * the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
851
3b20f877
FB
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute, act as if the command failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute, act as if the command succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
857
51462135 858static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 859 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 860 _cleanup_free_ char *e = NULL;
3b20f877 861 char c;
af6da548 862
3b20f877 863 /* For any internal errors, assume a positive response. */
51462135 864 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 865 if (r < 0) {
63d77c92 866 write_confirm_error(r, vc, u);
3b20f877
FB
867 return CONFIRM_EXECUTE;
868 }
af6da548 869
b0eb2944
FB
870 /* confirm_spawn might have been disabled while we were sleeping. */
871 if (manager_is_confirm_spawn_disabled(u->manager)) {
872 r = 1;
873 goto restore_stdio;
874 }
af6da548 875
2bcd3c26
FB
876 e = ellipsize(cmdline, 60, 100);
877 if (!e) {
878 log_oom();
879 r = CONFIRM_EXECUTE;
880 goto restore_stdio;
881 }
af6da548 882
d172b175 883 for (;;) {
539622bd 884 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 885 if (r < 0) {
63d77c92 886 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
887 r = CONFIRM_EXECUTE;
888 goto restore_stdio;
889 }
af6da548 890
d172b175 891 switch (c) {
b0eb2944
FB
892 case 'c':
893 printf("Resuming normal execution.\n");
894 manager_disable_confirm_spawn();
895 r = 1;
896 break;
dd6f9ac0
FB
897 case 'D':
898 unit_dump(u, stdout, " ");
899 continue; /* ask again */
d172b175
FB
900 case 'f':
901 printf("Failing execution.\n");
902 r = CONFIRM_PRETEND_FAILURE;
903 break;
904 case 'h':
b0eb2944
FB
905 printf(" c - continue, proceed without asking anymore\n"
906 " D - dump, show the state of the unit\n"
dd6f9ac0 907 " f - fail, don't execute the command and pretend it failed\n"
d172b175 908 " h - help\n"
eedf223a 909 " i - info, show a short summary of the unit\n"
56fde33a 910 " j - jobs, show jobs that are in progress\n"
d172b175
FB
911 " s - skip, don't execute the command and pretend it succeeded\n"
912 " y - yes, execute the command\n");
dd6f9ac0 913 continue; /* ask again */
eedf223a
FB
914 case 'i':
915 printf(" Description: %s\n"
916 " Unit: %s\n"
917 " Command: %s\n",
918 u->id, u->description, cmdline);
919 continue; /* ask again */
56fde33a
FB
920 case 'j':
921 manager_dump_jobs(u->manager, stdout, " ");
922 continue; /* ask again */
539622bd
FB
923 case 'n':
924 /* 'n' was removed in favor of 'f'. */
925 printf("Didn't understand 'n', did you mean 'f'?\n");
926 continue; /* ask again */
d172b175
FB
927 case 's':
928 printf("Skipping execution.\n");
929 r = CONFIRM_PRETEND_SUCCESS;
930 break;
931 case 'y':
932 r = CONFIRM_EXECUTE;
933 break;
934 default:
04499a70 935 assert_not_reached();
d172b175 936 }
3b20f877 937 break;
3b20f877 938 }
af6da548 939
3b20f877 940restore_stdio:
af6da548 941 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 942 return r;
80876c20
LP
943}
944
4d885bd3
DH
945static int get_fixed_user(const ExecContext *c, const char **user,
946 uid_t *uid, gid_t *gid,
947 const char **home, const char **shell) {
81a2b7ce 948 int r;
4d885bd3 949 const char *name;
81a2b7ce 950
4d885bd3 951 assert(c);
81a2b7ce 952
23deef88
LP
953 if (!c->user)
954 return 0;
955
4d885bd3
DH
956 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
957 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 958
23deef88 959 name = c->user;
fafff8f1 960 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
961 if (r < 0)
962 return r;
81a2b7ce 963
4d885bd3
DH
964 *user = name;
965 return 0;
966}
967
968static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
969 int r;
970 const char *name;
971
972 assert(c);
973
974 if (!c->group)
975 return 0;
976
977 name = c->group;
fafff8f1 978 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
979 if (r < 0)
980 return r;
981
982 *group = name;
983 return 0;
984}
985
cdc5d5c5
DH
986static int get_supplementary_groups(const ExecContext *c, const char *user,
987 const char *group, gid_t gid,
988 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
989 int r, k = 0;
990 int ngroups_max;
991 bool keep_groups = false;
992 gid_t *groups = NULL;
993 _cleanup_free_ gid_t *l_gids = NULL;
994
995 assert(c);
996
bbeea271
DH
997 /*
998 * If user is given, then lookup GID and supplementary groups list.
999 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1000 * here and as early as possible so we keep the list of supplementary
1001 * groups of the caller.
bbeea271
DH
1002 */
1003 if (user && gid_is_valid(gid) && gid != 0) {
1004 /* First step, initialize groups from /etc/groups */
1005 if (initgroups(user, gid) < 0)
1006 return -errno;
1007
1008 keep_groups = true;
1009 }
1010
ac6e8be6 1011 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1012 return 0;
1013
366ddd25
DH
1014 /*
1015 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1016 * be positive, otherwise fail.
1017 */
1018 errno = 0;
1019 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1020 if (ngroups_max <= 0)
1021 return errno_or_else(EOPNOTSUPP);
366ddd25 1022
4d885bd3
DH
1023 l_gids = new(gid_t, ngroups_max);
1024 if (!l_gids)
1025 return -ENOMEM;
81a2b7ce 1026
4d885bd3
DH
1027 if (keep_groups) {
1028 /*
1029 * Lookup the list of groups that the user belongs to, we
1030 * avoid NSS lookups here too for gid=0.
1031 */
1032 k = ngroups_max;
1033 if (getgrouplist(user, gid, l_gids, &k) < 0)
1034 return -EINVAL;
1035 } else
1036 k = 0;
81a2b7ce 1037
4d885bd3
DH
1038 STRV_FOREACH(i, c->supplementary_groups) {
1039 const char *g;
81a2b7ce 1040
4d885bd3
DH
1041 if (k >= ngroups_max)
1042 return -E2BIG;
81a2b7ce 1043
4d885bd3 1044 g = *i;
fafff8f1 1045 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1046 if (r < 0)
1047 return r;
81a2b7ce 1048
4d885bd3
DH
1049 k++;
1050 }
81a2b7ce 1051
4d885bd3
DH
1052 /*
1053 * Sets ngids to zero to drop all supplementary groups, happens
1054 * when we are under root and SupplementaryGroups= is empty.
1055 */
1056 if (k == 0) {
1057 *ngids = 0;
1058 return 0;
1059 }
81a2b7ce 1060
4d885bd3
DH
1061 /* Otherwise get the final list of supplementary groups */
1062 groups = memdup(l_gids, sizeof(gid_t) * k);
1063 if (!groups)
1064 return -ENOMEM;
1065
1066 *supplementary_gids = groups;
1067 *ngids = k;
1068
1069 groups = NULL;
1070
1071 return 0;
1072}
1073
/* Applies the supplementary group list (if one was prepared, i.e. ngids > 0) and then, if "gid" is
 * valid, switches the real, effective and saved GID to it. Returns 0 on success, the error of
 * maybe_setgroups() or -errno of setresgid() on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1092
dbdc4098
TK
/* Updates the securebits of the calling process: all bits in "mask" are cleared, then all bits in
 * "bits" are set. Returns 0 if the securebits already had the requested value, 1 if they were
 * changed, and -errno if either prctl() call failed. */
static int set_securebits(int bits, int mask) {
        int before, wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = (before & ~mask) | bits;
        if (wanted == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, wanted) < 0 ? -errno : 1;
}
1106
81a2b7ce 1107static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1108 assert(context);
dbdc4098 1109 int r;
81a2b7ce 1110
4d885bd3
DH
1111 if (!uid_is_valid(uid))
1112 return 0;
1113
479050b3 1114 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1115 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1116 * required, so we also need keep-caps in this case.
1117 */
81a2b7ce 1118
dbdc4098 1119 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1120
1121 /* First step: If we need to keep capabilities but
1122 * drop privileges we need to make sure we keep our
cbb21cca 1123 * caps, while we drop privileges. */
693ced48 1124 if (uid != 0) {
dbdc4098
TK
1125 /* Add KEEP_CAPS to the securebits */
1126 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1127 if (r < 0)
1128 return r;
693ced48 1129 }
81a2b7ce
LP
1130 }
1131
479050b3 1132 /* Second step: actually set the uids */
81a2b7ce
LP
1133 if (setresuid(uid, uid, uid) < 0)
1134 return -errno;
1135
1136 /* At this point we should have all necessary capabilities but
1137 are otherwise a normal user. However, the caps might got
1138 corrupted due to the setresuid() so we need clean them up
1139 later. This is done outside of this call. */
1140
1141 return 0;
1142}
1143
349cc4a5 1144#if HAVE_PAM
5b6319dc
LP
1145
1146static int null_conv(
1147 int num_msg,
1148 const struct pam_message **msg,
1149 struct pam_response **resp,
1150 void *appdata_ptr) {
1151
1152 /* We don't support conversations */
1153
1154 return PAM_CONV_ERR;
1155}
1156
cefc33ae
LP
1157#endif
1158
5b6319dc
LP
/*
 * Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that stays
 * around in order to tear the session down again later.
 *
 *   name        PAM service name, passed to pam_start()
 *   user        user name, passed to pam_start()
 *   uid, gid    credentials the helper process tries to drop to
 *   tty         TTY to report to PAM; if NULL it is derived from stdin, with "systemd" as fallback
 *   env         environment that is exported to PAM; replaced by PAM's environment list on success
 *   fds, n_fds  file descriptors the helper process closes right after the fork
 *
 * If PAM support is not compiled in this does nothing and returns 0. Otherwise a negative error
 * code is returned on failure (-EPERM for any PAM-level error, since PAM errors do not map to
 * errno), and the result of strv_free_and_replace() on success.
 */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
                else
                        /* If everything else failed then let's just use value "systemd". This will cause that session
                         * isn't going to be marked as "background" and user manager will be started. */
                        tty = "systemd";
        }

        pam_code = pam_set_item(handle, PAM_TTY, tty);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Export the environment we were passed to PAM */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* Note that sigwait() reports errors via its return value (a positive
                                 * error number). It never returns a negative value, and does not set
                                 * errno. */
                                int k = sigwait(&ss, &sig);
                                if (k == EINTR)
                                        continue;
                                if (k != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1377
5d6b1584
LP
/* Renames the calling process after the last component of "path", wrapped in parentheses. If the
 * name is longer than 8 characters only its last 8 characters are used; if it is empty "(...)" is
 * used instead. */
static void rename_process_from_path(const char *path) {
        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps: '(' + at most 8 chars + ')', plus the trailing NUL. */
        char process_name[11];
        const char *base;
        size_t len;

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        process_name[0] = '(';
        memcpy(process_name + 1, base, len);
        process_name[len + 1] = ')';
        process_name[len + 2] = 0;

        rename_process(process_name);
}
1408
469830d1
LP
1409static bool context_has_address_families(const ExecContext *c) {
1410 assert(c);
1411
6b000af4 1412 return c->address_families_allow_list ||
469830d1
LP
1413 !set_isempty(c->address_families);
1414}
1415
1416static bool context_has_syscall_filters(const ExecContext *c) {
1417 assert(c);
1418
6b000af4 1419 return c->syscall_allow_list ||
8cfa775f 1420 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1421}
1422
9df2cdd8
TM
1423static bool context_has_syscall_logs(const ExecContext *c) {
1424 assert(c);
1425
1426 return c->syscall_log_allow_list ||
1427 !hashmap_isempty(c->syscall_log);
1428}
1429
469830d1
LP
1430static bool context_has_no_new_privileges(const ExecContext *c) {
1431 assert(c);
1432
1433 if (c->no_new_privileges)
1434 return true;
1435
1436 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1437 return false;
1438
1439 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1440 return c->lock_personality ||
469830d1 1441 c->memory_deny_write_execute ||
0538d2a8 1442 c->private_devices ||
fc64760d 1443 c->protect_clock ||
0538d2a8 1444 c->protect_hostname ||
469830d1
LP
1445 c->protect_kernel_tunables ||
1446 c->protect_kernel_modules ||
84703040 1447 c->protect_kernel_logs ||
0538d2a8
YW
1448 context_has_address_families(c) ||
1449 exec_context_restrict_namespaces_set(c) ||
1450 c->restrict_realtime ||
1451 c->restrict_suid_sgid ||
78e864e5 1452 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1453 context_has_syscall_filters(c) ||
1454 context_has_syscall_logs(c);
469830d1
LP
1455}
1456
bb0c0d6f
LP
1457static bool exec_context_has_credentials(const ExecContext *context) {
1458
1459 assert(context);
1460
1461 return !hashmap_isempty(context->set_credentials) ||
43144be4 1462 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1463}
1464
349cc4a5 1465#if HAVE_SECCOMP
17df7223 1466
83f12b27 1467static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1468
1469 if (is_seccomp_available())
1470 return false;
1471
f673b62d 1472 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1473 return true;
83f12b27
FS
1474}
1475
165a31c0 1476static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1477 uint32_t negative_action, default_action, action;
165a31c0 1478 int r;
8351ceae 1479
469830d1 1480 assert(u);
c0467cf3 1481 assert(c);
8351ceae 1482
469830d1 1483 if (!context_has_syscall_filters(c))
83f12b27
FS
1484 return 0;
1485
469830d1
LP
1486 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1487 return 0;
e9642be2 1488
005bfaf1 1489 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1490
6b000af4 1491 if (c->syscall_allow_list) {
469830d1
LP
1492 default_action = negative_action;
1493 action = SCMP_ACT_ALLOW;
7c66bae2 1494 } else {
469830d1
LP
1495 default_action = SCMP_ACT_ALLOW;
1496 action = negative_action;
57183d11 1497 }
8351ceae 1498
165a31c0 1499 if (needs_ambient_hack) {
6b000af4 1500 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1501 if (r < 0)
1502 return r;
1503 }
1504
b54f36c6 1505 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1506}
1507
9df2cdd8
TM
1508static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1509#ifdef SCMP_ACT_LOG
1510 uint32_t default_action, action;
1511#endif
1512
1513 assert(u);
1514 assert(c);
1515
1516 if (!context_has_syscall_logs(c))
1517 return 0;
1518
1519#ifdef SCMP_ACT_LOG
1520 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1521 return 0;
1522
1523 if (c->syscall_log_allow_list) {
1524 /* Log nothing but the ones listed */
1525 default_action = SCMP_ACT_ALLOW;
1526 action = SCMP_ACT_LOG;
1527 } else {
1528 /* Log everything but the ones listed */
1529 default_action = SCMP_ACT_LOG;
1530 action = SCMP_ACT_ALLOW;
1531 }
1532
1533 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1534#else
1535 /* old libseccomp */
1536 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1537 return 0;
1538#endif
1539}
1540
469830d1
LP
1541static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1542 assert(u);
4298d0b5
LP
1543 assert(c);
1544
469830d1 1545 if (set_isempty(c->syscall_archs))
83f12b27
FS
1546 return 0;
1547
469830d1
LP
1548 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1549 return 0;
4298d0b5 1550
469830d1
LP
1551 return seccomp_restrict_archs(c->syscall_archs);
1552}
4298d0b5 1553
469830d1
LP
1554static int apply_address_families(const Unit* u, const ExecContext *c) {
1555 assert(u);
1556 assert(c);
4298d0b5 1557
469830d1
LP
1558 if (!context_has_address_families(c))
1559 return 0;
4298d0b5 1560
469830d1
LP
1561 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1562 return 0;
4298d0b5 1563
6b000af4 1564 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1565}
4298d0b5 1566
83f12b27 1567static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1568 assert(u);
f3e43635
TM
1569 assert(c);
1570
469830d1 1571 if (!c->memory_deny_write_execute)
83f12b27
FS
1572 return 0;
1573
469830d1
LP
1574 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1575 return 0;
f3e43635 1576
469830d1 1577 return seccomp_memory_deny_write_execute();
f3e43635
TM
1578}
1579
83f12b27 1580static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1581 assert(u);
f4170c67
LP
1582 assert(c);
1583
469830d1 1584 if (!c->restrict_realtime)
83f12b27
FS
1585 return 0;
1586
469830d1
LP
1587 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1588 return 0;
f4170c67 1589
469830d1 1590 return seccomp_restrict_realtime();
f4170c67
LP
1591}
1592
f69567cb
LP
1593static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
1596
1597 if (!c->restrict_suid_sgid)
1598 return 0;
1599
1600 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1601 return 0;
1602
1603 return seccomp_restrict_suid_sgid();
1604}
1605
59e856c7 1606static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1607 assert(u);
59eeb84b
LP
1608 assert(c);
1609
1610 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1611 * let's protect even those systems where this is left on in the kernel. */
1612
469830d1 1613 if (!c->protect_kernel_tunables)
59eeb84b
LP
1614 return 0;
1615
469830d1
LP
1616 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1617 return 0;
59eeb84b 1618
469830d1 1619 return seccomp_protect_sysctl();
59eeb84b
LP
1620}
1621
59e856c7 1622static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1623 assert(u);
502d704e
DH
1624 assert(c);
1625
25a8d8a0 1626 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1627
469830d1
LP
1628 if (!c->protect_kernel_modules)
1629 return 0;
1630
502d704e
DH
1631 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1632 return 0;
1633
b54f36c6 1634 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1635}
1636
84703040
KK
1637static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1638 assert(u);
1639 assert(c);
1640
1641 if (!c->protect_kernel_logs)
1642 return 0;
1643
1644 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1645 return 0;
1646
1647 return seccomp_protect_syslog();
1648}
1649
daf8f72b 1650static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1651 assert(u);
1652 assert(c);
1653
1654 if (!c->protect_clock)
1655 return 0;
1656
1657 if (skip_seccomp_unavailable(u, "ProtectClock="))
1658 return 0;
1659
1660 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1661}
1662
59e856c7 1663static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1664 assert(u);
ba128bb8
LP
1665 assert(c);
1666
8f81a5f6 1667 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1668
469830d1
LP
1669 if (!c->private_devices)
1670 return 0;
1671
ba128bb8
LP
1672 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1673 return 0;
1674
b54f36c6 1675 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1676}
1677
34cf6c43 1678static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1679 assert(u);
add00535
LP
1680 assert(c);
1681
1682 if (!exec_context_restrict_namespaces_set(c))
1683 return 0;
1684
1685 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1686 return 0;
1687
1688 return seccomp_restrict_namespaces(c->restrict_namespaces);
1689}
1690
78e864e5 1691static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1692 unsigned long personality;
1693 int r;
78e864e5
TM
1694
1695 assert(u);
1696 assert(c);
1697
1698 if (!c->lock_personality)
1699 return 0;
1700
1701 if (skip_seccomp_unavailable(u, "LockPersonality="))
1702 return 0;
1703
e8132d63
LP
1704 personality = c->personality;
1705
1706 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1707 if (personality == PERSONALITY_INVALID) {
1708
1709 r = opinionated_personality(&personality);
1710 if (r < 0)
1711 return r;
1712 }
78e864e5
TM
1713
1714 return seccomp_lock_personality(personality);
1715}
1716
c0467cf3 1717#endif
8351ceae 1718
7a8288f6 1719#if HAVE_LIBBPF
7a8288f6
DM
1720static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1721 assert(u);
1722 assert(c);
1723
1724 if (!exec_context_restrict_filesystems_set(c))
1725 return 0;
1726
46004616
ZJS
1727 if (!u->manager->restrict_fs) {
1728 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1729 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1730 return 0;
46004616 1731 }
7a8288f6
DM
1732
1733 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1734}
1735#endif
1736
daf8f72b 1737static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1738 assert(u);
1739 assert(c);
1740
1741 if (!c->protect_hostname)
1742 return 0;
1743
1744 if (ns_type_supported(NAMESPACE_UTS)) {
1745 if (unshare(CLONE_NEWUTS) < 0) {
1746 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1747 *ret_exit_status = EXIT_NAMESPACE;
1748 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1749 }
1750
1751 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1752 }
1753 } else
1754 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1755
1756#if HAVE_SECCOMP
8f3e342f
ZJS
1757 int r;
1758
daf8f72b
LP
1759 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1760 return 0;
1761
1762 r = seccomp_protect_hostname();
1763 if (r < 0) {
1764 *ret_exit_status = EXIT_SECCOMP;
1765 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1766 }
1767#endif
1768
1769 return 0;
1770}
1771
3042bbeb 1772static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1773 assert(idle_pipe);
1774
54eb2300
LP
1775 idle_pipe[1] = safe_close(idle_pipe[1]);
1776 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1777
1778 if (idle_pipe[0] >= 0) {
1779 int r;
1780
1781 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1782
1783 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1784 ssize_t n;
1785
31a7eb86 1786 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1787 n = write(idle_pipe[3], "x", 1);
1788 if (n > 0)
cd972d69 1789 /* Wait for systemd to react to the signal above. */
54756dce 1790 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1791 }
1792
54eb2300 1793 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1794
1795 }
1796
54eb2300 1797 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1798}
1799
fb2042dd
YW
1800static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1801
7cae38c4 1802static int build_environment(
34cf6c43 1803 const Unit *u,
9fa95f85 1804 const ExecContext *c,
1e22b5cd 1805 const ExecParameters *p,
da6053d0 1806 size_t n_fds,
7cae38c4
LP
1807 const char *home,
1808 const char *username,
1809 const char *shell,
7bce046b
LP
1810 dev_t journal_stream_dev,
1811 ino_t journal_stream_ino,
7cae38c4
LP
1812 char ***ret) {
1813
1814 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1815 size_t n_env = 0;
7cae38c4
LP
1816 char *x;
1817
4b58153d 1818 assert(u);
7cae38c4 1819 assert(c);
7c1cb6f1 1820 assert(p);
7cae38c4
LP
1821 assert(ret);
1822
dc4e2940 1823#define N_ENV_VARS 17
8d5bb13d 1824 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1825 if (!our_env)
1826 return -ENOMEM;
1827
1828 if (n_fds > 0) {
8dd4c05b
LP
1829 _cleanup_free_ char *joined = NULL;
1830
df0ff127 1831 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1832 return -ENOMEM;
1833 our_env[n_env++] = x;
1834
da6053d0 1835 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1836 return -ENOMEM;
1837 our_env[n_env++] = x;
8dd4c05b 1838
1e22b5cd 1839 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1840 if (!joined)
1841 return -ENOMEM;
1842
605405c6 1843 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1844 if (!x)
1845 return -ENOMEM;
1846 our_env[n_env++] = x;
7cae38c4
LP
1847 }
1848
b08af3b1 1849 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1850 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1851 return -ENOMEM;
1852 our_env[n_env++] = x;
1853
1e22b5cd 1854 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1855 return -ENOMEM;
1856 our_env[n_env++] = x;
1857 }
1858
de90700f
LP
1859 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1860 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1861 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1862 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1863 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1864 if (!x)
1865 return -ENOMEM;
1866 our_env[n_env++] = x;
1867 }
1868
7cae38c4 1869 if (home) {
b910cc72 1870 x = strjoin("HOME=", home);
7cae38c4
LP
1871 if (!x)
1872 return -ENOMEM;
7bbead1d 1873
4ff361cc 1874 path_simplify(x + 5);
7cae38c4
LP
1875 our_env[n_env++] = x;
1876 }
1877
1878 if (username) {
b910cc72 1879 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1880 if (!x)
1881 return -ENOMEM;
1882 our_env[n_env++] = x;
1883
b910cc72 1884 x = strjoin("USER=", username);
7cae38c4
LP
1885 if (!x)
1886 return -ENOMEM;
1887 our_env[n_env++] = x;
1888 }
1889
1890 if (shell) {
b910cc72 1891 x = strjoin("SHELL=", shell);
7cae38c4
LP
1892 if (!x)
1893 return -ENOMEM;
7bbead1d 1894
4ff361cc 1895 path_simplify(x + 6);
7cae38c4
LP
1896 our_env[n_env++] = x;
1897 }
1898
4b58153d
LP
1899 if (!sd_id128_is_null(u->invocation_id)) {
1900 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1901 return -ENOMEM;
1902
1903 our_env[n_env++] = x;
1904 }
1905
6af760f3
LP
1906 if (exec_context_needs_term(c)) {
1907 const char *tty_path, *term = NULL;
1908
1909 tty_path = exec_context_tty_path(c);
1910
e8cf09b2
LP
1911 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1912 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1913 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1914
e8cf09b2 1915 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1916 term = getenv("TERM");
e8cf09b2 1917
6af760f3
LP
1918 if (!term)
1919 term = default_term_for_tty(tty_path);
7cae38c4 1920
b910cc72 1921 x = strjoin("TERM=", term);
7cae38c4
LP
1922 if (!x)
1923 return -ENOMEM;
1924 our_env[n_env++] = x;
1925 }
1926
7bce046b
LP
1927 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1928 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1929 return -ENOMEM;
1930
1931 our_env[n_env++] = x;
1932 }
1933
91dd5f7c
LP
1934 if (c->log_namespace) {
1935 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1936 if (!x)
1937 return -ENOMEM;
1938
1939 our_env[n_env++] = x;
1940 }
1941
5b10116e 1942 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1943 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1944 const char *n;
1945
1946 if (!p->prefix[t])
1947 continue;
1948
211a3d87 1949 if (c->directories[t].n_items == 0)
fb2042dd
YW
1950 continue;
1951
1952 n = exec_directory_env_name_to_string(t);
1953 if (!n)
1954 continue;
1955
211a3d87
LB
1956 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1957 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1958
211a3d87
LB
1959 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1960 if (!prefixed)
1961 return -ENOMEM;
1962
1963 if (!strextend_with_separator(&joined, ":", prefixed))
1964 return -ENOMEM;
1965 }
fb2042dd
YW
1966
1967 x = strjoin(n, "=", joined);
1968 if (!x)
1969 return -ENOMEM;
1970
1971 our_env[n_env++] = x;
1972 }
1973
bb0c0d6f
LP
1974 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1975 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1976 if (!x)
1977 return -ENOMEM;
1978
1979 our_env[n_env++] = x;
1980 }
1981
dc4e2940
YW
1982 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1983 return -ENOMEM;
1984
1985 our_env[n_env++] = x;
1986
7cae38c4 1987 our_env[n_env++] = NULL;
8d5bb13d
LP
1988 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1989#undef N_ENV_VARS
7cae38c4 1990
ae2a15bc 1991 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1992
1993 return 0;
1994}
1995
b4c14404
FB
1996static int build_pass_environment(const ExecContext *c, char ***ret) {
1997 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1998 size_t n_env = 0;
b4c14404
FB
1999
2000 STRV_FOREACH(i, c->pass_environment) {
2001 _cleanup_free_ char *x = NULL;
2002 char *v;
2003
2004 v = getenv(*i);
2005 if (!v)
2006 continue;
605405c6 2007 x = strjoin(*i, "=", v);
b4c14404
FB
2008 if (!x)
2009 return -ENOMEM;
00819cc1 2010
319a4f4b 2011 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2012 return -ENOMEM;
00819cc1 2013
1cc6c93a 2014 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2015 pass_env[n_env] = NULL;
b4c14404
FB
2016 }
2017
ae2a15bc 2018 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2019
2020 return 0;
2021}
2022
5e8deb94 2023bool exec_needs_mount_namespace(
8b44a3d2
LP
2024 const ExecContext *context,
2025 const ExecParameters *params,
4657abb5 2026 const ExecRuntime *runtime) {
8b44a3d2
LP
2027
2028 assert(context);
8b44a3d2 2029
915e6d16
LP
2030 if (context->root_image)
2031 return true;
2032
2a624c36
AP
2033 if (!strv_isempty(context->read_write_paths) ||
2034 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2035 !strv_isempty(context->inaccessible_paths) ||
2036 !strv_isempty(context->exec_paths) ||
2037 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2038 return true;
2039
42b1d8e0 2040 if (context->n_bind_mounts > 0)
d2d6c096
LP
2041 return true;
2042
2abd4e38
YW
2043 if (context->n_temporary_filesystems > 0)
2044 return true;
2045
b3d13314
LB
2046 if (context->n_mount_images > 0)
2047 return true;
2048
93f59701
LB
2049 if (context->n_extension_images > 0)
2050 return true;
2051
a07b9926
LB
2052 if (!strv_isempty(context->extension_directories))
2053 return true;
2054
37ed15d7 2055 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2056 return true;
2057
2058 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2059 return true;
2060
8b44a3d2 2061 if (context->private_devices ||
228af36f 2062 context->private_mounts ||
8b44a3d2 2063 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2064 context->protect_home != PROTECT_HOME_NO ||
2065 context->protect_kernel_tunables ||
c575770b 2066 context->protect_kernel_modules ||
94a7b275 2067 context->protect_kernel_logs ||
4e399953
LP
2068 context->protect_control_groups ||
2069 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2070 context->proc_subset != PROC_SUBSET_ALL ||
2071 context->private_ipc ||
2072 context->ipc_namespace_path)
8b44a3d2
LP
2073 return true;
2074
37c56f89 2075 if (context->root_directory) {
5e98086d 2076 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2077 return true;
2078
5b10116e 2079 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2080 if (params && !params->prefix[t])
37c56f89
YW
2081 continue;
2082
211a3d87 2083 if (context->directories[t].n_items > 0)
37c56f89
YW
2084 return true;
2085 }
2086 }
5d997827 2087
42b1d8e0 2088 if (context->dynamic_user &&
211a3d87
LB
2089 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2090 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2091 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2092 return true;
2093
91dd5f7c
LP
2094 if (context->log_namespace)
2095 return true;
2096
8b44a3d2
LP
2097 return false;
2098}
2099
/* Moves the calling process into a new user namespace (unshare(CLONE_NEWUSER)) and has a forked helper
 * child write the parent's "setgroups", "gid_map" and "uid_map" procfs files.
 *
 * ouid/ogid are always mapped to themselves; uid/gid are additionally mapped to themselves if they are
 * valid, differ from ouid/ogid and we hold CAP_SETUID/CAP_SETGID respectively.
 *
 * Returns 0 on success, a negative errno-style value on failure (including an error code forwarded by the
 * helper child through errno_pipe, or -EIO if the child misbehaved). */
5749f855 2100static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2101 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2102 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2103 _cleanup_close_ int unshare_ready_fd = -1;
2104 _cleanup_(sigkill_waitp) pid_t pid = 0;
2105 uint64_t c = 1;
d251207d
LP
2106 ssize_t n;
2107 int r;
2108
5749f855
AZ
2109 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2110 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2111 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2112 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2113 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2114 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2115 * continues execution normally.
2116 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2117 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2118
5749f855
AZ
2119 /* Can only set up multiple mappings with CAP_SETUID. */
2120 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2121 r = asprintf(&uid_map,
5749f855 2122 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2123 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2124 ouid, ouid, uid, uid);
2125 else
2126 r = asprintf(&uid_map,
2127 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2128 ouid, ouid);
d251207d 2129
5749f855
AZ
2130 if (r < 0)
2131 return -ENOMEM;
2132
2133 /* Can only set up multiple mappings with CAP_SETGID. */
2134 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2135 r = asprintf(&gid_map,
5749f855 2136 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2137 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2138 ogid, ogid, gid, gid);
2139 else
2140 r = asprintf(&gid_map,
2141 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2142 ogid, ogid);
2143
2144 if (r < 0)
2145 return -ENOMEM;
d251207d
LP
2146
2147 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2148 * namespace. */
2149 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2150 if (unshare_ready_fd < 0)
2151 return -errno;
2152
2153 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2154 * failed. */
2155 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2156 return -errno;
2157
4c253ed1
LP
2158 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2159 if (r < 0)
2160 return r;
2161 if (r == 0) {
d251207d
LP
2162 _cleanup_close_ int fd = -1;
2163 const char *a;
2164 pid_t ppid;
2165
2166 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2167 * here, after the parent opened its own user namespace. */
2168
2169 ppid = getppid();
2170 errno_pipe[0] = safe_close(errno_pipe[0]);
2171
2172 /* Wait until the parent unshared the user namespace */
2173 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2174 r = -errno;
2175 goto child_fail;
2176 }
2177
2178 /* Disable the setgroups() system call in the child user namespace, for good. */
2179 a = procfs_file_alloca(ppid, "setgroups");
2180 fd = open(a, O_WRONLY|O_CLOEXEC);
2181 if (fd < 0) {
2182 if (errno != ENOENT) {
2183 r = -errno;
2184 goto child_fail;
2185 }
2186
2187 /* If the file is missing the kernel is too old, let's continue anyway. */
2188 } else {
2189 if (write(fd, "deny\n", 5) < 0) {
2190 r = -errno;
2191 goto child_fail;
2192 }
2193
2194 fd = safe_close(fd);
2195 }
2196
2197 /* First write the GID map */
2198 a = procfs_file_alloca(ppid, "gid_map");
2199 fd = open(a, O_WRONLY|O_CLOEXEC);
2200 if (fd < 0) {
2201 r = -errno;
2202 goto child_fail;
2203 }
2204 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2205 r = -errno;
2206 goto child_fail;
2207 }
2208 fd = safe_close(fd);
2209
2210 /* Then write the UID map */
2211 a = procfs_file_alloca(ppid, "uid_map");
2212 fd = open(a, O_WRONLY|O_CLOEXEC);
2213 if (fd < 0) {
2214 r = -errno;
2215 goto child_fail;
2216 }
2217 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2218 r = -errno;
2219 goto child_fail;
2220 }
2221
2222 _exit(EXIT_SUCCESS);
2223
2224 child_fail:
2225 (void) write(errno_pipe[1], &r, sizeof(r));
2226 _exit(EXIT_FAILURE);
2227 }
2228
/* Parent: drop our copy of the pipe's write end, so that the read() further down returns 0 bytes once
 * the child is gone without having reported an error. */
2229 errno_pipe[1] = safe_close(errno_pipe[1]);
2230
2231 if (unshare(CLONE_NEWUSER) < 0)
2232 return -errno;
2233
2234 /* Let the child know that the namespace is ready now */
2235 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2236 return -errno;
2237
2238 /* Try to read an error code from the child */
2239 n = read(errno_pipe[0], &r, sizeof(r));
2240 if (n < 0)
2241 return -errno;
2242 if (n == sizeof(r)) { /* an error code was sent to us */
2243 if (r < 0)
2244 return r;
2245 return -EIO;
2246 }
2247 if (n != 0) /* on success we should have read 0 bytes */
2248 return -EIO;
2249
8f03de53 2250 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2251 if (r < 0)
2252 return r;
2e87a1fd 2253 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2254 return -EIO;
2255
2256 return 0;
2257}
2258
494d0247
YW
2259static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2260 if (!context->dynamic_user)
2261 return false;
2262
2263 if (type == EXEC_DIRECTORY_CONFIGURATION)
2264 return false;
2265
2266 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2267 return false;
2268
2269 return true;
2270}
2271
211a3d87
LB
2272static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2273 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2274 int r;
2275
2276 assert(source);
2277
2278 src_abs = path_join(root, source);
2279 if (!src_abs)
2280 return -ENOMEM;
2281
2282 STRV_FOREACH(dst, symlinks) {
2283 _cleanup_free_ char *dst_abs = NULL;
2284
2285 dst_abs = path_join(root, *dst);
2286 if (!dst_abs)
2287 return -ENOMEM;
2288
2289 r = mkdir_parents_label(dst_abs, 0755);
2290 if (r < 0)
2291 return r;
2292
2293 r = symlink_idempotent(src_abs, dst_abs, true);
2294 if (r < 0)
2295 return r;
2296 }
2297
2298 return 0;
2299}
2300
/* Creates the directories listed in context->directories[type] below params->prefix[type] and adjusts
 * their access mode and ownership.
 *
 * If no prefix is set for the type, nothing is done and 0 is returned. With EXEC_CHOWN_DIRECTORIES set in
 * params->flags, an invalid uid/gid is replaced by 0. If exec_directory_is_private() holds for the type,
 * the directory is created below "<prefix>/private/" and symlinked from its regular location; a
 * pre-existing directory on the "other" side is migrated via rename(). An already existing directory of
 * type EXEC_DIRECTORY_CONFIGURATION is left untouched apart from a warning on mode mismatch. If
 * needs_mount_namespace is false, the per-item symlinks are created here too.
 *
 * Returns 0 on success. On failure returns a negative errno-style value and stores the exit status
 * matching the directory type in *exit_status. */
3536f49e 2301static int setup_exec_directory(
07689d5d
LP
2302 const ExecContext *context,
2303 const ExecParameters *params,
2304 uid_t uid,
3536f49e 2305 gid_t gid,
3536f49e 2306 ExecDirectoryType type,
211a3d87 2307 bool needs_mount_namespace,
3536f49e 2308 int *exit_status) {
07689d5d 2309
72fd1768 2310 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2311 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2312 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2313 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2314 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2315 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2316 };
07689d5d
LP
2317 int r;
2318
2319 assert(context);
2320 assert(params);
72fd1768 2321 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2322 assert(exit_status);
07689d5d 2323
3536f49e
YW
2324 if (!params->prefix[type])
2325 return 0;
2326
8679efde 2327 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2328 if (!uid_is_valid(uid))
2329 uid = 0;
2330 if (!gid_is_valid(gid))
2331 gid = 0;
2332 }
2333
211a3d87 2334 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2335 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2336
211a3d87 2337 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2338 if (!p) {
2339 r = -ENOMEM;
2340 goto fail;
2341 }
07689d5d 2342
23a7448e
YW
2343 r = mkdir_parents_label(p, 0755);
2344 if (r < 0)
3536f49e 2345 goto fail;
23a7448e 2346
494d0247 2347 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2348 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2349 * case we want to avoid leaving a directory around fully accessible that is owned by
2350 * a dynamic user whose UID is later on reused. To lock this down we use the same
2351 * trick used by container managers to prohibit host users to get access to files of
2352 * the same UID in containers: we place everything inside a directory that has an
2353 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2354 * for unprivileged host code. We then use fs namespacing to make this directory
2355 * permeable for the service itself.
6c47cd7d 2356 *
3f5b1508
LP
2357 * Specifically: for a service which wants a special directory "foo/" we first create
2358 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2359 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2360 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2361 * unprivileged host users can't look into it. Inside of the namespace of the unit
2362 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2363 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2364 * for the service and making sure it only gets access to the dirs it needs but no
2365 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2366 *
3f5b1508
LP
2367 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2368 * to be owned by the service itself.
2369 *
2370 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2371 * for sharing files or sockets with other services. */
6c47cd7d 2372
4ede9802
LP
2373 pp = path_join(params->prefix[type], "private");
2374 if (!pp) {
6c47cd7d
LP
2375 r = -ENOMEM;
2376 goto fail;
2377 }
2378
2379 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2380 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2381 if (r < 0)
2382 goto fail;
2383
211a3d87 2384 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2385 r = -ENOMEM;
2386 goto fail;
2387 }
2388
2389 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2390 r = mkdir_parents_label(pp, 0755);
2391 if (r < 0)
2392 goto fail;
2393
949befd3
LP
2394 if (is_dir(p, false) > 0 &&
2395 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2396
2397 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2398 * it over. Most likely the service has been upgraded from one that didn't use
2399 * DynamicUser=1, to one that does. */
2400
cf52c45d
LP
2401 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2402 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2403 exec_directory_type_to_string(type), p, pp);
2404
949befd3
LP
2405 if (rename(p, pp) < 0) {
2406 r = -errno;
2407 goto fail;
2408 }
2409 } else {
2410 /* Otherwise, create the actual directory for the service */
2411
2412 r = mkdir_label(pp, context->directories[type].mode);
2413 if (r < 0 && r != -EEXIST)
2414 goto fail;
2415 }
6c47cd7d 2416
df61e79a
LB
2417 /* And link it up from the original place. Note that if a mount namespace is going to be
2418 * used, then this symlink remains on the host, and a new one for the child namespace will
2419 * be created later. */
6c9c51e5 2420 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2421 if (r < 0)
2422 goto fail;
2423
6c47cd7d 2424 } else {
5c6d40d1
LP
2425 _cleanup_free_ char *target = NULL;
2426
2427 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2428 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2429 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2430
2431 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2432 * by DynamicUser=1 (see above)?
2433 *
2434 * We do this for all directory types except for ConfigurationDirectory=,
2435 * since they all support the private/ symlink logic at least in some
2436 * configurations, see above. */
5c6d40d1 2437
578dc69f
YW
2438 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2439 if (r < 0)
2440 goto fail;
2441
211a3d87 2442 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2443 if (!q) {
2444 r = -ENOMEM;
2445 goto fail;
2446 }
2447
578dc69f
YW
2448 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2449 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2450 if (r < 0)
2451 goto fail;
2452
2453 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2454
2455 /* Hmm, apparently DynamicUser= was once turned on for this service,
2456 * but is no longer. Let's move the directory back up. */
2457
cf52c45d
LP
2458 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2459 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2460 exec_directory_type_to_string(type), q, p);
2461
5c6d40d1
LP
2462 if (unlink(p) < 0) {
2463 r = -errno;
2464 goto fail;
2465 }
2466
2467 if (rename(q, p) < 0) {
2468 r = -errno;
2469 goto fail;
2470 }
2471 }
2472 }
2473
6c47cd7d 2474 r = mkdir_label(p, context->directories[type].mode);
d484580c 2475 if (r < 0) {
d484580c
LP
2476 if (r != -EEXIST)
2477 goto fail;
2478
206e9864
LP
2479 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2480 struct stat st;
2481
2482 /* Don't change the owner/access mode of the configuration directory,
2483 * as in the common case it is not written to by a service, and shall
2484 * not be writable. */
2485
2486 if (stat(p, &st) < 0) {
2487 r = -errno;
2488 goto fail;
2489 }
2490
2491 /* Still complain if the access mode doesn't match */
2492 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2493 log_warning("%s \'%s\' already exists but the mode is different. "
2494 "(File system: %o %sMode: %o)",
211a3d87 2495 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2496 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2497
6cff72eb 2498 continue;
206e9864 2499 }
6cff72eb 2500 }
a1164ae3 2501 }
07689d5d 2502
206e9864 2503 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2504 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2505 * current UID/GID ownership.) */
2506 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2507 if (r < 0)
2508 goto fail;
c71b2eb7 2509
607b358e
LP
2510 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2511 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2512 * assignments to exist. */
607b358e 2513 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2514 if (r < 0)
3536f49e 2515 goto fail;
07689d5d
LP
2516 }
2517
211a3d87
LB
2518 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2519 * they are set up later, to allow configuring empty var/run/etc. */
2520 if (!needs_mount_namespace)
2521 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2522 r = create_many_symlinks(params->prefix[type],
2523 context->directories[type].items[i].path,
2524 context->directories[type].items[i].symlinks);
2525 if (r < 0)
2526 goto fail;
2527 }
2528
07689d5d 2529 return 0;
3536f49e
YW
2530
2531fail:
/* Tell the caller which directory type failed, by means of the exit status assigned to it */
2532 *exit_status = exit_status_table[type];
3536f49e 2533 return r;
07689d5d
LP
2534}
2535
bb0c0d6f
LP
2536static int write_credential(
2537 int dfd,
2538 const char *id,
2539 const void *data,
2540 size_t size,
2541 uid_t uid,
2542 bool ownership_ok) {
2543
2544 _cleanup_(unlink_and_freep) char *tmp = NULL;
2545 _cleanup_close_ int fd = -1;
2546 int r;
2547
2548 r = tempfn_random_child("", "cred", &tmp);
2549 if (r < 0)
2550 return r;
2551
2552 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2553 if (fd < 0) {
2554 tmp = mfree(tmp);
2555 return -errno;
2556 }
2557
43144be4 2558 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2559 if (r < 0)
2560 return r;
2561
2562 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2563 return -errno;
2564
2565 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2566 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2567 if (r < 0) {
2568 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2569 return r;
2570
2571 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2572 * to express: that the user gets read access and nothing
2573 * else. But if the backing fs can't support that (e.g. ramfs)
2574 * then we can use file ownership instead. But that's only safe if
2575 * we can then re-mount the whole thing read-only, so that the
2576 * user can no longer chmod() the file to gain write access. */
2577 return r;
2578
f5fbe71d 2579 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2580 return -errno;
2581 }
2582 }
2583
2584 if (renameat(dfd, tmp, dfd, id) < 0)
2585 return -errno;
2586
2587 tmp = mfree(tmp);
2588 return 0;
2589}
2590
2ad591a3
LP
2591static char **credential_search_path(
2592 const ExecParameters *params,
2593 bool encrypted) {
2594
2595 _cleanup_strv_free_ char **l = NULL;
2596
2597 assert(params);
2598
2599 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2600 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2601 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2602
2603 if (encrypted) {
2604 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2605 return NULL;
2606
2607 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2608 return NULL;
2609 }
2610
2611 if (params->received_credentials_directory)
2612 if (strv_extend(&l, params->received_credentials_directory) < 0)
2613 return NULL;
2614
2615 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2616 return NULL;
2617
2618 if (DEBUG_LOGGING) {
2619 _cleanup_free_ char *t = strv_join(l, ":");
2620
2621 log_debug("Credential search path is: %s", t);
2622 }
2623
2624 return TAKE_PTR(l);
2625}
2626
3989bdc1
AB
2627static int load_credential(
2628 const ExecContext *context,
2629 const ExecParameters *params,
10b44e1d
LP
2630 const char *id,
2631 const char *path,
2632 bool encrypted,
3989bdc1
AB
2633 const char *unit,
2634 int read_dfd,
2635 int write_dfd,
2636 uid_t uid,
2637 bool ownership_ok,
2638 uint64_t *left) {
2639
3989bdc1 2640 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2ad591a3 2641 _cleanup_strv_free_ char **search_path = NULL;
3989bdc1 2642 _cleanup_(erase_and_freep) char *data = NULL;
2ad591a3
LP
2643 _cleanup_free_ char *bindname = NULL;
2644 const char *source = NULL;
3989bdc1 2645 bool missing_ok = true;
2ad591a3 2646 size_t size, add, maxsz;
3989bdc1
AB
2647 int r;
2648
10b44e1d
LP
2649 assert(context);
2650 assert(params);
2651 assert(id);
2652 assert(path);
2653 assert(unit);
2654 assert(write_dfd >= 0);
2655 assert(left);
2656
2ad591a3
LP
2657 if (read_dfd >= 0) {
2658 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2659 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2660 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2661 * open it. */
2662
2663 if (!filename_is_valid(path)) /* safety check */
2664 return -EINVAL;
2665
2666 missing_ok = true;
10b44e1d 2667 source = path;
2ad591a3
LP
2668
2669 } else if (path_is_absolute(path)) {
2670 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2671 * sockets */
2672
2673 if (!path_is_valid(path)) /* safety check */
2674 return -EINVAL;
2675
3989bdc1
AB
2676 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2677
2678 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2679 * via the source socket address in case we read off an AF_UNIX socket. */
10b44e1d 2680 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3989bdc1
AB
2681 return -ENOMEM;
2682
2683 missing_ok = false;
2ad591a3 2684 source = path;
3989bdc1 2685
2ad591a3
LP
2686 } else if (credential_name_valid(path)) {
2687 /* If this is a relative path, take it as credential name relative to the credentials
2688 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2689 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2690
2691 search_path = credential_search_path(params, encrypted);
2692 if (!search_path)
3989bdc1
AB
2693 return -ENOMEM;
2694
2ad591a3 2695 missing_ok = true;
3989bdc1
AB
2696 } else
2697 source = NULL;
2698
2ad591a3
LP
2699 if (encrypted)
2700 flags |= READ_FULL_FILE_UNBASE64;
2701
2702 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2703
2704 if (search_path) {
2705 STRV_FOREACH(d, search_path) {
2706 _cleanup_free_ char *j = NULL;
2707
2708 j = path_join(*d, path);
2709 if (!j)
2710 return -ENOMEM;
2711
2712 r = read_full_file_full(
2713 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2714 UINT64_MAX,
2715 maxsz,
2716 flags,
2717 NULL,
2718 &data, &size);
2719 if (r != -ENOENT)
2720 break;
2721 }
2722 } else if (source)
3989bdc1
AB
2723 r = read_full_file_full(
2724 read_dfd, source,
2725 UINT64_MAX,
2ad591a3
LP
2726 maxsz,
2727 flags,
3989bdc1
AB
2728 bindname,
2729 &data, &size);
2730 else
2731 r = -ENOENT;
2732
10b44e1d 2733 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3989bdc1
AB
2734 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2735 * will get clear errors if we don't pass such a missing credential on as they
2736 * themselves will get ENOENT when trying to read them, which should not be much
2737 * worse than when we handle the error here and make it fatal.
2738 *
2739 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2740 * we are fine, too. */
10b44e1d 2741 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3989bdc1
AB
2742 return 0;
2743 }
2744 if (r < 0)
10b44e1d 2745 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3989bdc1 2746
10b44e1d 2747 if (encrypted) {
3989bdc1
AB
2748 _cleanup_free_ void *plaintext = NULL;
2749 size_t plaintext_size = 0;
2750
10b44e1d 2751 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
3989bdc1
AB
2752 if (r < 0)
2753 return r;
2754
2755 free_and_replace(data, plaintext);
2756 size = plaintext_size;
2757 }
2758
10b44e1d 2759 add = strlen(id) + size;
3989bdc1
AB
2760 if (add > *left)
2761 return -E2BIG;
2762
10b44e1d 2763 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
3989bdc1 2764 if (r < 0)
94602bff 2765 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
3989bdc1
AB
2766
2767 *left -= add;
2768 return 0;
2769}
2770
/* Arguments passed via the userdata pointer to load_cred_recurse_dir_cb(), which forwards them to
 * load_credential() for every matching directory entry. 'dfd' is the credential directory that is written
 * to (passed on as write_dfd), 'left' points to the remaining size budget shared by all credentials. */
2771struct load_cred_args {
3989bdc1
AB
2772 const ExecContext *context;
2773 const ExecParameters *params;
461345a1 2774 bool encrypted;
3989bdc1
AB
2775 const char *unit;
2776 int dfd;
2777 uid_t uid;
2778 bool ownership_ok;
2779 uint64_t *left;
2780};
2781
2782static int load_cred_recurse_dir_cb(
2783 RecurseDirEvent event,
2784 const char *path,
2785 int dir_fd,
2786 int inode_fd,
2787 const struct dirent *de,
2788 const struct statx *sx,
2789 void *userdata) {
2790
6394e5cd 2791 struct load_cred_args *args = ASSERT_PTR(userdata);
11348386 2792 _cleanup_free_ char *sub_id = NULL;
3989bdc1
AB
2793 int r;
2794
2795 if (event != RECURSE_DIR_ENTRY)
2796 return RECURSE_DIR_CONTINUE;
2797
2798 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2799 return RECURSE_DIR_CONTINUE;
2800
11348386 2801 sub_id = strreplace(path, "/", "_");
3989bdc1
AB
2802 if (!sub_id)
2803 return -ENOMEM;
2804
2805 if (!credential_name_valid(sub_id))
1451435c 2806 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3989bdc1 2807
5bec447a 2808 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3989bdc1
AB
2809 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2810 return RECURSE_DIR_CONTINUE;
2811 }
5bec447a
LP
2812 if (errno != ENOENT)
2813 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3989bdc1 2814
10b44e1d
LP
2815 r = load_credential(
2816 args->context,
2817 args->params,
2818 sub_id,
2819 de->d_name,
461345a1 2820 args->encrypted,
10b44e1d
LP
2821 args->unit,
2822 dir_fd,
2823 args->dfd,
2824 args->uid,
2825 args->ownership_ok,
2826 args->left);
3989bdc1
AB
2827 if (r < 0)
2828 return r;
2829
2830 return RECURSE_DIR_CONTINUE;
2831}
2832
bb0c0d6f
LP
2833static int acquire_credentials(
2834 const ExecContext *context,
2835 const ExecParameters *params,
d3dcf4e3 2836 const char *unit,
bb0c0d6f
LP
2837 const char *p,
2838 uid_t uid,
2839 bool ownership_ok) {
2840
43144be4 2841 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2842 _cleanup_close_ int dfd = -1;
43144be4 2843 ExecLoadCredential *lc;
bb0c0d6f 2844 ExecSetCredential *sc;
bb0c0d6f
LP
2845 int r;
2846
2847 assert(context);
2848 assert(p);
2849
2850 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2851 if (dfd < 0)
2852 return -errno;
2853
43144be4
LP
2854 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2855 HASHMAP_FOREACH(lc, context->load_credentials) {
3989bdc1 2856 _cleanup_close_ int sub_fd = -1;
d3dcf4e3 2857
f344f7fd
LP
2858 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2859 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2860 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2861 * propagate a credential passed to us from further up. */
43144be4 2862
f344f7fd
LP
2863 if (path_is_absolute(lc->path)) {
2864 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
1d68a2e1
LP
2865 if (sub_fd < 0 && !IN_SET(errno,
2866 ENOTDIR, /* Not a directory */
2867 ENOENT)) /* Doesn't exist? */
2868 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
f344f7fd 2869 }
43144be4 2870
61c5a49e 2871 if (sub_fd < 0)
f344f7fd 2872 /* Regular file (incl. a credential passed in from higher up) */
10b44e1d
LP
2873 r = load_credential(
2874 context,
2875 params,
2876 lc->id,
2877 lc->path,
2878 lc->encrypted,
2879 unit,
2880 -1,
2881 dfd,
2882 uid,
2883 ownership_ok,
2884 &left);
61c5a49e 2885 else
10b44e1d 2886 /* Directory */
3989bdc1
AB
2887 r = recurse_dir(
2888 sub_fd,
11348386 2889 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3989bdc1
AB
2890 /* statx_mask= */ 0,
2891 /* n_depth_max= */ UINT_MAX,
9883cbb2 2892 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3989bdc1
AB
2893 load_cred_recurse_dir_cb,
2894 &(struct load_cred_args) {
3989bdc1
AB
2895 .context = context,
2896 .params = params,
461345a1 2897 .encrypted = lc->encrypted,
3989bdc1
AB
2898 .unit = unit,
2899 .dfd = dfd,
2900 .uid = uid,
2901 .ownership_ok = ownership_ok,
2902 .left = &left,
2903 });
61c5a49e
LP
2904 if (r < 0)
2905 return r;
bb0c0d6f
LP
2906 }
2907
9e6e9d61
LP
2908 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2909 * them, so that they can act as a "default" if the same credential is specified multiple times. */
43144be4
LP
2910 HASHMAP_FOREACH(sc, context->set_credentials) {
2911 _cleanup_(erase_and_freep) void *plaintext = NULL;
2912 const char *data;
2913 size_t size, add;
2914
9e6e9d61
LP
2915 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2916 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2917 * slow and involved, hence it's nice to be able to skip that if the credential already
2918 * exists anyway. */
43144be4
LP
2919 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2920 continue;
2921 if (errno != ENOENT)
2922 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2923
2924 if (sc->encrypted) {
2925 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2926 if (r < 0)
2927 return r;
2928
2929 data = plaintext;
2930 } else {
2931 data = sc->data;
2932 size = sc->size;
2933 }
2934
2935 add = strlen(sc->id) + size;
2936 if (add > left)
2937 return -E2BIG;
2938
2939 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2940 if (r < 0)
2941 return r;
2942
43144be4
LP
2943 left -= add;
2944 }
2945
bb0c0d6f
LP
2946 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2947 return -errno;
2948
2949 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2950 * accessible */
2951
2952 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2953 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2954 if (r < 0) {
2955 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2956 return r;
2957
2958 if (!ownership_ok)
2959 return r;
2960
f5fbe71d 2961 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2962 return -errno;
2963 }
2964 }
2965
2966 return 0;
2967}
2968
/* Populates the credential store for a unit in 'workspace' and then makes it available at 'final'.
 *
 * The function first checks whether 'workspace' (only if reuse_workspace is set) and 'final' already are
 * mount points. If nothing usable is mounted on the workspace yet it tries, in this order, to mount a
 * "ramfs", a "tmpfs", or a bind mount of 'final' onto it. Only for the last attempt privilege errors are
 * tolerated, and only if must_mount is false; in that case the credentials are written directly into
 * 'final'.
 *
 * After acquire_credentials() has filled the store, a mounted workspace is remounted read-only and either
 * detached (if 'final' already was a mount) or moved onto 'final'. In the plain directory case the parent
 * directory of 'final' is chmod()ed to 0755 instead.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
2969static int setup_credentials_internal(
2970 const ExecContext *context,
2971 const ExecParameters *params,
d3dcf4e3 2972 const char *unit,
bb0c0d6f
LP
2973 const char *final, /* This is where the credential store shall eventually end up at */
2974 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2975 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2976 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2977 uid_t uid) {
2978
2979 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2980 * if we mounted something; false if we definitely can't mount anything */
2981 bool final_mounted;
2982 const char *where;
2983
2984 assert(context);
2985 assert(final);
2986 assert(workspace);
2987
2988 if (reuse_workspace) {
2989 r = path_is_mount_point(workspace, NULL, 0);
2990 if (r < 0)
2991 return r;
2992 if (r > 0)
2993 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2994 else
2995 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2996 } else
2997 workspace_mounted = -1; /* ditto */
2998
2999 r = path_is_mount_point(final, NULL, 0);
3000 if (r < 0)
3001 return r;
3002 if (r > 0) {
3003 /* If the final place already has something mounted, we use that. If the workspace also has
3004 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3005 * different). */
3006 final_mounted = true;
3007
3008 if (workspace_mounted < 0) {
3009 /* If the final place is mounted, but the workspace isn't, then let's bind mount
3010 * the final version to the workspace, and make it writable, so that we can make
3011 * changes */
3012
21935150
LP
3013 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3014 if (r < 0)
3015 return r;
bb0c0d6f 3016
21935150
LP
3017 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3018 if (r < 0)
3019 return r;
bb0c0d6f
LP
3020
3021 workspace_mounted = true;
3022 }
3023 } else
3024 final_mounted = false;
3025
3026 if (workspace_mounted < 0) {
3027 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3028 for (int try = 0;; try++) {
3029
3030 if (try == 0) {
3031 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
3032 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3033 if (r >= 0) {
bb0c0d6f
LP
3034 workspace_mounted = true;
3035 break;
3036 }
3037
3038 } else if (try == 1) {
3039 _cleanup_free_ char *opts = NULL;
3040
43144be4 3041 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
3042 return -ENOMEM;
3043
3044 /* Fall back to "tmpfs" otherwise */
21935150
LP
3045 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3046 if (r >= 0) {
bb0c0d6f
LP
3047 workspace_mounted = true;
3048 break;
3049 }
3050
3051 } else {
3052 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
3053 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3054 if (r < 0) {
3055 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3056 return r;
bb0c0d6f
LP
3057
3058 if (must_mount) /* If it's not OK to use the plain directory
3059 * fallback, propagate all errors too */
21935150 3060 return r;
bb0c0d6f
LP
3061
3062 /* If we lack privileges to bind mount stuff, then let's gracefully
3063 * proceed for compat with container envs, and just use the final dir
3064 * as is. */
3065
3066 workspace_mounted = false;
3067 break;
3068 }
3069
3070 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
3071 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3072 if (r < 0)
3073 return r;
bb0c0d6f
LP
3074
3075 workspace_mounted = true;
3076 break;
3077 }
3078 }
3079 }
3080
3081 assert(!must_mount || workspace_mounted > 0);
3082 where = workspace_mounted ? workspace : final;
3083
/* Best effort: the result of the relabelling is deliberately ignored */
03bc11d1 3084 (void) label_fix_full(AT_FDCWD, where, final, 0);
e3a0a862 3085
d3dcf4e3 3086 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
3087 if (r < 0)
3088 return r;
3089
3090 if (workspace_mounted) {
3091 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
3092 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3093 if (r < 0)
3094 return r;
bb0c0d6f
LP
3095
3096 /* And mount it to the final place, read-only */
21935150
LP
3097 if (final_mounted)
3098 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3099 else
3100 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3101 if (r < 0)
3102 return r;
bb0c0d6f
LP
3103 } else {
3104 _cleanup_free_ char *parent = NULL;
3105
3106 /* If we do not have our own mount but used the plain directory fallback, then we need to
3107 * open access to the top-level credential directory and the per-service directory now */
3108
3109 parent = dirname_malloc(final);
3110 if (!parent)
3111 return -ENOMEM;
3112 if (chmod(parent, 0755) < 0)
3113 return -errno;
3114 }
3115
3116 return 0;
3117}
3118
/* Sets up the credential directory of a unit below <runtime prefix>/credentials/<unit>.
 *
 * Returns 0 right away if the context has no credentials, and -EINVAL if no runtime directory prefix is
 * set. Otherwise the top-level directory (mode 0755) and the per-unit directory (mode 0700) are created,
 * already existing directories being fine.
 *
 * The actual work is done by setup_credentials_internal(), preferably in a forked off child that runs in
 * a new mount namespace (FORK_NEW_MOUNTNS) and uses /dev/shm as workspace. If forking fails with a "not
 * supported" or privilege error, the work is done in this process instead, with a workspace below
 * <runtime prefix>/systemd/temporary-credentials/<unit>.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
3119static int setup_credentials(
3120 const ExecContext *context,
3121 const ExecParameters *params,
3122 const char *unit,
3123 uid_t uid) {
3124
3125 _cleanup_free_ char *p = NULL, *q = NULL;
bb0c0d6f
LP
3126 int r;
3127
3128 assert(context);
3129 assert(params);
3130
3131 if (!exec_context_has_credentials(context))
3132 return 0;
3133
3134 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3135 return -EINVAL;
3136
3137 /* This is where we'll place stuff when we are done; this main credentials directory is world-readable,
3138 * and the subdir we mount over with a read-only file system readable by the service's user */
3139 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3140 if (!q)
3141 return -ENOMEM;
3142
3143 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3144 if (r < 0 && r != -EEXIST)
3145 return r;
3146
3147 p = path_join(q, unit);
3148 if (!p)
3149 return -ENOMEM;
3150
3151 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3152 if (r < 0 && r != -EEXIST)
3153 return r;
3154
3155 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3156 if (r < 0) {
3157 _cleanup_free_ char *t = NULL, *u = NULL;
3158
3159 /* If this is not a privilege or support issue then propagate the error */
3160 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3161 return r;
3162
3163 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3164 * it into place, so that users can't access half-initialized credential stores. */
3165 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3166 if (!t)
3167 return -ENOMEM;
3168
3169 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3170 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3171 * after it is fully set up */
3172 u = path_join(t, unit);
3173 if (!u)
3174 return -ENOMEM;
3175
3176 FOREACH_STRING(i, t, u) {
3177 r = mkdir_label(i, 0700);
3178 if (r < 0 && r != -EEXIST)
3179 return r;
3180 }
3181
3182 r = setup_credentials_internal(
3183 context,
3184 params,
d3dcf4e3 3185 unit,
bb0c0d6f
LP
3186 p, /* final mount point */
3187 u, /* temporary workspace to overmount */
3188 true, /* reuse the workspace if it is already a mount */
3189 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3190 uid);
3191
3192 (void) rmdir(u); /* remove the workspace again if we can. */
3193
3194 if (r < 0)
3195 return r;
3196
3197 } else if (r == 0) {
3198
3199 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3200 * we can use the same directory for all cases, after turning off propagation. Question
3201 * though is: where do we turn off propagation exactly, and where do we place the workspace
3202 * directory? We need some place that is guaranteed to be a mount point in the host, and
3203 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3204 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3205 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3206 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3207 * we do here instead is use /dev/ and /dev/shm/ for our purposes. We know for sure that
3208 * /dev/ is a mount point and we know for sure that /dev/shm/ exists. Hence we can turn off
3209 * propagation on the former, and then overmount the latter.
3210 *
3211 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3212 * for this purpose, but there are few other candidates that work equally well for us, and
3213 * given that we do this in a privately namespaced short-lived single-threaded process
7802194a 3214 * that no one else sees this should be OK to do. */
bb0c0d6f 3215
21935150
LP
3216 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3217 if (r < 0)
bb0c0d6f
LP
3218 goto child_fail;
3219
3220 r = setup_credentials_internal(
3221 context,
3222 params,
d3dcf4e3 3223 unit,
bb0c0d6f
LP
3224 p, /* final mount point */
3225 "/dev/shm", /* temporary workspace to overmount */
3226 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3227 true, /* insist that something is mounted, do not allow fallback to plain directory */
3228 uid);
3229 if (r < 0)
3230 goto child_fail;
3231
/* The child reports its result only via its exit status */
3232 _exit(EXIT_SUCCESS);
3233
3234 child_fail:
3235 _exit(EXIT_FAILURE);
3236 }
3237
/* Reached after the in-process fallback succeeded, or in the parent when safe_fork() returned > 0 */
3238 return 0;
3239}
3240
92b423b9 3241#if ENABLE_SMACK
/* Applies a SMACK label via mac_smack_apply_pid() with PID 0 (presumably meaning the calling process).
 *
 * If context->smack_process_label is set, that label is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC label is read from
 * executable_fd and applied, with SMACK_DEFAULT_PROCESS_LABEL as fallback if no label was read.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
cefc33ae
LP
3242static int setup_smack(
3243 const ExecContext *context,
b83d5050 3244 int executable_fd) {
cefc33ae
LP
3245 int r;
3246
3247 assert(context);
b83d5050 3248 assert(executable_fd >= 0);
cefc33ae 3249
cefc33ae
LP
3250 if (context->smack_process_label) {
3251 r = mac_smack_apply_pid(0, context->smack_process_label);
3252 if (r < 0)
3253 return r;
3254 }
3255#ifdef SMACK_DEFAULT_PROCESS_LABEL
3256 else {
3257 _cleanup_free_ char *exec_label = NULL;
3258
b83d5050 3259 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
/* -ENODATA and -EOPNOTSUPP are tolerated here; exec_label then stays NULL and the default is used below */
4c701096 3260 if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
cefc33ae
LP
3261 return r;
3262
3263 r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
3264 if (r < 0)
3265 return r;
3266 }
cefc33ae
LP
3267#endif
3268
3269 return 0;
3270}
92b423b9 3271#endif
cefc33ae 3272
6c47cd7d
LP
/* Compiles the list of bind mounts to establish for a unit.
 *
 * The resulting array consists of copies of the entries in context->bind_mounts, followed by one entry
 * per configured directory of each exec directory type for which params->prefix[] is set. For private
 * exec directories without a root file system configured, the "private" subdirectory of the prefix is
 * additionally added to the list returned in ret_empty_directories.
 *
 * On success the newly allocated array is returned in ret_bind_mounts, its size in ret_n_bind_mounts,
 * and the number of entries is also the return value. If there is nothing to mount all three output
 * parameters are zeroed and 0 is returned. On failure a negative errno-style error is returned and the
 * entries allocated so far are released again. */
3273static int compile_bind_mounts(
3274 const ExecContext *context,
3275 const ExecParameters *params,
3276 BindMount **ret_bind_mounts,
da6053d0 3277 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3278 char ***ret_empty_directories) {
3279
3280 _cleanup_strv_free_ char **empty_directories = NULL;
3281 BindMount *bind_mounts;
5b10116e 3282 size_t n, h = 0;
6c47cd7d
LP
3283 int r;
3284
3285 assert(context);
3286 assert(params);
3287 assert(ret_bind_mounts);
3288 assert(ret_n_bind_mounts);
3289 assert(ret_empty_directories);
3290
/* First pass: count how many entries we are going to need, so that the array can be allocated in one go */
3291 n = context->n_bind_mounts;
5b10116e 3292 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3293 if (!params->prefix[t])
3294 continue;
3295
211a3d87 3296 n += context->directories[t].n_items;
6c47cd7d
LP
3297 }
3298
3299 if (n <= 0) {
3300 *ret_bind_mounts = NULL;
3301 *ret_n_bind_mounts = 0;
3302 *ret_empty_directories = NULL;
3303 return 0;
3304 }
3305
3306 bind_mounts = new(BindMount, n);
3307 if (!bind_mounts)
3308 return -ENOMEM;
3309
/* Second pass, part one: duplicate the explicitly configured bind mounts. 'h' counts the entries
 * initialized so far, which is what the error path below frees. */
5b10116e 3310 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3311 BindMount *item = context->bind_mounts + i;
3312 char *s, *d;
3313
3314 s = strdup(item->source);
3315 if (!s) {
3316 r = -ENOMEM;
3317 goto finish;
3318 }
3319
3320 d = strdup(item->destination);
3321 if (!d) {
3322 free(s);
3323 r = -ENOMEM;
3324 goto finish;
3325 }
3326
3327 bind_mounts[h++] = (BindMount) {
3328 .source = s,
3329 .destination = d,
3330 .read_only = item->read_only,
3331 .recursive = item->recursive,
3332 .ignore_enoent = item->ignore_enoent,
3333 };
3334 }
3335
/* Second pass, part two: add one bind mount per configured exec directory */
5b10116e 3336 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3337 if (!params->prefix[t])
3338 continue;
3339
211a3d87 3340 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3341 continue;
3342
494d0247 3343 if (exec_directory_is_private(context, t) &&
74e12520 3344 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3345 char *private_root;
3346
3347 /* So this is for a dynamic user, and we need to make sure the process can access its own
3348 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3349 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3350
657ee2d8 3351 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3352 if (!private_root) {
3353 r = -ENOMEM;
3354 goto finish;
3355 }
3356
3357 r = strv_consume(&empty_directories, private_root);
a635a7ae 3358 if (r < 0)
6c47cd7d 3359 goto finish;
6c47cd7d
LP
3360 }
3361
211a3d87 3362 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3363 char *s, *d;
3364
494d0247 3365 if (exec_directory_is_private(context, t))
211a3d87 3366 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3367 else
211a3d87 3368 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3369 if (!s) {
3370 r = -ENOMEM;
3371 goto finish;
3372 }
3373
494d0247 3374 if (exec_directory_is_private(context, t) &&
74e12520 3375 exec_context_with_rootfs(context))
5609f688
YW
3376 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3377 * directory is not created on the root directory. So, let's bind-mount the directory
3378 * on the 'non-private' place. */
211a3d87 3379 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3380 else
3381 d = strdup(s);
6c47cd7d
LP
3382 if (!d) {
3383 free(s);
3384 r = -ENOMEM;
3385 goto finish;
3386 }
3387
3388 bind_mounts[h++] = (BindMount) {
3389 .source = s,
3390 .destination = d,
3391 .read_only = false,
9ce4e4b0 3392 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3393 .recursive = true,
3394 .ignore_enoent = false,
3395 };
3396 }
3397 }
3398
/* Both passes must agree on the number of entries */
3399 assert(h == n);
3400
3401 *ret_bind_mounts = bind_mounts;
3402 *ret_n_bind_mounts = n;
ae2a15bc 3403 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3404
3405 return (int) n;
3406
3407finish:
3408 bind_mount_free_many(bind_mounts, h);
3409 return r;
3410}
3411
df61e79a
LB
3412/* ret_symlinks will contain a list of pairs src:dest that describes
3413 * the symlinks to create later on. For example, the symlinks needed
3414 * to safely give private directories to DynamicUser=1 users.
 *
 * Two kinds of pairs are generated for each configured exec directory: one per entry in the directory's
 * own 'symlinks' list, and, for private exec directories when no root file system is configured, one
 * that pairs the path below "private" with the regular path. All paths are prefixed with
 * params->prefix[] of the directory type.
 *
 * Returns 0 on success, a negative errno-style error otherwise. */
3415static int compile_symlinks(
3416 const ExecContext *context,
3417 const ExecParameters *params,
3418 char ***ret_symlinks) {
3419
3420 _cleanup_strv_free_ char **symlinks = NULL;
3421 int r;
3422
3423 assert(context);
3424 assert(params);
3425 assert(ret_symlinks);
3426
3427 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3428 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3429 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 3430
/* First, the symlinks explicitly configured for this directory */
211a3d87
LB
3431 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3432 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3433
211a3d87
LB
3434 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3435 dst_abs = path_join(params->prefix[dt], *symlink);
3436 if (!src_abs || !dst_abs)
3437 return -ENOMEM;
df61e79a 3438
211a3d87
LB
3439 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3440 if (r < 0)
3441 return r;
3442 }
3443
/* Second, the private -> regular path pair, which is only needed for private directories
 * when no root file system is configured */
3fa80e5e 3444 if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context))
211a3d87
LB
3445 continue;
3446
3447 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3448 if (!private_path)
3449 return -ENOMEM;
3450
211a3d87 3451 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3452 if (!path)
3453 return -ENOMEM;
3454
3455 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3456 if (r < 0)
3457 return r;
3458 }
3459 }
3460
3461 *ret_symlinks = TAKE_PTR(symlinks);
3462
3463 return 0;
3464}
3465
4e677599
LP
/* Returns true if the configuration requires file system namespacing to work, i.e. if any of the
 * following is set: temporary file systems, a root directory or root image, mount images, a dynamic
 * user, extension images or directories, a bind mount whose source and destination differ, or a log
 * namespace. Returns false otherwise. */
3466static bool insist_on_sandboxing(
3467 const ExecContext *context,
3468 const char *root_dir,
3469 const char *root_image,
3470 const BindMount *bind_mounts,
3471 size_t n_bind_mounts) {
3472
4e677599
LP
3473 assert(context);
3474 assert(n_bind_mounts == 0 || bind_mounts);
3475
3476 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3477 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3478 * rearrange stuff in a way we cannot ignore gracefully. */
3479
3480 if (context->n_temporary_filesystems > 0)
3481 return true;
3482
3483 if (root_dir || root_image)
3484 return true;
3485
b3d13314
LB
3486 if (context->n_mount_images > 0)
3487 return true;
3488
4e677599
LP
3489 if (context->dynamic_user)
3490 return true;
3491
4355c04f
LB
3492 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3493 return true;
3494
4e677599
LP
3495 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3496 * essential. */
5b10116e 3497 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3498 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3499 return true;
3500
91dd5f7c
LP
3501 if (context->log_namespace)
3502 return true;
3503
4e677599
LP
3504 return false;
3505}
3506
/* Collects all mount namespace related settings of a unit and passes them to setup_namespace().
 *
 * This compiles the bind mounts and symlinks, fills in a NamespaceInfo structure (fully only if
 * sandboxing applies to this command), determines the private /tmp and /var/tmp directories, the
 * credentials path and the propagate/incoming/extension directories, and then invokes
 * setup_namespace().
 *
 * If setup_namespace() fails with -ENOANO the error is ignored and 0 is returned, unless
 * insist_on_sandboxing() says the configuration cannot be gracefully ignored, in which case
 * -EOPNOTSUPP is returned. Otherwise the result of setup_namespace() is returned as is. error_path is
 * passed through to setup_namespace(). */
6818c54c 3507static int apply_mount_namespace(
34cf6c43 3508 const Unit *u,
9f71ba8d 3509 ExecCommandFlags command_flags,
6818c54c
LP
3510 const ExecContext *context,
3511 const ExecParameters *params,
7cc5ef5f
ZJS
3512 const ExecRuntime *runtime,
3513 char **error_path) {
6818c54c 3514
df61e79a 3515 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3516 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3517 const char *root_dir = NULL, *root_image = NULL;
24759d8f
LB
3518 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3519 *extension_dir = NULL;
228af36f 3520 NamespaceInfo ns_info;
165a31c0 3521 bool needs_sandboxing;
6c47cd7d 3522 BindMount *bind_mounts = NULL;
da6053d0 3523 size_t n_bind_mounts = 0;
6818c54c 3524 int r;
93c6bb51 3525
2b3c1b9e
DH
3526 assert(context);
3527
/* A root image takes precedence over a root directory; neither is used unless EXEC_APPLY_CHROOT is set */
915e6d16
LP
3528 if (params->flags & EXEC_APPLY_CHROOT) {
3529 root_image = context->root_image;
3530
3531 if (!root_image)
3532 root_dir = context->root_directory;
3533 }
93c6bb51 3534
6c47cd7d
LP
3535 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3536 if (r < 0)
3537 return r;
3538
211a3d87 3539 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3540 r = compile_symlinks(context, params, &symlinks);
3541 if (r < 0)
41abd7f6 3542 goto finalize;
df61e79a 3543
9f71ba8d 3544 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3545 if (needs_sandboxing) {
3546 /* The runtime struct only contains the parent of the private /tmp,
3547 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3548 * that is sticky, and that's the one we want to use here.
3549 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3550
3551 if (context->private_tmp && runtime) {
56a13a49
ZJS
3552 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3553 tmp_dir = runtime->tmp_dir;
3554 else if (runtime->tmp_dir)
3555 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3556
3557 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3558 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3559 else if (runtime->var_tmp_dir)
56a13a49 3560 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3561 }
3562
b5a33299
YW
3563 ns_info = (NamespaceInfo) {
3564 .ignore_protect_paths = false,
3565 .private_dev = context->private_devices,
3566 .protect_control_groups = context->protect_control_groups,
3567 .protect_kernel_tunables = context->protect_kernel_tunables,
3568 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3569 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3570 .protect_hostname = context->protect_hostname,
5e98086d 3571 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3572 .private_mounts = context->private_mounts,
52b3d652
LP
3573 .protect_home = context->protect_home,
3574 .protect_system = context->protect_system,
4e399953
LP
3575 .protect_proc = context->protect_proc,
3576 .proc_subset = context->proc_subset,
80271a44 3577 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3578 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3579 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3580 };
ecf63c91 3581 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3582 /*
3583 * If DynamicUser=no and RootDirectory= is set then let's pass a relaxed
3584 * sandbox info, otherwise enforce it, don't ignore protected paths and
3585 * fail if we are unable to apply the sandbox inside the mount namespace.
3586 */
3587 ns_info = (NamespaceInfo) {
3588 .ignore_protect_paths = true,
3589 };
3590 else
3591 ns_info = (NamespaceInfo) {};
b5a33299 3592
37ed15d7
FB
3593 if (context->mount_flags == MS_SHARED)
3594 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3595
a631cbfa
LP
3596 if (exec_context_has_credentials(context) &&
3597 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3598 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3599 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3600 if (!creds_path) {
3601 r = -ENOMEM;
3602 goto finalize;
3603 }
bbb4e7f3
LP
3604 }
3605
/* The propagate and incoming directories are only set for the system manager; the extension directory
 * is located below /run/user/<euid>/ otherwise */
5e8deb94
LB
3606 if (MANAGER_IS_SYSTEM(u->manager)) {
3607 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3608 if (!propagate_dir) {
3609 r = -ENOMEM;
3610 goto finalize;
3611 }
3612
5e8deb94 3613 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3614 if (!incoming_dir) {
3615 r = -ENOMEM;
3616 goto finalize;
3617 }
24759d8f
LB
3618
3619 extension_dir = strdup("/run/systemd/unit-extensions");
3620 if (!extension_dir) {
3621 r = -ENOMEM;
3622 goto finalize;
3623 }
3624 } else
3625 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3626 r = -ENOMEM;
3627 goto finalize;
3628 }
5e8deb94 3629
18d73705 3630 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3631 &ns_info, context->read_write_paths,
165a31c0
LP
3632 needs_sandboxing ? context->read_only_paths : NULL,
3633 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3634 needs_sandboxing ? context->exec_paths : NULL,
3635 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3636 empty_directories,
df61e79a 3637 symlinks,
6c47cd7d
LP
3638 bind_mounts,
3639 n_bind_mounts,
2abd4e38
YW
3640 context->temporary_filesystems,
3641 context->n_temporary_filesystems,
b3d13314
LB
3642 context->mount_images,
3643 context->n_mount_images,
56a13a49
ZJS
3644 tmp_dir,
3645 var_tmp_dir,
bbb4e7f3 3646 creds_path,
91dd5f7c 3647 context->log_namespace,
915e6d16 3648 context->mount_flags,
d4d55b0d
LB
3649 context->root_hash, context->root_hash_size, context->root_hash_path,
3650 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3651 context->root_verity,
93f59701
LB
3652 context->extension_images,
3653 context->n_extension_images,
a07b9926 3654 context->extension_directories,
5e8deb94
LB
3655 propagate_dir,
3656 incoming_dir,
24759d8f 3657 extension_dir,
3bdc25a4 3658 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3659 error_path);
93c6bb51 3660
1beab8b0 3661 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3662 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3663 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3664 * completely different execution environment. */
aca835ed 3665 if (r == -ENOANO) {
4e677599
LP
3666 if (insist_on_sandboxing(
3667 context,
3668 root_dir, root_image,
3669 bind_mounts,
3670 n_bind_mounts)) {
3671 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3672 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3673 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3674
3675 r = -EOPNOTSUPP;
3676 } else {
aca835ed 3677 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3678 r = 0;
aca835ed 3679 }
93c6bb51
DH
3680 }
3681
8062e643 3682finalize:
4e677599 3683 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3684 return r;
3685}
3686
915e6d16
LP
/* Changes the current working directory as configured in the context.
 *
 * If context->working_directory_home is set, 'home' is used, and -ENXIO is returned if it is NULL.
 * Otherwise context->working_directory is used, with an empty value mapped by empty_to_root(). Unless
 * EXEC_APPLY_CHROOT is set in params->flags, the path is prefixed with context->root_directory.
 *
 * A failing chdir() is ignored if context->working_directory_missing_ok is set. On failure *exit_status
 * is set to EXIT_CHDIR and a negative errno-style error is returned; on success 0 is returned and
 * *exit_status is left untouched. */
3687static int apply_working_directory(
3688 const ExecContext *context,
3689 const ExecParameters *params,
3690 const char *home,
376fecf6 3691 int *exit_status) {
915e6d16 3692
6732edab 3693 const char *d, *wd;
2b3c1b9e
DH
3694
3695 assert(context);
376fecf6 3696 assert(exit_status);
2b3c1b9e 3697
6732edab
LP
3698 if (context->working_directory_home) {
3699
376fecf6
LP
3700 if (!home) {
3701 *exit_status = EXIT_CHDIR;
6732edab 3702 return -ENXIO;
376fecf6 3703 }
6732edab 3704
2b3c1b9e 3705 wd = home;
6732edab 3706
14eb3285
LP
3707 } else
3708 wd = empty_to_root(context->working_directory);
e7f1e7c6 3709
/* With EXEC_APPLY_CHROOT the path is used as is, otherwise the root directory is prepended explicitly */
fa97f630 3710 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3711 d = wd;
fa97f630 3712 else
3b0e5bb5 3713 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3714
376fecf6
LP
3715 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3716 *exit_status = EXIT_CHDIR;
2b3c1b9e 3717 return -errno;
376fecf6 3718 }
e7f1e7c6
DH
3719
3720 return 0;
3721}
3722
fa97f630
JB
/* Calls chroot() on context->root_directory, but only if EXEC_APPLY_CHROOT is set in params->flags,
 * needs_mount_ns is false and a root directory is configured. In all other cases this is a NOP.
 *
 * Returns 0 on success or if nothing was done. If chroot() fails *exit_status is set to EXIT_CHROOT
 * and the negative errno is returned. */
3723static int apply_root_directory(
3724 const ExecContext *context,
3725 const ExecParameters *params,
3726 const bool needs_mount_ns,
3727 int *exit_status) {
3728
3729 assert(context);
3730 assert(exit_status);
3731
5b10116e 3732 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3733 if (!needs_mount_ns && context->root_directory)
3734 if (chroot(context->root_directory) < 0) {
3735 *exit_status = EXIT_CHROOT;
3736 return -errno;
3737 }
fa97f630
JB
3738
3739 return 0;
3740}
3741
/* Sets up a new session keyring for the service, unless context->keyring_mode is EXEC_KEYRING_INHERIT,
 * in which case nothing is done.
 *
 * The real GID and UID are temporarily switched to 'gid' and 'uid' (where valid and different from the
 * current ones) while the session keyring is joined and, for EXEC_KEYRING_SHARED, the user keyring is
 * linked into it. Afterwards the original IDs are restored and, if the unit has an invocation ID, it is
 * added to the keyring as a "user" key named "invocation_id".
 *
 * If joining the session keyring fails with ENOSYS, a privilege error or EDQUOT this is only logged at
 * debug level and 0 is returned. Returns 0 on success, a negative errno-style error otherwise. */
b1edf445 3742static int setup_keyring(
34cf6c43 3743 const Unit *u,
b1edf445
LP
3744 const ExecContext *context,
3745 const ExecParameters *p,
3746 uid_t uid, gid_t gid) {
3747
74dd6b51 3748 key_serial_t keyring;
e64c2d0b
DJL
3749 int r = 0;
3750 uid_t saved_uid;
3751 gid_t saved_gid;
74dd6b51
LP
3752
3753 assert(u);
b1edf445 3754 assert(context);
74dd6b51
LP
3755 assert(p);
3756
3757 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3758 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3759 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3760 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3761 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3762 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3763
b1edf445
LP
3764 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3765 return 0;
3766
e64c2d0b
DJL
3767 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3768 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3769 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3770 * & group is just as nasty as acquiring a reference to the user keyring. */
3771
3772 saved_uid = getuid();
3773 saved_gid = getgid();
3774
/* Nothing has been changed yet at this point, hence it is fine to return directly on failure */
3775 if (gid_is_valid(gid) && gid != saved_gid) {
3776 if (setregid(gid, -1) < 0)
3777 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3778 }
3779
3780 if (uid_is_valid(uid) && uid != saved_uid) {
3781 if (setreuid(uid, -1) < 0) {
3782 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3783 goto out;
3784 }
3785 }
3786
74dd6b51
LP
3787 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3788 if (keyring == -1) {
/* In the first three cases r remains 0, i.e. the failure is not fatal */
3789 if (errno == ENOSYS)
8002fb97 3790 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3791 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3792 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3793 else if (errno == EDQUOT)
8002fb97 3794 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3795 else
e64c2d0b 3796 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3797
e64c2d0b 3798 goto out;
74dd6b51
LP
3799 }
3800
e64c2d0b
DJL
3801 /* When requested link the user keyring into the session keyring. */
3802 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3803
3804 if (keyctl(KEYCTL_LINK,
3805 KEY_SPEC_USER_KEYRING,
3806 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3807 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3808 goto out;
3809 }
3810 }
3811
3812 /* Restore uid/gid back */
3813 if (uid_is_valid(uid) && uid != saved_uid) {
3814 if (setreuid(saved_uid, -1) < 0) {
3815 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3816 goto out;
3817 }
3818 }
3819
3820 if (gid_is_valid(gid) && gid != saved_gid) {
3821 if (setregid(saved_gid, -1) < 0)
3822 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3823 }
3824
3825 /* Populate the keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3826 if (!sd_id128_is_null(u->invocation_id)) {
3827 key_serial_t key;
3828
3829 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3830 if (key == -1)
8002fb97 3831 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3832 else {
3833 if (keyctl(KEYCTL_SETPERM, key,
3834 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3835 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3836 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3837 }
3838 }
3839
e64c2d0b 3840out:
37b22b3b 3841 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3842 /* no extra logging, as only the first already reported error matters */
3843 if (getuid() != saved_uid)
3844 (void) setreuid(saved_uid, -1);
b1edf445 3845
e64c2d0b
DJL
3846 if (getgid() != saved_gid)
3847 (void) setregid(saved_gid, -1);
b1edf445 3848
e64c2d0b 3849 return r;
74dd6b51
LP
3850}
3851
/* Appends both ends of a socket pair to array[], advancing the fill counter *n for each end that is
 * actually open (i.e. non-negative). The caller must ensure array[] has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue; /* this end is not open, nothing to record */

                array[*n] = pair[i];
                (*n)++;
        }
}
3862
a34ceba6
LP
3863static int close_remaining_fds(
3864 const ExecParameters *params,
34cf6c43
YW
3865 const ExecRuntime *runtime,
3866 const DynamicCreds *dcreds,
00d9ef85 3867 int user_lookup_fd,
a34ceba6 3868 int socket_fd,
5b8d1f6b 3869 const int *fds, size_t n_fds) {
a34ceba6 3870
da6053d0 3871 size_t n_dont_close = 0;
00d9ef85 3872 int dont_close[n_fds + 12];
a34ceba6
LP
3873
3874 assert(params);
3875
3876 if (params->stdin_fd >= 0)
3877 dont_close[n_dont_close++] = params->stdin_fd;
3878 if (params->stdout_fd >= 0)
3879 dont_close[n_dont_close++] = params->stdout_fd;
3880 if (params->stderr_fd >= 0)
3881 dont_close[n_dont_close++] = params->stderr_fd;
3882
3883 if (socket_fd >= 0)
3884 dont_close[n_dont_close++] = socket_fd;
3885 if (n_fds > 0) {
3886 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3887 n_dont_close += n_fds;
3888 }
3889
a70581ff 3890 if (runtime) {
29206d46 3891 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3892 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3893 }
29206d46
LP
3894
3895 if (dcreds) {
3896 if (dcreds->user)
3897 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3898 if (dcreds->group)
3899 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3900 }
3901
00d9ef85
LP
3902 if (user_lookup_fd >= 0)
3903 dont_close[n_dont_close++] = user_lookup_fd;
3904
a34ceba6
LP
3905 return close_all_fds(dont_close, n_dont_close);
3906}
3907
00d9ef85
LP
3908static int send_user_lookup(
3909 Unit *unit,
3910 int user_lookup_fd,
3911 uid_t uid,
3912 gid_t gid) {
3913
3914 assert(unit);
3915
3916 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3917 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3918 * specified. */
3919
3920 if (user_lookup_fd < 0)
3921 return 0;
3922
3923 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3924 return 0;
3925
3926 if (writev(user_lookup_fd,
3927 (struct iovec[]) {
e6a7ec4b
LP
3928 IOVEC_INIT(&uid, sizeof(uid)),
3929 IOVEC_INIT(&gid, sizeof(gid)),
3930 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3931 return -errno;
3932
3933 return 0;
3934}
3935
6732edab
LP
3936static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3937 int r;
3938
3939 assert(c);
3940 assert(home);
3941 assert(buf);
3942
3943 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3944
3945 if (*home)
3946 return 0;
3947
3948 if (!c->working_directory_home)
3949 return 0;
3950
6732edab
LP
3951 r = get_home_dir(buf);
3952 if (r < 0)
3953 return r;
3954
3955 *home = *buf;
3956 return 1;
3957}
3958
da50b85a
LP
3959static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3960 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3961 int r;
3962
3963 assert(c);
3964 assert(p);
3965 assert(ret);
3966
3967 assert(c->dynamic_user);
3968
3969 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3970 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3971 * directories. */
3972
5b10116e 3973 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3974 if (t == EXEC_DIRECTORY_CONFIGURATION)
3975 continue;
3976
3977 if (!p->prefix[t])
3978 continue;
3979
211a3d87 3980 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3981 char *e;
3982
494d0247 3983 if (exec_directory_is_private(c, t))
211a3d87 3984 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3985 else
211a3d87 3986 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3987 if (!e)
3988 return -ENOMEM;
3989
3990 r = strv_consume(&list, e);
3991 if (r < 0)
3992 return r;
3993 }
3994 }
3995
ae2a15bc 3996 *ret = TAKE_PTR(list);
da50b85a
LP
3997
3998 return 0;
3999}
4000
78f93209
LP
4001static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
4002 bool using_subcgroup;
4003 char *p;
4004
4005 assert(params);
4006 assert(ret);
4007
4008 if (!params->cgroup_path)
4009 return -EINVAL;
4010
4011 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4012 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4013 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4014 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4015 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4016 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4017 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4018 * flag, which is only passed for the former statements, not for the latter. */
4019
4020 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4021 if (using_subcgroup)
657ee2d8 4022 p = path_join(params->cgroup_path, ".control");
78f93209
LP
4023 else
4024 p = strdup(params->cgroup_path);
4025 if (!p)
4026 return -ENOMEM;
4027
4028 *ret = p;
4029 return using_subcgroup;
4030}
4031
e2b2fb7f
MS
4032static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4033 _cleanup_(cpu_set_reset) CPUSet s = {};
4034 int r;
4035
4036 assert(c);
4037 assert(ret);
4038
4039 if (!c->numa_policy.nodes.set) {
4040 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4041 return 0;
4042 }
4043
4044 r = numa_to_cpu_set(&c->numa_policy, &s);
4045 if (r < 0)
4046 return r;
4047
4048 cpu_set_reset(ret);
4049
4050 return cpu_set_add_all(ret, &s);
4051}
4052
4053bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4054 assert(c);
4055
4056 return c->cpu_affinity_from_numa;
4057}
4058
1da37e58
ZJS
4059static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4060 int r;
4061
4062 assert(fds);
4063 assert(n_fds);
4064 assert(*n_fds < fds_size);
4065 assert(ret_fd);
4066
4067 if (fd < 0) {
4068 *ret_fd = -1;
4069 return 0;
4070 }
4071
4072 if (fd < 3 + (int) *n_fds) {
4073 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4074 * the fds we pass to the process (or which are closed only during execve). */
4075
4076 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4077 if (r < 0)
4078 return -errno;
4079
4080 CLOSE_AND_REPLACE(fd, r);
4081 }
4082
4083 *ret_fd = fds[*n_fds] = fd;
4084 (*n_fds) ++;
4085 return 1;
4086}
4087
ff0af2a1 4088static int exec_child(
f2341e0a 4089 Unit *unit,
34cf6c43 4090 const ExecCommand *command,
ff0af2a1
LP
4091 const ExecContext *context,
4092 const ExecParameters *params,
4093 ExecRuntime *runtime,
29206d46 4094 DynamicCreds *dcreds,
ff0af2a1 4095 int socket_fd,
2caa38e9 4096 const int named_iofds[static 3],
4c47affc 4097 int *fds,
da6053d0 4098 size_t n_socket_fds,
25b583d7 4099 size_t n_storage_fds,
ff0af2a1 4100 char **files_env,
00d9ef85 4101 int user_lookup_fd,
12145637 4102 int *exit_status) {
d35fbf6b 4103
8c35c10d 4104 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 4105 int r, ngids = 0, exec_fd;
4d885bd3
DH
4106 _cleanup_free_ gid_t *supplementary_gids = NULL;
4107 const char *username = NULL, *groupname = NULL;
5686391b 4108 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 4109 const char *home = NULL, *shell = NULL;
7ca69792 4110 char **final_argv = NULL;
7bce046b
LP
4111 dev_t journal_stream_dev = 0;
4112 ino_t journal_stream_ino = 0;
5749f855 4113 bool userns_set_up = false;
165a31c0
LP
4114 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4115 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4116 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4117 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 4118#if HAVE_SELINUX
7f59dd35 4119 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 4120 bool use_selinux = false;
ecfbc84f 4121#endif
f9fa32f0 4122#if ENABLE_SMACK
43b1f709 4123 bool use_smack = false;
ecfbc84f 4124#endif
349cc4a5 4125#if HAVE_APPARMOR
43b1f709 4126 bool use_apparmor = false;
ecfbc84f 4127#endif
5749f855
AZ
4128 uid_t saved_uid = getuid();
4129 gid_t saved_gid = getgid();
fed1e721
LP
4130 uid_t uid = UID_INVALID;
4131 gid_t gid = GID_INVALID;
1da37e58
ZJS
4132 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4133 n_keep_fds; /* total number of fds not to close */
165a31c0 4134 int secure_bits;
afb11bf1
DG
4135 _cleanup_free_ gid_t *gids_after_pam = NULL;
4136 int ngids_after_pam = 0;
034c6ed7 4137
f2341e0a 4138 assert(unit);
5cb5a6ff
LP
4139 assert(command);
4140 assert(context);
d35fbf6b 4141 assert(params);
ff0af2a1 4142 assert(exit_status);
d35fbf6b 4143
69339ae9
LP
4144 /* Explicitly test for CVE-2021-4034 inspired invocations */
4145 assert(command->path);
4146 assert(!strv_isempty(command->argv));
4147
d35fbf6b
DM
4148 rename_process_from_path(command->path);
4149
9c274488
LP
4150 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4151 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4152 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4153 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4154 SIGNALS_IGNORE);
d35fbf6b
DM
4155
4156 if (context->ignore_sigpipe)
9c274488 4157 (void) ignore_signals(SIGPIPE);
d35fbf6b 4158
ff0af2a1
LP
4159 r = reset_signal_mask();
4160 if (r < 0) {
4161 *exit_status = EXIT_SIGNAL_MASK;
12145637 4162 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4163 }
034c6ed7 4164
d35fbf6b
DM
4165 if (params->idle_pipe)
4166 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4167
2c027c62
LP
4168 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4169 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4170 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4171 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4172
d35fbf6b 4173 log_forget_fds();
2c027c62 4174 log_set_open_when_needed(true);
4f2d528d 4175
40a80078
LP
4176 /* In case anything used libc syslog(), close this here, too */
4177 closelog();
4178
b1994387 4179 int keep_fds[n_fds + 3];
1da37e58
ZJS
4180 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4181 n_keep_fds = n_fds;
4182
4183 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4184 if (r < 0) {
4185 *exit_status = EXIT_FDS;
4186 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4187 }
4188
b1994387 4189#if HAVE_LIBBPF
46004616
ZJS
4190 if (unit->manager->restrict_fs) {
4191 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4192 if (bpf_map_fd < 0) {
4193 *exit_status = EXIT_FDS;
46004616 4194 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4195 }
4196
4197 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4198 if (r < 0) {
4199 *exit_status = EXIT_FDS;
4200 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4201 }
4202 }
4203#endif
4204
1da37e58 4205 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4206 if (r < 0) {
4207 *exit_status = EXIT_FDS;
12145637 4208 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4209 }
4210
0af07108
ZJS
4211 if (!context->same_pgrp &&
4212 setsid() < 0) {
4213 *exit_status = EXIT_SETSID;
4214 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4215 }
9e2f7c11 4216
1e22b5cd 4217 exec_context_tty_reset(context, params);
d35fbf6b 4218
c891efaf 4219 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4220 _cleanup_free_ char *cmdline = NULL;
4221
4ef15008 4222 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4223 if (!cmdline) {
0460aa5c 4224 *exit_status = EXIT_MEMORY;
12145637 4225 return log_oom();
3b20f877 4226 }
d35fbf6b 4227
4ef15008 4228 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4229 if (r != CONFIRM_EXECUTE) {
4230 if (r == CONFIRM_PRETEND_SUCCESS) {
4231 *exit_status = EXIT_SUCCESS;
4232 return 0;
4233 }
ff0af2a1 4234 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4235 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4236 "Execution cancelled by the user");
d35fbf6b
DM
4237 }
4238 }
1a63a750 4239
d521916d
LP
4240 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4241 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4242 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4243 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4244 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4245 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4246 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4247 *exit_status = EXIT_MEMORY;
4248 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4249 }
4250
29206d46 4251 if (context->dynamic_user && dcreds) {
da50b85a 4252 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4253
d521916d 4254 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4255 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4256 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4257 *exit_status = EXIT_USER;
12145637 4258 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4259 }
4260
da50b85a
LP
4261 r = compile_suggested_paths(context, params, &suggested_paths);
4262 if (r < 0) {
4263 *exit_status = EXIT_MEMORY;
4264 return log_oom();
4265 }
4266
4267 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4268 if (r < 0) {
4269 *exit_status = EXIT_USER;
d85ff944
YW
4270 if (r == -EILSEQ)
4271 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4272 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4273 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4274 }
524daa8c 4275
70dd455c 4276 if (!uid_is_valid(uid)) {
29206d46 4277 *exit_status = EXIT_USER;
d85ff944 4278 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4279 }
4280
4281 if (!gid_is_valid(gid)) {
4282 *exit_status = EXIT_USER;
d85ff944 4283 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4284 }
5bc7452b 4285
29206d46
LP
4286 if (dcreds->user)
4287 username = dcreds->user->name;
4288
4289 } else {
4d885bd3
DH
4290 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4291 if (r < 0) {
4292 *exit_status = EXIT_USER;
12145637 4293 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4294 }
5bc7452b 4295
4d885bd3
DH
4296 r = get_fixed_group(context, &groupname, &gid);
4297 if (r < 0) {
4298 *exit_status = EXIT_GROUP;
12145637 4299 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4300 }
cdc5d5c5 4301 }
29206d46 4302
cdc5d5c5
DH
4303 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4304 r = get_supplementary_groups(context, username, groupname, gid,
4305 &supplementary_gids, &ngids);
4306 if (r < 0) {
4307 *exit_status = EXIT_GROUP;
12145637 4308 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4309 }
5bc7452b 4310
00d9ef85
LP
4311 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4312 if (r < 0) {
4313 *exit_status = EXIT_USER;
12145637 4314 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4315 }
4316
4317 user_lookup_fd = safe_close(user_lookup_fd);
4318
6732edab
LP
4319 r = acquire_home(context, uid, &home, &home_buffer);
4320 if (r < 0) {
4321 *exit_status = EXIT_CHDIR;
12145637 4322 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4323 }
4324
d35fbf6b
DM
4325 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4326 * must be sure to drop O_NONBLOCK */
4327 if (socket_fd >= 0)
a34ceba6 4328 (void) fd_nonblock(socket_fd, false);
acbb0225 4329
4c70a4a7
MS
4330 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4331 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4332 if (params->cgroup_path) {
4333 _cleanup_free_ char *p = NULL;
4334
4335 r = exec_parameters_get_cgroup_path(params, &p);
4336 if (r < 0) {
4337 *exit_status = EXIT_CGROUP;
4338 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4339 }
4340
4341 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4342 if (r == -EUCLEAN) {
4343 *exit_status = EXIT_CGROUP;
4344 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4345 "because the cgroup or one of its parents or "
4346 "siblings is in the threaded mode: %m", p);
4347 }
4c70a4a7
MS
4348 if (r < 0) {
4349 *exit_status = EXIT_CGROUP;
4350 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4351 }
4352 }
4353
a8d08f39 4354 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4355 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4356 if (r < 0) {
4357 *exit_status = EXIT_NETWORK;
4358 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4359 }
4360 }
4361
a70581ff
XR
4362 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4363 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4364 if (r < 0) {
4365 *exit_status = EXIT_NAMESPACE;
4366 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4367 }
4368 }
4369
52c239d7 4370 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4371 if (r < 0) {
4372 *exit_status = EXIT_STDIN;
12145637 4373 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4374 }
034c6ed7 4375
52c239d7 4376 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4377 if (r < 0) {
4378 *exit_status = EXIT_STDOUT;
12145637 4379 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4380 }
4381
52c239d7 4382 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4383 if (r < 0) {
4384 *exit_status = EXIT_STDERR;
12145637 4385 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4386 }
4387
d35fbf6b 4388 if (context->oom_score_adjust_set) {
9f8168eb
LP
4389 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4390 * prohibit write access to this file, and we shouldn't trip up over that. */
4391 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4392 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4393 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4394 else if (r < 0) {
ff0af2a1 4395 *exit_status = EXIT_OOM_ADJUST;
12145637 4396 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4397 }
d35fbf6b
DM
4398 }
4399
ad21e542
ZJS
4400 if (context->coredump_filter_set) {
4401 r = set_coredump_filter(context->coredump_filter);
4402 if (ERRNO_IS_PRIVILEGE(r))
4403 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4404 else if (r < 0)
4405 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4406 }
4407
39090201
DJL
4408 if (context->nice_set) {
4409 r = setpriority_closest(context->nice);
4410 if (r < 0)
4411 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4412 }
613b411c 4413
d35fbf6b
DM
4414 if (context->cpu_sched_set) {
4415 struct sched_param param = {
4416 .sched_priority = context->cpu_sched_priority,
4417 };
4418
ff0af2a1
LP
4419 r = sched_setscheduler(0,
4420 context->cpu_sched_policy |
4421 (context->cpu_sched_reset_on_fork ?
4422 SCHED_RESET_ON_FORK : 0),
4423 &param);
4424 if (r < 0) {
4425 *exit_status = EXIT_SETSCHEDULER;
12145637 4426 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4427 }
d35fbf6b 4428 }
fc9b2a84 4429
e2b2fb7f
MS
4430 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4431 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4432 const CPUSet *cpu_set;
4433
4434 if (context->cpu_affinity_from_numa) {
4435 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4436 if (r < 0) {
4437 *exit_status = EXIT_CPUAFFINITY;
4438 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4439 }
4440
4441 cpu_set = &converted_cpu_set;
4442 } else
4443 cpu_set = &context->cpu_set;
4444
4445 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4446 *exit_status = EXIT_CPUAFFINITY;
12145637 4447 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4448 }
e2b2fb7f 4449 }
034c6ed7 4450
b070c7c0
MS
4451 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4452 r = apply_numa_policy(&context->numa_policy);
4453 if (r == -EOPNOTSUPP)
33fe9e3f 4454 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4455 else if (r < 0) {
4456 *exit_status = EXIT_NUMA_POLICY;
4457 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4458 }
4459 }
4460
d35fbf6b
DM
4461 if (context->ioprio_set)
4462 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4463 *exit_status = EXIT_IOPRIO;
12145637 4464 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4465 }
da726a4d 4466
d35fbf6b
DM
4467 if (context->timer_slack_nsec != NSEC_INFINITY)
4468 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4469 *exit_status = EXIT_TIMERSLACK;
12145637 4470 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4471 }
9eba9da4 4472
21022b9d
LP
4473 if (context->personality != PERSONALITY_INVALID) {
4474 r = safe_personality(context->personality);
4475 if (r < 0) {
ff0af2a1 4476 *exit_status = EXIT_PERSONALITY;
12145637 4477 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4478 }
21022b9d 4479 }
94f04347 4480
33331d11
VB
4481 if (context->utmp_id) {
4482 const char *line = context->tty_path ?
4483 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4484 NULL;
df0ff127 4485 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4486 line,
023a4f67
LP
4487 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4488 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4489 USER_PROCESS,
6a93917d 4490 username);
33331d11 4491 }
d35fbf6b 4492
08f67696 4493 if (uid_is_valid(uid)) {
ff0af2a1
LP
4494 r = chown_terminal(STDIN_FILENO, uid);
4495 if (r < 0) {
4496 *exit_status = EXIT_STDIN;
12145637 4497 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4498 }
d35fbf6b 4499 }
8e274523 4500
4e1dfa45 4501 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4502 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4503 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4504 * touch a single hierarchy too. */
584b8688 4505 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4506 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4507 if (r < 0) {
4508 *exit_status = EXIT_CGROUP;
12145637 4509 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4510 }
d35fbf6b 4511 }
034c6ed7 4512
211a3d87
LB
4513 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4514
5b10116e 4515 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4516 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4517 if (r < 0)
4518 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4519 }
94f04347 4520
bb0c0d6f
LP
4521 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4522 r = setup_credentials(context, params, unit->id, uid);
4523 if (r < 0) {
4524 *exit_status = EXIT_CREDENTIALS;
4525 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4526 }
4527 }
4528
7bce046b 4529 r = build_environment(
fd63e712 4530 unit,
7bce046b
LP
4531 context,
4532 params,
4533 n_fds,
4534 home,
4535 username,
4536 shell,
4537 journal_stream_dev,
4538 journal_stream_ino,
4539 &our_env);
2065ca69
JW
4540 if (r < 0) {
4541 *exit_status = EXIT_MEMORY;
12145637 4542 return log_oom();
2065ca69
JW
4543 }
4544
4545 r = build_pass_environment(context, &pass_env);
4546 if (r < 0) {
4547 *exit_status = EXIT_MEMORY;
12145637 4548 return log_oom();
2065ca69
JW
4549 }
4550
adf769b0
ZJS
4551 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4552 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4553 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4554 if (!strv_isempty(context->exec_search_path)) {
4555 _cleanup_free_ char *joined = NULL;
4556
4557 joined = strv_join(context->exec_search_path, ":");
4558 if (!joined) {
4559 *exit_status = EXIT_MEMORY;
4560 return log_oom();
4561 }
4562
4563 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4564 if (r < 0) {
4565 *exit_status = EXIT_MEMORY;
4566 return log_oom();
4567 }
4568 }
4569
4ab3d29f 4570 accum_env = strv_env_merge(params->environment,
2065ca69 4571 our_env,
8c35c10d 4572 joined_exec_search_path,
2065ca69
JW
4573 pass_env,
4574 context->environment,
44e5d006 4575 files_env);
2065ca69
JW
4576 if (!accum_env) {
4577 *exit_status = EXIT_MEMORY;
12145637 4578 return log_oom();
2065ca69 4579 }
1280503b 4580 accum_env = strv_env_clean(accum_env);
2065ca69 4581
096424d1 4582 (void) umask(context->umask);
b213e1c1 4583
b1edf445 4584 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4585 if (r < 0) {
4586 *exit_status = EXIT_KEYRING;
12145637 4587 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4588 }
4589
adf769b0
ZJS
4590 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4591 * from it. */
1703fa41 4592 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4593
adf769b0
ZJS
4594 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4595 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4596 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4597
adf769b0
ZJS
4598 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4599 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4600 * desired. */
165a31c0
LP
4601 if (needs_ambient_hack)
4602 needs_setuid = false;
4603 else
4604 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4605
4606 if (needs_sandboxing) {
adf769b0
ZJS
4607 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4608 * /sys being present. The actual MAC context application will happen later, as late as
4609 * possible, to avoid impacting our own code paths. */
7f18ef0a 4610
349cc4a5 4611#if HAVE_SELINUX
43b1f709 4612 use_selinux = mac_selinux_use();
7f18ef0a 4613#endif
f9fa32f0 4614#if ENABLE_SMACK
43b1f709 4615 use_smack = mac_smack_use();
7f18ef0a 4616#endif
349cc4a5 4617#if HAVE_APPARMOR
43b1f709 4618 use_apparmor = mac_apparmor_use();
7f18ef0a 4619#endif
165a31c0 4620 }
7f18ef0a 4621
ce932d2d
LP
4622 if (needs_sandboxing) {
4623 int which_failed;
4624
4625 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4626 * is set here. (See below.) */
4627
4628 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4629 if (r < 0) {
4630 *exit_status = EXIT_LIMITS;
4631 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4632 }
4633 }
4634
0af07108 4635 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4636 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4637 * wins here. (See above.) */
4638
1da37e58 4639 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4640 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4641 if (r < 0) {
4642 *exit_status = EXIT_PAM;
4643 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4644 }
ac45f971 4645
0af07108
ZJS
4646 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4647 if (ngids_after_pam < 0) {
4648 *exit_status = EXIT_MEMORY;
4649 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4650 }
b213e1c1 4651 }
5749f855 4652
0af07108 4653 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4654 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4655 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4656 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4657
4658 userns_set_up = true;
4659 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4660 if (r < 0) {
4661 *exit_status = EXIT_USER;
4662 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4663 }
4664 }
4665
a8d08f39
LP
4666 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4667
6e2d7c4f 4668 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4669 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4670 if (r == -EPERM)
4671 log_unit_warning_errno(unit, r,
4672 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4673 else if (r < 0) {
6e2d7c4f
MS
4674 *exit_status = EXIT_NETWORK;
4675 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4676 }
a8d08f39
LP
4677 } else if (context->network_namespace_path) {
4678 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4679 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4680 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4681 } else
4682 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4683 }
169c1bda 4684
a70581ff
XR
4685 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4686
4687 if (ns_type_supported(NAMESPACE_IPC)) {
4688 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4689 if (r == -EPERM)
4690 log_unit_warning_errno(unit, r,
4691 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4692 else if (r < 0) {
4693 *exit_status = EXIT_NAMESPACE;
4694 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4695 }
4696 } else if (context->ipc_namespace_path) {
4697 *exit_status = EXIT_NAMESPACE;
4698 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4699 "IPCNamespacePath= is not supported, refusing.");
4700 } else
4701 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4702 }
4703
ee818b89 4704 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4705 _cleanup_free_ char *error_path = NULL;
4706
9f71ba8d 4707 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4708 if (r < 0) {
4709 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4710 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4711 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4712 }
d35fbf6b 4713 }
81a2b7ce 4714
daf8f72b
LP
4715 if (needs_sandboxing) {
4716 r = apply_protect_hostname(unit, context, exit_status);
4717 if (r < 0)
4718 return r;
aecd5ac6
TM
4719 }
4720
5749f855
AZ
4721 /* Drop groups as early as possible.
4722 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4723 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4724 if (needs_setuid) {
afb11bf1
DG
4725 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4726 int ngids_to_enforce = 0;
4727
4728 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4729 ngids,
4730 gids_after_pam,
4731 ngids_after_pam,
4732 &gids_to_enforce);
4733 if (ngids_to_enforce < 0) {
4734 *exit_status = EXIT_MEMORY;
4735 return log_unit_error_errno(unit,
4736 ngids_to_enforce,
4737 "Failed to merge group lists. Group membership might be incorrect: %m");
4738 }
4739
4740 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4741 if (r < 0) {
4742 *exit_status = EXIT_GROUP;
12145637 4743 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4744 }
165a31c0 4745 }
096424d1 4746
5749f855
AZ
4747 /* If the user namespace was not set up above, try to do it now.
4748 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4749 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4750 * case of mount namespaces being less privileged when the mount point list is copied from a
4751 * different user namespace). */
9008e1ac 4752
5749f855
AZ
4753 if (needs_sandboxing && context->private_users && !userns_set_up) {
4754 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4755 if (r < 0) {
4756 *exit_status = EXIT_USER;
4757 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4758 }
4759 }
4760
9f71ba8d
ZJS
4761 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4762 * shall execute. */
4763
4764 _cleanup_free_ char *executable = NULL;
b83d5050 4765 _cleanup_close_ int executable_fd = -1;
8c35c10d 4766 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4767 if (r < 0) {
4768 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4769 log_unit_struct_errno(unit, LOG_INFO, r,
4770 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4771 LOG_UNIT_INVOCATION_ID(unit),
4772 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4773 command->path),
4774 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4775 return 0;
4776 }
4777
4778 *exit_status = EXIT_EXEC;
c2503e35
RH
4779
4780 return log_unit_struct_errno(unit, LOG_INFO, r,
4781 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4782 LOG_UNIT_INVOCATION_ID(unit),
4783 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4784 command->path),
4785 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4786 }
4787
b83d5050
ZJS
4788 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4789 if (r < 0) {
4790 *exit_status = EXIT_FDS;
4791 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4792 }
4793
9f71ba8d 4794#if HAVE_SELINUX
49590d67
MS
4795 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4796 int fd = -1;
4797
4798 if (socket_fd >= 0)
4799 fd = socket_fd;
4800 else if (params->n_socket_fds == 1)
4801 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4802 * use context from that fd to compute the label. */
4803 fd = params->fds[0];
4804
4805 if (fd >= 0) {
4806 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4807 if (r < 0) {
4808 if (!context->selinux_context_ignore) {
4809 *exit_status = EXIT_SELINUX_CONTEXT;
4810 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4811 }
4812 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4813 }
9f71ba8d
ZJS
4814 }
4815 }
4816#endif
4817
165a31c0 4818 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4819 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4820 * however if we have it as we want to keep it open until the final execve(). */
4821
1da37e58 4822 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4823 if (r >= 0)
4824 r = shift_fds(fds, n_fds);
4825 if (r >= 0)
25b583d7 4826 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4827 if (r < 0) {
4828 *exit_status = EXIT_FDS;
12145637 4829 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4830 }
e66cf1a3 4831
5686391b
LP
4832 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4833 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4834 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4835 * came this far. */
4836
165a31c0 4837 secure_bits = context->secure_bits;
e66cf1a3 4838
165a31c0
LP
4839 if (needs_sandboxing) {
4840 uint64_t bset;
e66cf1a3 4841
ce932d2d
LP
4842 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4843 * requested. (Note this is placed after the general resource limit initialization, see
4844 * above, in order to take precedence.) */
f4170c67
LP
4845 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4846 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4847 *exit_status = EXIT_LIMITS;
12145637 4848 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4849 }
4850 }
4851
37ac2744
JB
4852#if ENABLE_SMACK
4853 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4854 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4855 if (use_smack) {
b83d5050 4856 r = setup_smack(context, executable_fd);
29ff6247 4857 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4858 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4859 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4860 }
4861 }
4862#endif
4863
165a31c0
LP
4864 bset = context->capability_bounding_set;
4865 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4866 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4867 * instead of us doing that */
4868 if (needs_ambient_hack)
4869 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4870 (UINT64_C(1) << CAP_SETUID) |
4871 (UINT64_C(1) << CAP_SETGID);
4872
4873 if (!cap_test_all(bset)) {
4874 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4875 if (r < 0) {
4876 *exit_status = EXIT_CAPABILITIES;
12145637 4877 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4878 }
4c2630eb 4879 }
3b8bddde 4880
16fcb191
TK
4881 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4882 * keep-caps set.
4883 * To be able to raise the ambient capabilities after setresuid() they have to be
4884 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4885 * After setresuid() the ambient capabilities can be raised as they are present in
4886 * the permitted and inheritable set. However it is possible that someone wants to
4887 * set ambient capabilities without changing the user, so we also set the ambient
4888 * capabilities here.
4889 * The requested ambient capabilities are raised in the inheritable set if the
4890 * second argument is true. */
943800f4 4891 if (!needs_ambient_hack) {
755d4b67
IP
4892 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4893 if (r < 0) {
4894 *exit_status = EXIT_CAPABILITIES;
12145637 4895 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4896 }
755d4b67 4897 }
165a31c0 4898 }
755d4b67 4899
fa97f630
JB
4900 /* chroot to root directory first, before we lose the ability to chroot */
4901 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4902 if (r < 0)
4903 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4904
165a31c0 4905 if (needs_setuid) {
08f67696 4906 if (uid_is_valid(uid)) {
ff0af2a1
LP
4907 r = enforce_user(context, uid);
4908 if (r < 0) {
4909 *exit_status = EXIT_USER;
12145637 4910 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4911 }
165a31c0
LP
4912
4913 if (!needs_ambient_hack &&
4914 context->capability_ambient_set != 0) {
755d4b67 4915
16fcb191 4916 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4917 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4918 if (r < 0) {
4919 *exit_status = EXIT_CAPABILITIES;
12145637 4920 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4921 }
755d4b67 4922 }
5b6319dc 4923 }
165a31c0 4924 }
d35fbf6b 4925
56ef8db9
JB
4926 /* Apply working directory here, because the working directory might be on NFS and only the user running
4927 * this service might have the correct privilege to change to the working directory */
fa97f630 4928 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4929 if (r < 0)
4930 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4931
165a31c0 4932 if (needs_sandboxing) {
37ac2744 4933 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4934 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4935 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4936 * are restricted. */
4937
349cc4a5 4938#if HAVE_SELINUX
43b1f709 4939 if (use_selinux) {
5cd9cd35
LP
4940 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4941
4942 if (exec_context) {
4943 r = setexeccon(exec_context);
006d1864
TM
4944 if (r < 0) {
4945 if (!context->selinux_context_ignore) {
4946 *exit_status = EXIT_SELINUX_CONTEXT;
4947 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4948 }
4949 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4950 }
4951 }
4952 }
4953#endif
4954
349cc4a5 4955#if HAVE_APPARMOR
43b1f709 4956 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4957 r = aa_change_onexec(context->apparmor_profile);
4958 if (r < 0 && !context->apparmor_profile_ignore) {
4959 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4960 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4961 }
4962 }
4963#endif
4964
165a31c0 4965 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4966 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4967 * CAP_SETPCAP. */
4968 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4969 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4970 * effective set here.
4971 * The effective set is overwritten during execve with the following values:
4972 * - ambient set (for non-root processes)
4973 * - (inheritable | bounding) set (for root processes)
4974 *
4975 * Hence there is no security impact to raise it in the effective set before execve
4976 */
4977 r = capability_gain_cap_setpcap(NULL);
4978 if (r < 0) {
4979 *exit_status = EXIT_CAPABILITIES;
4980 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4981 }
755d4b67 4982 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4983 *exit_status = EXIT_SECUREBITS;
12145637 4984 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4985 }
dbdc4098 4986 }
5b6319dc 4987
59eeb84b 4988 if (context_has_no_new_privileges(context))
d35fbf6b 4989 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4990 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4991 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4992 }
4993
349cc4a5 4994#if HAVE_SECCOMP
469830d1
LP
4995 r = apply_address_families(unit, context);
4996 if (r < 0) {
4997 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4998 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4999 }
04aa0cb9 5000
469830d1
LP
5001 r = apply_memory_deny_write_execute(unit, context);
5002 if (r < 0) {
5003 *exit_status = EXIT_SECCOMP;
12145637 5004 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5005 }
f4170c67 5006
469830d1
LP
5007 r = apply_restrict_realtime(unit, context);
5008 if (r < 0) {
5009 *exit_status = EXIT_SECCOMP;
12145637 5010 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5011 }
5012
f69567cb
LP
5013 r = apply_restrict_suid_sgid(unit, context);
5014 if (r < 0) {
5015 *exit_status = EXIT_SECCOMP;
5016 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5017 }
5018
add00535
LP
5019 r = apply_restrict_namespaces(unit, context);
5020 if (r < 0) {
5021 *exit_status = EXIT_SECCOMP;
12145637 5022 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5023 }
5024
469830d1
LP
5025 r = apply_protect_sysctl(unit, context);
5026 if (r < 0) {
5027 *exit_status = EXIT_SECCOMP;
12145637 5028 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5029 }
5030
469830d1
LP
5031 r = apply_protect_kernel_modules(unit, context);
5032 if (r < 0) {
5033 *exit_status = EXIT_SECCOMP;
12145637 5034 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5035 }
5036
84703040
KK
5037 r = apply_protect_kernel_logs(unit, context);
5038 if (r < 0) {
5039 *exit_status = EXIT_SECCOMP;
5040 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5041 }
5042
fc64760d
KK
5043 r = apply_protect_clock(unit, context);
5044 if (r < 0) {
5045 *exit_status = EXIT_SECCOMP;
5046 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5047 }
5048
469830d1
LP
5049 r = apply_private_devices(unit, context);
5050 if (r < 0) {
5051 *exit_status = EXIT_SECCOMP;
12145637 5052 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5053 }
5054
5055 r = apply_syscall_archs(unit, context);
5056 if (r < 0) {
5057 *exit_status = EXIT_SECCOMP;
12145637 5058 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5059 }
5060
78e864e5
TM
5061 r = apply_lock_personality(unit, context);
5062 if (r < 0) {
5063 *exit_status = EXIT_SECCOMP;
12145637 5064 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5065 }
5066
9df2cdd8
TM
5067 r = apply_syscall_log(unit, context);
5068 if (r < 0) {
5069 *exit_status = EXIT_SECCOMP;
5070 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5071 }
5072
5cd9cd35
LP
5073 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5074 * by the filter as little as possible. */
165a31c0 5075 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5076 if (r < 0) {
5077 *exit_status = EXIT_SECCOMP;
12145637 5078 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5079 }
5080#endif
b1994387
ILG
5081
5082#if HAVE_LIBBPF
5083 r = apply_restrict_filesystems(unit, context);
5084 if (r < 0) {
5085 *exit_status = EXIT_BPF;
5086 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5087 }
5088#endif
5089
d35fbf6b 5090 }
034c6ed7 5091
00819cc1
LP
5092 if (!strv_isempty(context->unset_environment)) {
5093 char **ee = NULL;
5094
5095 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5096 if (!ee) {
5097 *exit_status = EXIT_MEMORY;
12145637 5098 return log_oom();
00819cc1
LP
5099 }
5100
130d3d22 5101 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5102 }
5103
7ca69792
AZ
5104 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5105 replaced_argv = replace_env_argv(command->argv, accum_env);
5106 if (!replaced_argv) {
5107 *exit_status = EXIT_MEMORY;
5108 return log_oom();
5109 }
5110 final_argv = replaced_argv;
5111 } else
5112 final_argv = command->argv;
034c6ed7 5113
f1d34068 5114 if (DEBUG_LOGGING) {
c2b2df60 5115 _cleanup_free_ char *line = NULL;
81a2b7ce 5116
4ef15008 5117 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
5118 if (!line) {
5119 *exit_status = EXIT_MEMORY;
5120 return log_oom();
5121 }
5122
5123 log_unit_struct(unit, LOG_DEBUG,
5124 "EXECUTABLE=%s", executable,
5125 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 5126 }
dd305ec9 5127
5686391b
LP
5128 if (exec_fd >= 0) {
5129 uint8_t hot = 1;
5130
5131 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5132 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5133
5134 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5135 *exit_status = EXIT_EXEC;
5136 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5137 }
5138 }
5139
a6d9111c 5140 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5141
5142 if (exec_fd >= 0) {
5143 uint8_t hot = 0;
5144
5145 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5146 * that POLLHUP on it no longer means execve() succeeded. */
5147
5148 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5149 *exit_status = EXIT_EXEC;
5150 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5151 }
5152 }
12145637 5153
ff0af2a1 5154 *exit_status = EXIT_EXEC;
9f71ba8d 5155 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5156}
81a2b7ce 5157
/* Forward declarations of two file-local helpers that exec_spawn() below calls; their definitions are not part of this excerpt. */
34cf6c43 5158static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5159static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5160
f2341e0a
LP
5161int exec_spawn(Unit *unit,
5162 ExecCommand *command,
d35fbf6b
DM
5163 const ExecContext *context,
5164 const ExecParameters *params,
5165 ExecRuntime *runtime,
29206d46 5166 DynamicCreds *dcreds,
d35fbf6b 5167 pid_t *ret) {
8351ceae 5168
ee39ca20 5169 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5170 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5171 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5172 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 5173 _cleanup_free_ char *line = NULL;
d35fbf6b 5174 pid_t pid;
8351ceae 5175
f2341e0a 5176 assert(unit);
d35fbf6b
DM
5177 assert(command);
5178 assert(context);
5179 assert(ret);
5180 assert(params);
25b583d7 5181 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5182
d35fbf6b
DM
5183 if (context->std_input == EXEC_INPUT_SOCKET ||
5184 context->std_output == EXEC_OUTPUT_SOCKET ||
5185 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5186
d85ff944
YW
5187 if (params->n_socket_fds > 1)
5188 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5189
d85ff944
YW
5190 if (params->n_socket_fds == 0)
5191 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5192
d35fbf6b
DM
5193 socket_fd = params->fds[0];
5194 } else {
5195 socket_fd = -1;
5196 fds = params->fds;
9b141911 5197 n_socket_fds = params->n_socket_fds;
25b583d7 5198 n_storage_fds = params->n_storage_fds;
d35fbf6b 5199 }
94f04347 5200
34cf6c43 5201 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5202 if (r < 0)
5203 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5204
f2341e0a 5205 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5206 if (r < 0)
f2341e0a 5207 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5208
4ef15008 5209 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
5210 if (!line)
5211 return log_oom();
fab56fc5 5212
9f71ba8d
ZJS
5213 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5214 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5215 mac_selinux_maybe_reload();
5216
c2503e35
RH
5217 log_unit_struct(unit, LOG_DEBUG,
5218 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5219 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5220 the mount namespace in the child, but we want to log
5221 from the parent, so we need to use the (possibly
5222 inaccurate) path here. */
5223 LOG_UNIT_INVOCATION_ID(unit));
12145637 5224
78f93209
LP
5225 if (params->cgroup_path) {
5226 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5227 if (r < 0)
5228 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5229 if (r > 0) { /* We are using a child cgroup */
5230 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5231 if (r < 0)
5232 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
5233
5234 /* Normally we would not propagate the oomd xattrs to children but since we created this
5235 * sub-cgroup internally we should do it. */
5236 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
5237 }
5238 }
5239
d35fbf6b
DM
5240 pid = fork();
5241 if (pid < 0)
74129a12 5242 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5243
5244 if (pid == 0) {
12145637 5245 int exit_status = EXIT_SUCCESS;
ff0af2a1 5246
f2341e0a
LP
5247 r = exec_child(unit,
5248 command,
ff0af2a1
LP
5249 context,
5250 params,
5251 runtime,
29206d46 5252 dcreds,
ff0af2a1 5253 socket_fd,
52c239d7 5254 named_iofds,
4c47affc 5255 fds,
9b141911 5256 n_socket_fds,
25b583d7 5257 n_storage_fds,
ff0af2a1 5258 files_env,
00d9ef85 5259 unit->manager->user_lookup_fds[1],
12145637
LP
5260 &exit_status);
5261
e1714f02
ZJS
5262 if (r < 0) {
5263 const char *status =
5264 exit_status_to_string(exit_status,
e04ed6db 5265 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5266
c2503e35
RH
5267 log_unit_struct_errno(unit, LOG_ERR, r,
5268 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5269 LOG_UNIT_INVOCATION_ID(unit),
5270 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5271 status, command->path),
5272 "EXECUTABLE=%s", command->path);
e1714f02 5273 }
4c2630eb 5274
ff0af2a1 5275 _exit(exit_status);
034c6ed7
LP
5276 }
5277
f2341e0a 5278 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5279
78f93209
LP
5280 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5281 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5282 * process will be killed too). */
5283 if (subcgroup_path)
5284 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5285
b58b4116 5286 exec_status_start(&command->exec_status, pid);
9fb86720 5287
034c6ed7 5288 *ret = pid;
5cb5a6ff
LP
5289 return 0;
5290}
5291
034c6ed7
LP
5292void exec_context_init(ExecContext *c) {
5293 assert(c);
5294
4c12626c 5295 c->umask = 0022;
0692548c 5296 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5297 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5298 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5299 c->syslog_level_prefix = true;
353e12c2 5300 c->ignore_sigpipe = true;
3a43da28 5301 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5302 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
5303 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5304 c->directories[t].mode = 0755;
12213aed 5305 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5306 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
5307 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5308 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5309 c->log_level_max = -1;
005bfaf1
TM
5310#if HAVE_SECCOMP
5311 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5312#endif
51462135
DDM
5313 c->tty_rows = UINT_MAX;
5314 c->tty_cols = UINT_MAX;
b070c7c0 5315 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5316}
5317
613b411c 5318void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5319 assert(c);
5320
6796073e
LP
5321 c->environment = strv_free(c->environment);
5322 c->environment_files = strv_free(c->environment_files);
b4c14404 5323 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5324 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5325
31ce987c 5326 rlimit_free_all(c->rlimit);
034c6ed7 5327
5b10116e 5328 for (size_t l = 0; l < 3; l++) {
52c239d7 5329 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5330 c->stdio_file[l] = mfree(c->stdio_file[l]);
5331 }
52c239d7 5332
a1e58e8e
LP
5333 c->working_directory = mfree(c->working_directory);
5334 c->root_directory = mfree(c->root_directory);
915e6d16 5335 c->root_image = mfree(c->root_image);
18d73705 5336 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5337 c->root_hash = mfree(c->root_hash);
5338 c->root_hash_size = 0;
5339 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5340 c->root_hash_sig = mfree(c->root_hash_sig);
5341 c->root_hash_sig_size = 0;
5342 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5343 c->root_verity = mfree(c->root_verity);
93f59701 5344 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5345 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5346 c->tty_path = mfree(c->tty_path);
5347 c->syslog_identifier = mfree(c->syslog_identifier);
5348 c->user = mfree(c->user);
5349 c->group = mfree(c->group);
034c6ed7 5350
6796073e 5351 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5352
a1e58e8e 5353 c->pam_name = mfree(c->pam_name);
5b6319dc 5354
2a624c36
AP
5355 c->read_only_paths = strv_free(c->read_only_paths);
5356 c->read_write_paths = strv_free(c->read_write_paths);
5357 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5358 c->exec_paths = strv_free(c->exec_paths);
5359 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5360 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5361
d2d6c096 5362 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5363 c->bind_mounts = NULL;
5364 c->n_bind_mounts = 0;
2abd4e38
YW
5365 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5366 c->temporary_filesystems = NULL;
5367 c->n_temporary_filesystems = 0;
b3d13314 5368 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5369
0985c7c4 5370 cpu_set_reset(&c->cpu_set);
b070c7c0 5371 numa_policy_reset(&c->numa_policy);
86a3475b 5372
a1e58e8e
LP
5373 c->utmp_id = mfree(c->utmp_id);
5374 c->selinux_context = mfree(c->selinux_context);
5375 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5376 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5377
b1994387
ILG
5378 c->restrict_filesystems = set_free(c->restrict_filesystems);
5379
8cfa775f 5380 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5381 c->syscall_archs = set_free(c->syscall_archs);
5382 c->address_families = set_free(c->address_families);
e66cf1a3 5383
5b10116e 5384 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5385 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5386
5387 c->log_level_max = -1;
5388
5389 exec_context_free_log_extra_fields(c);
08f3be7a 5390
5ac1530e
ZJS
5391 c->log_ratelimit_interval_usec = 0;
5392 c->log_ratelimit_burst = 0;
90fc172e 5393
08f3be7a
LP
5394 c->stdin_data = mfree(c->stdin_data);
5395 c->stdin_data_size = 0;
a8d08f39
LP
5396
5397 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5398 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5399
5400 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5401
43144be4 5402 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5403 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5404}
5405
34cf6c43 5406int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5407 assert(c);
5408
5409 if (!runtime_prefix)
5410 return 0;
5411
211a3d87 5412 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5413 _cleanup_free_ char *p = NULL;
e66cf1a3 5414
494d0247 5415 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5416 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5417 else
211a3d87 5418 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5419 if (!p)
5420 return -ENOMEM;
5421
7bc4bf4a
LP
5422 /* We execute this synchronously, since we need to be sure this is gone when we start the
5423 * service next. */
c6878637 5424 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5425
211a3d87
LB
5426 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5427 _cleanup_free_ char *symlink_abs = NULL;
5428
5429 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5430 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5431 else
5432 symlink_abs = path_join(runtime_prefix, *symlink);
5433 if (!symlink_abs)
5434 return -ENOMEM;
5435
5436 (void) unlink(symlink_abs);
5437 }
5438
e66cf1a3
LP
5439 }
5440
5441 return 0;
5cb5a6ff
LP
5442}
5443
bb0c0d6f
LP
5444int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5445 _cleanup_free_ char *p = NULL;
5446
5447 assert(c);
5448
5449 if (!runtime_prefix || !unit)
5450 return 0;
5451
5452 p = path_join(runtime_prefix, "credentials", unit);
5453 if (!p)
5454 return -ENOMEM;
5455
5456 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5457 * unmount it, and afterwards remove the mount point */
5458 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5459 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5460
5461 return 0;
5462}
5463
34cf6c43 5464static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5465 assert(c);
5466
a1e58e8e 5467 c->path = mfree(c->path);
6796073e 5468 c->argv = strv_free(c->argv);
43d0fcbd
LP
5469}
5470
da6053d0 5471void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5472 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5473 exec_command_done(c+i);
5474}
5475
f1acf85a 5476ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5477 ExecCommand *i;
5478
5479 while ((i = c)) {
71fda00f 5480 LIST_REMOVE(command, c, i);
43d0fcbd 5481 exec_command_done(i);
5cb5a6ff
LP
5482 free(i);
5483 }
f1acf85a
ZJS
5484
5485 return NULL;
5cb5a6ff
LP
5486}
5487
da6053d0 5488void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5489 for (size_t i = 0; i < n; i++)
f1acf85a 5490 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5491}
5492
6a1d4d9f 5493void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5494 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5495 exec_status_reset(&c[i].exec_status);
5496}
5497
5498void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5499 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5500 LIST_FOREACH(command, z, c[i])
5501 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5502}
5503
039f0e70 5504typedef struct InvalidEnvInfo {
34cf6c43 5505 const Unit *unit;
039f0e70
LP
5506 const char *path;
5507} InvalidEnvInfo;
5508
5509static void invalid_env(const char *p, void *userdata) {
5510 InvalidEnvInfo *info = userdata;
5511
f2341e0a 5512 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5513}
5514
52c239d7
LB
5515const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5516 assert(c);
5517
5518 switch (fd_index) {
5073ff6b 5519
52c239d7
LB
5520 case STDIN_FILENO:
5521 if (c->std_input != EXEC_INPUT_NAMED_FD)
5522 return NULL;
5073ff6b 5523
52c239d7 5524 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5525
52c239d7
LB
5526 case STDOUT_FILENO:
5527 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5528 return NULL;
5073ff6b 5529
52c239d7 5530 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5531
52c239d7
LB
5532 case STDERR_FILENO:
5533 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5534 return NULL;
5073ff6b 5535
52c239d7 5536 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5537
52c239d7
LB
5538 default:
5539 return NULL;
5540 }
5541}
5542
2caa38e9
LP
5543static int exec_context_named_iofds(
5544 const ExecContext *c,
5545 const ExecParameters *p,
5546 int named_iofds[static 3]) {
5547
5b10116e 5548 size_t targets;
56fbd561 5549 const char* stdio_fdname[3];
da6053d0 5550 size_t n_fds;
52c239d7
LB
5551
5552 assert(c);
5553 assert(p);
2caa38e9 5554 assert(named_iofds);
52c239d7
LB
5555
5556 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5557 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5558 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5559
5b10116e 5560 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5561 stdio_fdname[i] = exec_context_fdname(c, i);
5562
4c47affc
FB
5563 n_fds = p->n_storage_fds + p->n_socket_fds;
5564
5b10116e 5565 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5566 if (named_iofds[STDIN_FILENO] < 0 &&
5567 c->std_input == EXEC_INPUT_NAMED_FD &&
5568 stdio_fdname[STDIN_FILENO] &&
5569 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5570
52c239d7
LB
5571 named_iofds[STDIN_FILENO] = p->fds[i];
5572 targets--;
56fbd561
ZJS
5573
5574 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5575 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5576 stdio_fdname[STDOUT_FILENO] &&
5577 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5578
52c239d7
LB
5579 named_iofds[STDOUT_FILENO] = p->fds[i];
5580 targets--;
56fbd561
ZJS
5581
5582 } else if (named_iofds[STDERR_FILENO] < 0 &&
5583 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5584 stdio_fdname[STDERR_FILENO] &&
5585 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5586
52c239d7
LB
5587 named_iofds[STDERR_FILENO] = p->fds[i];
5588 targets--;
5589 }
5590
56fbd561 5591 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5592}
5593
398a5009
ZJS
5594static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5595 _cleanup_strv_free_ char **v = NULL;
398a5009 5596 int r;
8c7be95e
LP
5597
5598 assert(c);
398a5009 5599 assert(ret);
8c7be95e
LP
5600
5601 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5602 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5603 bool ignore = false;
5604 char *fn = *i;
8c7be95e
LP
5605
5606 if (fn[0] == '-') {
5607 ignore = true;
313cefa1 5608 fn++;
8c7be95e
LP
5609 }
5610
5611 if (!path_is_absolute(fn)) {
8c7be95e
LP
5612 if (ignore)
5613 continue;
8c7be95e
LP
5614 return -EINVAL;
5615 }
5616
2bef10ab 5617 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5618 r = safe_glob(fn, 0, &pglob);
5619 if (r < 0) {
2bef10ab
PL
5620 if (ignore)
5621 continue;
398a5009 5622 return r;
2bef10ab 5623 }
8c7be95e 5624
d8c92e8b
ZJS
5625 /* When we don't match anything, -ENOENT should be returned */
5626 assert(pglob.gl_pathc > 0);
5627
5b10116e 5628 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5629 _cleanup_strv_free_ char **p = NULL;
5630
5631 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5632 if (r < 0) {
2bef10ab
PL
5633 if (ignore)
5634 continue;
398a5009 5635 return r;
e9c1ea9d 5636 }
398a5009 5637
ebc05a09 5638 /* Log invalid environment variables with filename */
039f0e70
LP
5639 if (p) {
5640 InvalidEnvInfo info = {
f2341e0a 5641 .unit = unit,
039f0e70
LP
5642 .path = pglob.gl_pathv[n]
5643 };
5644
5645 p = strv_env_clean_with_callback(p, invalid_env, &info);
5646 }
8c7be95e 5647
398a5009
ZJS
5648 if (!v)
5649 v = TAKE_PTR(p);
2bef10ab 5650 else {
398a5009 5651 char **m = strv_env_merge(v, p);
c84a9488 5652 if (!m)
2bef10ab 5653 return -ENOMEM;
2bef10ab 5654
398a5009 5655 strv_free_and_replace(v, m);
2bef10ab 5656 }
8c7be95e
LP
5657 }
5658 }
5659
398a5009 5660 *ret = TAKE_PTR(v);
8c7be95e
LP
5661
5662 return 0;
5663}
5664
6ac8fdc9 5665static bool tty_may_match_dev_console(const char *tty) {
7b912648 5666 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5667
1e22b5cd
LP
5668 if (!tty)
5669 return true;
5670
a119ec7c 5671 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5672
5673 /* trivial identity? */
5674 if (streq(tty, "console"))
5675 return true;
5676
7b912648
LP
5677 if (resolve_dev_console(&resolved) < 0)
5678 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5679
5680 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5681 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5682}
5683
6c0ae739
LP
5684static bool exec_context_may_touch_tty(const ExecContext *ec) {
5685 assert(ec);
1e22b5cd 5686
6c0ae739 5687 return ec->tty_reset ||
1e22b5cd
LP
5688 ec->tty_vhangup ||
5689 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5690 is_terminal_input(ec->std_input) ||
5691 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5692 is_terminal_output(ec->std_error);
5693}
5694
5695bool exec_context_may_touch_console(const ExecContext *ec) {
5696
5697 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5698 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5699}
5700
/* Writes every string of the NULL-terminated vector l to f, each preceded by a single space. A NULL
 * vector writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
5707
ddc155b2
TM
/* Writes one "<prefix><name>: item item ...\n" line to f for the given string vector. Nothing is
 * written if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5719
34cf6c43 5720void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5721 int r;
9eba9da4 5722
5cb5a6ff
LP
5723 assert(c);
5724 assert(f);
5725
4ad49000 5726 prefix = strempty(prefix);
5cb5a6ff
LP
5727
5728 fprintf(f,
94f04347
LP
5729 "%sUMask: %04o\n"
5730 "%sWorkingDirectory: %s\n"
451a074f 5731 "%sRootDirectory: %s\n"
15ae422b 5732 "%sNonBlocking: %s\n"
64747e2d 5733 "%sPrivateTmp: %s\n"
7f112f50 5734 "%sPrivateDevices: %s\n"
59eeb84b 5735 "%sProtectKernelTunables: %s\n"
e66a2f65 5736 "%sProtectKernelModules: %s\n"
84703040 5737 "%sProtectKernelLogs: %s\n"
fc64760d 5738 "%sProtectClock: %s\n"
59eeb84b 5739 "%sProtectControlGroups: %s\n"
d251207d
LP
5740 "%sPrivateNetwork: %s\n"
5741 "%sPrivateUsers: %s\n"
1b8689f9
LP
5742 "%sProtectHome: %s\n"
5743 "%sProtectSystem: %s\n"
5d997827 5744 "%sMountAPIVFS: %s\n"
f3e43635 5745 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5746 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5747 "%sRestrictRealtime: %s\n"
f69567cb 5748 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5749 "%sKeyringMode: %s\n"
4e399953
LP
5750 "%sProtectHostname: %s\n"
5751 "%sProtectProc: %s\n"
5752 "%sProcSubset: %s\n",
5cb5a6ff 5753 prefix, c->umask,
14eb3285
LP
5754 prefix, empty_to_root(c->working_directory),
5755 prefix, empty_to_root(c->root_directory),
15ae422b 5756 prefix, yes_no(c->non_blocking),
64747e2d 5757 prefix, yes_no(c->private_tmp),
7f112f50 5758 prefix, yes_no(c->private_devices),
59eeb84b 5759 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5760 prefix, yes_no(c->protect_kernel_modules),
84703040 5761 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5762 prefix, yes_no(c->protect_clock),
59eeb84b 5763 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5764 prefix, yes_no(c->private_network),
5765 prefix, yes_no(c->private_users),
1b8689f9
LP
5766 prefix, protect_home_to_string(c->protect_home),
5767 prefix, protect_system_to_string(c->protect_system),
5e98086d 5768 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5769 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5770 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5771 prefix, yes_no(c->restrict_realtime),
f69567cb 5772 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5773 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5774 prefix, yes_no(c->protect_hostname),
5775 prefix, protect_proc_to_string(c->protect_proc),
5776 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5777
915e6d16
LP
5778 if (c->root_image)
5779 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5780
18d73705 5781 if (c->root_image_options) {
18d73705
LB
5782 fprintf(f, "%sRootImageOptions:", prefix);
5783 LIST_FOREACH(mount_options, o, c->root_image_options)
5784 if (!isempty(o->options))
9ece6444
LB
5785 fprintf(f, " %s:%s",
5786 partition_designator_to_string(o->partition_designator),
5787 o->options);
18d73705
LB
5788 fprintf(f, "\n");
5789 }
5790
0389f4fa
LB
5791 if (c->root_hash) {
5792 _cleanup_free_ char *encoded = NULL;
5793 encoded = hexmem(c->root_hash, c->root_hash_size);
5794 if (encoded)
5795 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5796 }
5797
5798 if (c->root_hash_path)
5799 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5800
d4d55b0d
LB
5801 if (c->root_hash_sig) {
5802 _cleanup_free_ char *encoded = NULL;
5803 ssize_t len;
5804 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5805 if (len)
5806 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5807 }
5808
5809 if (c->root_hash_sig_path)
5810 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5811
0389f4fa
LB
5812 if (c->root_verity)
5813 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5814
8c7be95e
LP
5815 STRV_FOREACH(e, c->environment)
5816 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5817
5818 STRV_FOREACH(e, c->environment_files)
5819 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5820
b4c14404
FB
5821 STRV_FOREACH(e, c->pass_environment)
5822 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5823
00819cc1
LP
5824 STRV_FOREACH(e, c->unset_environment)
5825 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5826
53f47dfc
YW
5827 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5828
5b10116e 5829 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5830 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5831
211a3d87
LB
5832 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5833 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5834
5835 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5836 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5837 }
3536f49e 5838 }
c2bbd90b 5839
5291f26d 5840 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5841
fb33a393 5842 if (c->nice_set)
5291f26d 5843 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5844
dd6c17b1 5845 if (c->oom_score_adjust_set)
5291f26d 5846 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5847
ad21e542 5848 if (c->coredump_filter_set)
5291f26d 5849 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5850
5b10116e 5851 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5852 if (c->rlimit[i]) {
4c3a2b84 5853 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5854 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5855 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5856 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5857 }
94f04347 5858
f8b69d1d 5859 if (c->ioprio_set) {
1756a011 5860 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5861
5bead76e 5862 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5863 if (r >= 0)
5864 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5865
5bead76e 5866 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5867 }
94f04347 5868
f8b69d1d 5869 if (c->cpu_sched_set) {
1756a011 5870 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5871
837df140
YW
5872 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5873 if (r >= 0)
5874 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5875
94f04347 5876 fprintf(f,
38b48754
LP
5877 "%sCPUSchedulingPriority: %i\n"
5878 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5879 prefix, c->cpu_sched_priority,
5880 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5881 }
94f04347 5882
0985c7c4 5883 if (c->cpu_set.set) {
e7fca352
MS
5884 _cleanup_free_ char *affinity = NULL;
5885
5886 affinity = cpu_set_to_range_string(&c->cpu_set);
5887 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5888 }
5889
b070c7c0
MS
5890 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5891 _cleanup_free_ char *nodes = NULL;
5892
5893 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5894 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5895 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5896 }
5897
3a43da28 5898 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5899 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5900
5901 fprintf(f,
80876c20
LP
5902 "%sStandardInput: %s\n"
5903 "%sStandardOutput: %s\n"
5904 "%sStandardError: %s\n",
5905 prefix, exec_input_to_string(c->std_input),
5906 prefix, exec_output_to_string(c->std_output),
5907 prefix, exec_output_to_string(c->std_error));
5908
befc4a80
LP
5909 if (c->std_input == EXEC_INPUT_NAMED_FD)
5910 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5911 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5912 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5913 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5914 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5915
5916 if (c->std_input == EXEC_INPUT_FILE)
5917 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5918 if (c->std_output == EXEC_OUTPUT_FILE)
5919 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5920 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5921 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5922 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5923 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5924 if (c->std_error == EXEC_OUTPUT_FILE)
5925 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5926 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5927 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5928 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5929 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5930
80876c20
LP
5931 if (c->tty_path)
5932 fprintf(f,
6ea832a2
LP
5933 "%sTTYPath: %s\n"
5934 "%sTTYReset: %s\n"
5935 "%sTTYVHangup: %s\n"
51462135
DDM
5936 "%sTTYVTDisallocate: %s\n"
5937 "%sTTYRows: %u\n"
5938 "%sTTYColumns: %u\n",
6ea832a2
LP
5939 prefix, c->tty_path,
5940 prefix, yes_no(c->tty_reset),
5941 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5942 prefix, yes_no(c->tty_vt_disallocate),
5943 prefix, c->tty_rows,
5944 prefix, c->tty_cols);
94f04347 5945
9f6444eb 5946 if (IN_SET(c->std_output,
9f6444eb
LP
5947 EXEC_OUTPUT_KMSG,
5948 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5949 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5950 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5951 IN_SET(c->std_error,
9f6444eb
LP
5952 EXEC_OUTPUT_KMSG,
5953 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5954 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5955 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5956
5ce70e5b 5957 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5958
837df140
YW
5959 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5960 if (r >= 0)
5961 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5962
837df140
YW
5963 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5964 if (r >= 0)
5965 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5966 }
94f04347 5967
d3070fbd
LP
5968 if (c->log_level_max >= 0) {
5969 _cleanup_free_ char *t = NULL;
5970
5971 (void) log_level_to_string_alloc(c->log_level_max, &t);
5972
5973 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5974 }
5975
5291f26d 5976 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5977 fprintf(f,
5978 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5979 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5980
5ac1530e
ZJS
5981 if (c->log_ratelimit_burst > 0)
5982 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5983
5b10116e
ZJS
5984 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5985 fprintf(f, "%sLogExtraFields: ", prefix);
5986 fwrite(c->log_extra_fields[j].iov_base,
5987 1, c->log_extra_fields[j].iov_len,
5988 f);
5989 fputc('\n', f);
d3070fbd
LP
5990 }
5991
91dd5f7c
LP
5992 if (c->log_namespace)
5993 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5994
07d46372
YW
5995 if (c->secure_bits) {
5996 _cleanup_free_ char *str = NULL;
5997
5998 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5999 if (r >= 0)
6000 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6001 }
94f04347 6002
a103496c 6003 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 6004 _cleanup_free_ char *str = NULL;
94f04347 6005
dd1f5bd0
YW
6006 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6007 if (r >= 0)
6008 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6009 }
6010
6011 if (c->capability_ambient_set != 0) {
dd1f5bd0 6012 _cleanup_free_ char *str = NULL;
755d4b67 6013
dd1f5bd0
YW
6014 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6015 if (r >= 0)
6016 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6017 }
6018
6019 if (c->user)
f2d3769a 6020 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6021 if (c->group)
f2d3769a 6022 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6023
29206d46
LP
6024 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6025
ddc155b2 6026 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6027
5b6319dc 6028 if (c->pam_name)
f2d3769a 6029 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6030
ddc155b2
TM
6031 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6032 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6033 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6034 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6035 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6036 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6037
5b10116e
ZJS
6038 for (size_t i = 0; i < c->n_bind_mounts; i++)
6039 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6040 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6041 c->bind_mounts[i].ignore_enoent ? "-": "",
6042 c->bind_mounts[i].source,
6043 c->bind_mounts[i].destination,
6044 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6045
5b10116e
ZJS
6046 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6047 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6048
5b10116e
ZJS
6049 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6050 t->path,
6051 isempty(t->options) ? "" : ":",
6052 strempty(t->options));
6053 }
2abd4e38 6054
169c1bda
LP
6055 if (c->utmp_id)
6056 fprintf(f,
6057 "%sUtmpIdentifier: %s\n",
6058 prefix, c->utmp_id);
7b52a628
MS
6059
6060 if (c->selinux_context)
6061 fprintf(f,
5f8640fb
LP
6062 "%sSELinuxContext: %s%s\n",
6063 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6064
80c21aea
WC
6065 if (c->apparmor_profile)
6066 fprintf(f,
6067 "%sAppArmorProfile: %s%s\n",
6068 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6069
6070 if (c->smack_process_label)
6071 fprintf(f,
6072 "%sSmackProcessLabel: %s%s\n",
6073 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6074
050f7277 6075 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6076 fprintf(f,
6077 "%sPersonality: %s\n",
6078 prefix, strna(personality_to_string(c->personality)));
6079
78e864e5
TM
6080 fprintf(f,
6081 "%sLockPersonality: %s\n",
6082 prefix, yes_no(c->lock_personality));
6083
17df7223 6084 if (c->syscall_filter) {
17df7223 6085 fprintf(f,
57183d11 6086 "%sSystemCallFilter: ",
17df7223
LP
6087 prefix);
6088
6b000af4 6089 if (!c->syscall_allow_list)
17df7223
LP
6090 fputc('~', f);
6091
349cc4a5 6092#if HAVE_SECCOMP
d5a99b7c
JJ
6093 void *id, *val;
6094 bool first = true;
90e74a66 6095 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6096 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6097 const char *errno_name = NULL;
6098 int num = PTR_TO_INT(val);
17df7223
LP
6099
6100 if (first)
6101 first = false;
6102 else
6103 fputc(' ', f);
6104
57183d11 6105 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6106 fputs(strna(name), f);
8cfa775f
YW
6107
6108 if (num >= 0) {
005bfaf1 6109 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6110 if (errno_name)
6111 fprintf(f, ":%s", errno_name);
6112 else
6113 fprintf(f, ":%d", num);
6114 }
17df7223 6115 }
351a19b1 6116#endif
17df7223
LP
6117
6118 fputc('\n', f);
6119 }
6120
57183d11 6121 if (c->syscall_archs) {
57183d11
LP
6122 fprintf(f,
6123 "%sSystemCallArchitectures:",
6124 prefix);
6125
349cc4a5 6126#if HAVE_SECCOMP
d5a99b7c 6127 void *id;
90e74a66 6128 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6129 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6130#endif
6131 fputc('\n', f);
6132 }
6133
add00535
LP
6134 if (exec_context_restrict_namespaces_set(c)) {
6135 _cleanup_free_ char *s = NULL;
6136
86c2a9f1 6137 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6138 if (r >= 0)
6139 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6140 prefix, strna(s));
add00535
LP
6141 }
6142
b1994387 6143#if HAVE_LIBBPF
8fe84dc8
YW
6144 if (exec_context_restrict_filesystems_set(c)) {
6145 char *fs;
6146 SET_FOREACH(fs, c->restrict_filesystems)
6147 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6148 }
b1994387
ILG
6149#endif
6150
a8d08f39
LP
6151 if (c->network_namespace_path)
6152 fprintf(f,
6153 "%sNetworkNamespacePath: %s\n",
6154 prefix, c->network_namespace_path);
6155
3df90f24 6156 if (c->syscall_errno > 0) {
3df90f24
YW
6157 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6158
005bfaf1 6159#if HAVE_SECCOMP
d5a99b7c 6160 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6161 if (errno_name)
005bfaf1 6162 fputs(errno_name, f);
3df90f24 6163 else
005bfaf1
TM
6164 fprintf(f, "%d", c->syscall_errno);
6165#endif
6166 fputc('\n', f);
3df90f24 6167 }
b3d13314 6168
5b10116e 6169 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6170 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6171 c->mount_images[i].ignore_enoent ? "-": "",
6172 c->mount_images[i].source,
79e20ceb 6173 c->mount_images[i].destination);
427353f6 6174 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6175 fprintf(f, ":%s:%s",
427353f6 6176 partition_designator_to_string(o->partition_designator),
79e20ceb 6177 strempty(o->options));
427353f6
LB
6178 fprintf(f, "\n");
6179 }
93f59701
LB
6180
6181 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6182 fprintf(f, "%sExtensionImages: %s%s", prefix,
6183 c->extension_images[i].ignore_enoent ? "-": "",
6184 c->extension_images[i].source);
6185 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6186 fprintf(f, ":%s:%s",
6187 partition_designator_to_string(o->partition_designator),
6188 strempty(o->options));
6189 fprintf(f, "\n");
6190 }
a07b9926
LB
6191
6192 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6193}
6194
34cf6c43 6195bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6196 assert(c);
6197
61233823 6198 /* Returns true if the process forked off would run under
a931ad47
LP
6199 * an unchanged UID or as root. */
6200
6201 if (!c->user)
6202 return true;
6203
6204 if (streq(c->user, "root") || streq(c->user, "0"))
6205 return true;
6206
6207 return false;
6208}
6209
34cf6c43 6210int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6211 int p;
6212
6213 assert(c);
6214
6215 if (c->ioprio_set)
6216 return c->ioprio;
6217
6218 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6219 if (p < 0)
0692548c 6220 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6221
8b330d7d 6222 return ioprio_normalize(p);
7f452159
LP
6223}
6224
5e98086d
ZJS
6225bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6226 assert(c);
6227
61198784 6228 /* Explicit setting wins */
5e98086d
ZJS
6229 if (c->mount_apivfs_set)
6230 return c->mount_apivfs;
6231
61198784 6232 /* Default to "yes" if root directory or image are specified */
74e12520 6233 if (exec_context_with_rootfs(c))
61198784
ZJS
6234 return true;
6235
5e98086d
ZJS
6236 return false;
6237}
6238
d3070fbd 6239void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6240 assert(c);
6241
5b10116e 6242 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6243 free(c->log_extra_fields[l].iov_base);
6244 c->log_extra_fields = mfree(c->log_extra_fields);
6245 c->n_log_extra_fields = 0;
6246}
6247
6f765baf 6248void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
6249 _cleanup_close_ int fd = -1;
6250 const char *path;
6251 struct stat st;
6f765baf
LP
6252 int r;
6253
6254 assert(c);
6255
6256 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6257 exec_context_tty_reset(c, NULL);
6258
6259 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6260 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6261 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6262 if (!exec_context_may_touch_tty(c))
6263 return;
6f765baf 6264
0ba976e8
LP
6265 path = exec_context_tty_path(c);
6266 if (!path)
6267 return;
6f765baf 6268
0ba976e8
LP
6269 fd = open(path, O_PATH|O_CLOEXEC);
6270 if (fd < 0)
6271 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6272 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6273 path);
6274
6275 if (fstat(fd, &st) < 0)
6276 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6277
6278 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6279 * if things are a character device, since a proper check either means we'd have to open the TTY and
6280 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6281 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6282 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6283 if (!S_ISCHR(st.st_mode))
6284 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6285
6286 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6287 if (r < 0)
6288 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6289}
6290
4c2f5842
LP
6291int exec_context_get_clean_directories(
6292 ExecContext *c,
6293 char **prefix,
6294 ExecCleanMask mask,
6295 char ***ret) {
6296
6297 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6298 int r;
6299
6300 assert(c);
6301 assert(prefix);
6302 assert(ret);
6303
5b10116e 6304 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6305 if (!FLAGS_SET(mask, 1U << t))
6306 continue;
6307
6308 if (!prefix[t])
6309 continue;
6310
211a3d87 6311 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6312 char *j;
6313
211a3d87 6314 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6315 if (!j)
6316 return -ENOMEM;
6317
6318 r = strv_consume(&l, j);
6319 if (r < 0)
6320 return r;
7f622a19
YW
6321
6322 /* Also remove private directories unconditionally. */
6323 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6324 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6325 if (!j)
6326 return -ENOMEM;
6327
6328 r = strv_consume(&l, j);
6329 if (r < 0)
6330 return r;
6331 }
6332
211a3d87
LB
6333 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6334 j = path_join(prefix[t], *symlink);
7f622a19
YW
6335 if (!j)
6336 return -ENOMEM;
6337
6338 r = strv_consume(&l, j);
6339 if (r < 0)
6340 return r;
6341 }
4c2f5842
LP
6342 }
6343 }
6344
6345 *ret = TAKE_PTR(l);
6346 return 0;
6347}
6348
6349int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6350 ExecCleanMask mask = 0;
6351
6352 assert(c);
6353 assert(ret);
6354
6355 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6356 if (c->directories[t].n_items > 0)
4c2f5842
LP
6357 mask |= 1U << t;
6358
6359 *ret = mask;
6360 return 0;
6361}
6362
b58b4116 6363void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6364 assert(s);
5cb5a6ff 6365
2ed26ed0
LP
6366 *s = (ExecStatus) {
6367 .pid = pid,
6368 };
6369
b58b4116
LP
6370 dual_timestamp_get(&s->start_timestamp);
6371}
6372
34cf6c43 6373void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6374 assert(s);
6375
d46b79bb 6376 if (s->pid != pid)
2ed26ed0
LP
6377 *s = (ExecStatus) {
6378 .pid = pid,
6379 };
b58b4116 6380
63983207 6381 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6382
034c6ed7
LP
6383 s->code = code;
6384 s->status = status;
169c1bda 6385
6f765baf
LP
6386 if (context && context->utmp_id)
6387 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6388}
6389
6a1d4d9f
LP
6390void exec_status_reset(ExecStatus *s) {
6391 assert(s);
6392
6393 *s = (ExecStatus) {};
6394}
6395
34cf6c43 6396void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6397 assert(s);
6398 assert(f);
6399
9fb86720
LP
6400 if (s->pid <= 0)
6401 return;
6402
4c940960
LP
6403 prefix = strempty(prefix);
6404
9fb86720 6405 fprintf(f,
ccd06097
ZJS
6406 "%sPID: "PID_FMT"\n",
6407 prefix, s->pid);
9fb86720 6408
af9d16e1 6409 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6410 fprintf(f,
6411 "%sStart Timestamp: %s\n",
04f5c018 6412 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6413
af9d16e1 6414 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6415 fprintf(f,
6416 "%sExit Timestamp: %s\n"
6417 "%sExit Code: %s\n"
6418 "%sExit Status: %i\n",
04f5c018 6419 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6420 prefix, sigchld_code_to_string(s->code),
6421 prefix, s->status);
5cb5a6ff 6422}
44d8db9e 6423
34cf6c43 6424static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6425 _cleanup_free_ char *cmd = NULL;
4c940960 6426 const char *prefix2;
44d8db9e
LP
6427
6428 assert(c);
6429 assert(f);
6430
4c940960 6431 prefix = strempty(prefix);
63c372cb 6432 prefix2 = strjoina(prefix, "\t");
44d8db9e 6433
4ef15008 6434 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
44d8db9e
LP
6435 fprintf(f,
6436 "%sCommand Line: %s\n",
7c248223 6437 prefix, cmd ?: strerror_safe(ENOMEM));
44d8db9e 6438
9fb86720 6439 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6440}
6441
6442void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6443 assert(f);
6444
4c940960 6445 prefix = strempty(prefix);
44d8db9e 6446
03677889
YW
6447 LIST_FOREACH(command, i, c)
6448 exec_command_dump(i, f, prefix);
44d8db9e 6449}
94f04347 6450
a6a80b4f
LP
6451void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6452 ExecCommand *end;
6453
6454 assert(l);
6455 assert(e);
6456
6457 if (*l) {
35b8ca3a 6458 /* It's kind of important, that we keep the order here */
71fda00f
LP
6459 LIST_FIND_TAIL(command, *l, end);
6460 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6461 } else
6462 *l = e;
6463}
6464
26fd040d
LP
6465int exec_command_set(ExecCommand *c, const char *path, ...) {
6466 va_list ap;
6467 char **l, *p;
6468
6469 assert(c);
6470 assert(path);
6471
6472 va_start(ap, path);
6473 l = strv_new_ap(path, ap);
6474 va_end(ap);
6475
6476 if (!l)
6477 return -ENOMEM;
6478
250a918d
LP
6479 p = strdup(path);
6480 if (!p) {
26fd040d
LP
6481 strv_free(l);
6482 return -ENOMEM;
6483 }
6484
6897dfe8 6485 free_and_replace(c->path, p);
26fd040d 6486
130d3d22 6487 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6488}
6489
86b23b07 6490int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6491 _cleanup_strv_free_ char **l = NULL;
86b23b07 6492 va_list ap;
86b23b07
JS
6493 int r;
6494
6495 assert(c);
6496 assert(path);
6497
6498 va_start(ap, path);
6499 l = strv_new_ap(path, ap);
6500 va_end(ap);
6501
6502 if (!l)
6503 return -ENOMEM;
6504
e287086b 6505 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6506 if (r < 0)
86b23b07 6507 return r;
86b23b07
JS
6508
6509 return 0;
6510}
6511
e8a565cb
YW
6512static void *remove_tmpdir_thread(void *p) {
6513 _cleanup_free_ char *path = p;
86b23b07 6514
e8a565cb
YW
6515 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6516 return NULL;
6517}
6518
6519static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6520 int r;
6521
6522 if (!rt)
6523 return NULL;
6524
6525 if (rt->manager)
6526 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6527
6528 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6529
6530 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6531 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6532
6533 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6534 if (r < 0)
e8a565cb 6535 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6536 else
6537 rt->tmp_dir = NULL;
e8a565cb 6538 }
613b411c 6539
56a13a49 6540 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6541 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6542
6543 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6544 if (r < 0)
e8a565cb 6545 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6546 else
6547 rt->var_tmp_dir = NULL;
e8a565cb
YW
6548 }
6549
6550 rt->id = mfree(rt->id);
6551 rt->tmp_dir = mfree(rt->tmp_dir);
6552 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6553 safe_close_pair(rt->netns_storage_socket);
a70581ff 6554 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6555 return mfree(rt);
6556}
6557
6558static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6559 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6560}
6561
56a13a49
ZJS
6562static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6563 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6564 ExecRuntime *n;
613b411c 6565
8e8009dc 6566 assert(ret);
613b411c 6567
56a13a49
ZJS
6568 id_copy = strdup(id);
6569 if (!id_copy)
6570 return -ENOMEM;
6571
8e8009dc
LP
6572 n = new(ExecRuntime, 1);
6573 if (!n)
613b411c
LP
6574 return -ENOMEM;
6575
8e8009dc 6576 *n = (ExecRuntime) {
56a13a49 6577 .id = TAKE_PTR(id_copy),
8e8009dc 6578 .netns_storage_socket = { -1, -1 },
a70581ff 6579 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6580 };
6581
6582 *ret = n;
613b411c
LP
6583 return 0;
6584}
6585
e8a565cb
YW
6586static int exec_runtime_add(
6587 Manager *m,
6588 const char *id,
56a13a49
ZJS
6589 char **tmp_dir,
6590 char **var_tmp_dir,
6591 int netns_storage_socket[2],
a70581ff 6592 int ipcns_storage_socket[2],
e8a565cb
YW
6593 ExecRuntime **ret) {
6594
6595 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6596 int r;
6597
e8a565cb 6598 assert(m);
613b411c
LP
6599 assert(id);
6600
a70581ff 6601 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6602
56a13a49 6603 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6604 if (r < 0)
6605 return r;
6606
63083706 6607 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6608 if (r < 0)
6609 return r;
e8a565cb 6610
56a13a49
ZJS
6611 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6612 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6613 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6614
6615 if (netns_storage_socket) {
56a13a49
ZJS
6616 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6617 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6618 }
6619
a70581ff
XR
6620 if (ipcns_storage_socket) {
6621 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6622 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6623 }
6624
e8a565cb
YW
6625 rt->manager = m;
6626
6627 if (ret)
6628 *ret = rt;
e8a565cb 6629 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6630 TAKE_PTR(rt);
e8a565cb
YW
6631 return 0;
6632}
6633
74aaf59b
LP
6634static int exec_runtime_make(
6635 Manager *m,
6636 const ExecContext *c,
6637 const char *id,
6638 ExecRuntime **ret) {
6639
56a13a49 6640 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6641 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6642 int r;
6643
6644 assert(m);
6645 assert(c);
6646 assert(id);
6647
6648 /* It is not necessary to create ExecRuntime object. */
a70581ff 6649 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6650 *ret = NULL;
e8a565cb 6651 return 0;
74aaf59b 6652 }
e8a565cb 6653
efa2f3a1
TM
6654 if (c->private_tmp &&
6655 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6656 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6657 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6658 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6659 if (r < 0)
6660 return r;
6661 }
6662
a8d08f39 6663 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6664 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6665 return -errno;
6666 }
6667
a70581ff
XR
6668 if (c->private_ipc || c->ipc_namespace_path) {
6669 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6670 return -errno;
6671 }
6672
6673 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6674 if (r < 0)
6675 return r;
6676
613b411c
LP
6677 return 1;
6678}
6679
e8a565cb
YW
6680int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6681 ExecRuntime *rt;
6682 int r;
613b411c 6683
e8a565cb
YW
6684 assert(m);
6685 assert(id);
6686 assert(ret);
6687
6688 rt = hashmap_get(m->exec_runtime_by_id, id);
6689 if (rt)
387f6955 6690 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6691 goto ref;
6692
74aaf59b
LP
6693 if (!create) {
6694 *ret = NULL;
e8a565cb 6695 return 0;
74aaf59b 6696 }
e8a565cb
YW
6697
6698 /* If not found, then create a new object. */
6699 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6700 if (r < 0)
e8a565cb 6701 return r;
74aaf59b
LP
6702 if (r == 0) {
6703 /* When r == 0, it is not necessary to create ExecRuntime object. */
6704 *ret = NULL;
6705 return 0;
6706 }
613b411c 6707
e8a565cb
YW
6708ref:
6709 /* increment reference counter. */
6710 rt->n_ref++;
6711 *ret = rt;
6712 return 1;
6713}
613b411c 6714
e8a565cb
YW
6715ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6716 if (!rt)
613b411c
LP
6717 return NULL;
6718
e8a565cb 6719 assert(rt->n_ref > 0);
613b411c 6720
e8a565cb
YW
6721 rt->n_ref--;
6722 if (rt->n_ref > 0)
f2341e0a
LP
6723 return NULL;
6724
e8a565cb 6725 return exec_runtime_free(rt, destroy);
613b411c
LP
6726}
6727
e8a565cb
YW
6728int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6729 ExecRuntime *rt;
e8a565cb
YW
6730
6731 assert(m);
613b411c
LP
6732 assert(f);
6733 assert(fds);
6734
90e74a66 6735 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6736 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6737
e8a565cb
YW
6738 if (rt->tmp_dir)
6739 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6740
e8a565cb
YW
6741 if (rt->var_tmp_dir)
6742 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6743
e8a565cb
YW
6744 if (rt->netns_storage_socket[0] >= 0) {
6745 int copy;
613b411c 6746
e8a565cb
YW
6747 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6748 if (copy < 0)
6749 return copy;
613b411c 6750
e8a565cb
YW
6751 fprintf(f, " netns-socket-0=%i", copy);
6752 }
613b411c 6753
e8a565cb
YW
6754 if (rt->netns_storage_socket[1] >= 0) {
6755 int copy;
613b411c 6756
e8a565cb
YW
6757 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6758 if (copy < 0)
6759 return copy;
613b411c 6760
e8a565cb
YW
6761 fprintf(f, " netns-socket-1=%i", copy);
6762 }
6763
a70581ff
XR
6764 if (rt->ipcns_storage_socket[0] >= 0) {
6765 int copy;
6766
6767 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6768 if (copy < 0)
6769 return copy;
6770
6771 fprintf(f, " ipcns-socket-0=%i", copy);
6772 }
6773
6774 if (rt->ipcns_storage_socket[1] >= 0) {
6775 int copy;
6776
6777 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6778 if (copy < 0)
6779 return copy;
6780
6781 fprintf(f, " ipcns-socket-1=%i", copy);
6782 }
6783
e8a565cb 6784 fputc('\n', f);
613b411c
LP
6785 }
6786
6787 return 0;
6788}
6789
e8a565cb
YW
6790int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6791 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6792 ExecRuntime *rt;
613b411c
LP
6793 int r;
6794
e8a565cb
YW
6795 /* This is for the migration from old (v237 or earlier) deserialization text.
6796 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6797 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6798 * so or not from the serialized text, then we always creates a new object owned by this. */
6799
6800 assert(u);
613b411c
LP
6801 assert(key);
6802 assert(value);
6803
e8a565cb
YW
6804 /* Manager manages ExecRuntime objects by the unit id.
6805 * So, we omit the serialized text when the unit does not have id (yet?)... */
6806 if (isempty(u->id)) {
6807 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6808 return 0;
6809 }
613b411c 6810
cbc165d1
ZJS
6811 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6812 return log_oom();
e8a565cb
YW
6813
6814 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6815 if (!rt) {
cbc165d1 6816 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6817 return log_oom();
613b411c 6818
e8a565cb
YW
6819 rt = rt_create;
6820 }
6821
6822 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6823 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6824 return -ENOMEM;
613b411c
LP
6825
6826 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6827 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6828 return -ENOMEM;
613b411c
LP
6829
6830 } else if (streq(key, "netns-socket-0")) {
6831 int fd;
6832
e8a565cb 6833 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6834 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6835 return 0;
613b411c 6836 }
e8a565cb
YW
6837
6838 safe_close(rt->netns_storage_socket[0]);
6839 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6840
613b411c
LP
6841 } else if (streq(key, "netns-socket-1")) {
6842 int fd;
6843
e8a565cb 6844 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6845 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6846 return 0;
613b411c 6847 }
e8a565cb
YW
6848
6849 safe_close(rt->netns_storage_socket[1]);
6850 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6851
613b411c
LP
6852 } else
6853 return 0;
6854
e8a565cb
YW
6855 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6856 if (rt_create) {
6857 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6858 if (r < 0) {
3fe91079 6859 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6860 return 0;
6861 }
613b411c 6862
e8a565cb 6863 rt_create->manager = u->manager;
613b411c 6864
e8a565cb 6865 /* Avoid cleanup */
56a13a49 6866 TAKE_PTR(rt_create);
e8a565cb 6867 }
98b47d54 6868
e8a565cb
YW
6869 return 1;
6870}
613b411c 6871
56a13a49
ZJS
6872int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6873 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6874 char *id = NULL;
a70581ff 6875 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6876 const char *p, *v = value;
6877 size_t n;
613b411c 6878
e8a565cb
YW
6879 assert(m);
6880 assert(value);
6881 assert(fds);
98b47d54 6882
e8a565cb 6883 n = strcspn(v, " ");
2f82562b 6884 id = strndupa_safe(v, n);
e8a565cb
YW
6885 if (v[n] != ' ')
6886 goto finalize;
6887 p = v + n + 1;
6888
6889 v = startswith(p, "tmp-dir=");
6890 if (v) {
6891 n = strcspn(v, " ");
56a13a49
ZJS
6892 tmp_dir = strndup(v, n);
6893 if (!tmp_dir)
6894 return log_oom();
e8a565cb
YW
6895 if (v[n] != ' ')
6896 goto finalize;
6897 p = v + n + 1;
6898 }
6899
6900 v = startswith(p, "var-tmp-dir=");
6901 if (v) {
6902 n = strcspn(v, " ");
56a13a49
ZJS
6903 var_tmp_dir = strndup(v, n);
6904 if (!var_tmp_dir)
6905 return log_oom();
e8a565cb
YW
6906 if (v[n] != ' ')
6907 goto finalize;
6908 p = v + n + 1;
6909 }
6910
6911 v = startswith(p, "netns-socket-0=");
6912 if (v) {
6913 char *buf;
6914
6915 n = strcspn(v, " ");
2f82562b 6916 buf = strndupa_safe(v, n);
c413bb28 6917
a70581ff 6918 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6919 if (r < 0)
6920 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6921 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6922 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6923 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6924 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6925 if (v[n] != ' ')
6926 goto finalize;
6927 p = v + n + 1;
613b411c
LP
6928 }
6929
e8a565cb
YW
6930 v = startswith(p, "netns-socket-1=");
6931 if (v) {
6932 char *buf;
98b47d54 6933
e8a565cb 6934 n = strcspn(v, " ");
2f82562b 6935 buf = strndupa_safe(v, n);
a70581ff
XR
6936
6937 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6938 if (r < 0)
6939 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6940 if (!fdset_contains(fds, netns_fdpair[1]))
6941 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6942 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6943 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6944 if (v[n] != ' ')
6945 goto finalize;
6946 p = v + n + 1;
6947 }
6948
6949 v = startswith(p, "ipcns-socket-0=");
6950 if (v) {
6951 char *buf;
6952
6953 n = strcspn(v, " ");
2f82562b 6954 buf = strndupa_safe(v, n);
a70581ff
XR
6955
6956 r = safe_atoi(buf, &ipcns_fdpair[0]);
6957 if (r < 0)
6958 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6959 if (!fdset_contains(fds, ipcns_fdpair[0]))
6960 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6961 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6962 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6963 if (v[n] != ' ')
6964 goto finalize;
6965 p = v + n + 1;
6966 }
6967
6968 v = startswith(p, "ipcns-socket-1=");
6969 if (v) {
6970 char *buf;
6971
6972 n = strcspn(v, " ");
2f82562b 6973 buf = strndupa_safe(v, n);
a70581ff
XR
6974
6975 r = safe_atoi(buf, &ipcns_fdpair[1]);
6976 if (r < 0)
6977 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6978 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6979 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6980 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6981 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6982 }
98b47d54 6983
e8a565cb 6984finalize:
a70581ff 6985 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6986 if (r < 0)
56a13a49
ZJS
6987 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6988 return 0;
e8a565cb 6989}
613b411c 6990
e8a565cb
YW
6991void exec_runtime_vacuum(Manager *m) {
6992 ExecRuntime *rt;
e8a565cb
YW
6993
6994 assert(m);
6995
6996 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6997
90e74a66 6998 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6999 if (rt->n_ref > 0)
7000 continue;
7001
7002 (void) exec_runtime_free(rt, false);
7003 }
613b411c
LP
7004}
7005
b9c04eaf
YW
7006void exec_params_clear(ExecParameters *p) {
7007 if (!p)
7008 return;
7009
c3f8a065
LP
7010 p->environment = strv_free(p->environment);
7011 p->fd_names = strv_free(p->fd_names);
7012 p->fds = mfree(p->fds);
7013 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7014}
7015
bb0c0d6f
LP
7016ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7017 if (!sc)
7018 return NULL;
7019
7020 free(sc->id);
7021 free(sc->data);
7022 return mfree(sc);
7023}
7024
43144be4
LP
7025ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7026 if (!lc)
7027 return NULL;
7028
7029 free(lc->id);
7030 free(lc->path);
7031 return mfree(lc);
7032}
7033
211a3d87
LB
7034void exec_directory_done(ExecDirectory *d) {
7035 if (!d)
7036 return;
7037
7038 for (size_t i = 0; i < d->n_items; i++) {
7039 free(d->items[i].path);
7040 strv_free(d->items[i].symlinks);
7041 }
7042
7043 d->items = mfree(d->items);
7044 d->n_items = 0;
7045 d->mode = 0755;
7046}
7047
7048int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
7049 _cleanup_strv_free_ char **s = NULL;
7050 _cleanup_free_ char *p = NULL;
7051
7052 assert(d);
7053 assert(n);
7054 assert(path);
7055
7056 p = strdup(path);
7057 if (!p)
7058 return -ENOMEM;
7059
7060 if (symlinks) {
7061 s = strv_copy(symlinks);
7062 if (!s)
7063 return -ENOMEM;
7064 }
7065
7066 if (!GREEDY_REALLOC(*d, *n + 1))
7067 return -ENOMEM;
7068
7069 (*d)[(*n) ++] = (ExecDirectoryItem) {
7070 .path = TAKE_PTR(p),
7071 .symlinks = TAKE_PTR(s),
7072 };
7073
7074 return 0;
7075}
7076
bb0c0d6f 7077DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 7078DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 7079
80876c20
LP
7080static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7081 [EXEC_INPUT_NULL] = "null",
7082 [EXEC_INPUT_TTY] = "tty",
7083 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7084 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7085 [EXEC_INPUT_SOCKET] = "socket",
7086 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7087 [EXEC_INPUT_DATA] = "data",
2038c3f5 7088 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7089};
7090
8a0867d6
LP
7091DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7092
94f04347 7093static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7094 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7095 [EXEC_OUTPUT_NULL] = "null",
80876c20 7096 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7097 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7098 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7099 [EXEC_OUTPUT_JOURNAL] = "journal",
7100 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7101 [EXEC_OUTPUT_SOCKET] = "socket",
7102 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7103 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7104 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7105 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7106};
7107
7108DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7109
7110static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7111 [EXEC_UTMP_INIT] = "init",
7112 [EXEC_UTMP_LOGIN] = "login",
7113 [EXEC_UTMP_USER] = "user",
7114};
7115
7116DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7117
7118static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7119 [EXEC_PRESERVE_NO] = "no",
7120 [EXEC_PRESERVE_YES] = "yes",
7121 [EXEC_PRESERVE_RESTART] = "restart",
7122};
7123
7124DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7125
6b7b2ed9 7126/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7127static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7128 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7129 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7130 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7131 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7132 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7133};
7134
7135DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7136
211a3d87
LB
7137/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7138static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7139 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7140 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7141 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7142 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7143 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7144};
7145
7146DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7147
6b7b2ed9
LP
7148/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7149 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7150 * directories, specifically .timer units with their timestamp touch file. */
7151static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7152 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7153 [EXEC_DIRECTORY_STATE] = "state",
7154 [EXEC_DIRECTORY_CACHE] = "cache",
7155 [EXEC_DIRECTORY_LOGS] = "logs",
7156 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7157};
7158
7159DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7160
7161/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7162 * the service payload in. */
fb2042dd
YW
7163static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7164 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7165 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7166 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7167 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7168 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7169};
7170
7171DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7172
b1edf445
LP
7173static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7174 [EXEC_KEYRING_INHERIT] = "inherit",
7175 [EXEC_KEYRING_PRIVATE] = "private",
7176 [EXEC_KEYRING_SHARED] = "shared",
7177};
7178
7179DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);