]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
man: explicitly document that "reboot -f" is different from "systemctl reboot -f"
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
b1994387 44#include "bpf-lsm.h"
8dd4c05b 45#include "cap-list.h"
430f0182 46#include "capability-util.h"
fdb3deca 47#include "cgroup-setup.h"
f4351959 48#include "chase-symlinks.h"
bb0c0d6f 49#include "chown-recursive.h"
da681e1b 50#include "cpu-set-util.h"
43144be4 51#include "creds-util.h"
6a818c3c 52#include "data-fd-util.h"
f6a6225e 53#include "def.h"
686d13b9 54#include "env-file.h"
4d1a6904 55#include "env-util.h"
17df7223 56#include "errno-list.h"
8a62620e 57#include "escape.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
bb0c0d6f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
032b3afb 66#include "ioprio-util.h"
a1164ae3 67#include "label.h"
8dd4c05b
LP
68#include "log.h"
69#include "macro.h"
e8a565cb 70#include "manager.h"
2a341bb9 71#include "manager-dump.h"
0a970718 72#include "memory-util.h"
f5947a5e 73#include "missing_fs.h"
5bead76e 74#include "missing_ioprio.h"
35cd0ba5 75#include "mkdir-label.h"
21935150 76#include "mount-util.h"
bb0c0d6f 77#include "mountpoint-util.h"
8dd4c05b 78#include "namespace.h"
6bedfcbb 79#include "parse-util.h"
8dd4c05b 80#include "path-util.h"
0b452006 81#include "process-util.h"
d3dcf4e3 82#include "random-util.h"
3989bdc1 83#include "recurse-dir.h"
78f22b97 84#include "rlimit-util.h"
8dd4c05b 85#include "rm-rf.h"
349cc4a5 86#if HAVE_SECCOMP
3ffd4af2
LP
87#include "seccomp-util.h"
88#endif
07d46372 89#include "securebits-util.h"
8dd4c05b 90#include "selinux-util.h"
24882e06 91#include "signal-util.h"
8dd4c05b 92#include "smack-util.h"
57b7a260 93#include "socket-util.h"
fd63e712 94#include "special.h"
949befd3 95#include "stat-util.h"
8b43440b 96#include "string-table.h"
07630cea 97#include "string-util.h"
8dd4c05b 98#include "strv.h"
7ccbd1ae 99#include "syslog-util.h"
8dd4c05b 100#include "terminal-util.h"
bb0c0d6f 101#include "tmpfile-util.h"
566b7d23 102#include "umask-util.h"
2d3b784d 103#include "unit-serialize.h"
b1d4f8e1 104#include "user-util.h"
8dd4c05b 105#include "utmp-wtmp.h"
5cb5a6ff 106
/* Idle timeouts, expressed in microseconds (5s and 1s respectively). */
#define IDLE_TIMEOUT_USEC  (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
111
/* Rearranges the passed file descriptors so that fds[i] ends up being fd number i+3, i.e. the array is
 * packed right behind stdin/stdout/stderr. The array is modified in place. Returns 0 on success and a
 * negative errno if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! */

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, copy;

                        /* Nothing to do if the fd already sits in its final slot. */
                        if (fds[idx] == wanted)
                                continue;

                        copy = fcntl(fds[idx], F_DUPFD, wanted);
                        if (copy < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = copy;

                        /* F_DUPFD hands out the lowest free fd >= wanted. If that was not the slot we
                         * asked for, the slot is still occupied: remember the first such index and make
                         * another pass starting there. */
                        if (copy != wanted && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
151
/* Adjusts the file flags of the fds we are about to pass to a child: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is cleared on all entries (socket
 * fds followed by n_storage_fds storage fds). Returns 0 on success, or the negative error of the first
 * helper that fails. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {
                int r;

                /* O_NONBLOCK only applies to the socket activation fds. */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* These fds are meant to be inherited by our children, hence FD_CLOEXEC is always
                 * dropped. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
184
1e22b5cd 185static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
186 assert(context);
187
1e22b5cd
LP
188 if (context->stdio_as_fds)
189 return NULL;
190
80876c20
LP
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195}
196
1e22b5cd
LP
197static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
198 const char *path;
199
6ea832a2
LP
200 assert(context);
201
1e22b5cd 202 path = exec_context_tty_path(context);
6ea832a2 203
1e22b5cd
LP
204 if (context->tty_vhangup) {
205 if (p && p->stdin_fd >= 0)
206 (void) terminal_vhangup_fd(p->stdin_fd);
207 else if (path)
208 (void) terminal_vhangup(path);
209 }
6ea832a2 210
1e22b5cd
LP
211 if (context->tty_reset) {
212 if (p && p->stdin_fd >= 0)
213 (void) reset_terminal_fd(p->stdin_fd, true);
214 else if (path)
215 (void) reset_terminal(path);
216 }
217
51462135
DDM
218 if (p && p->stdin_fd >= 0)
219 (void) terminal_set_size_fd(p->stdin_fd, path, context->tty_rows, context->tty_cols);
220
1e22b5cd
LP
221 if (context->tty_vt_disallocate && path)
222 (void) vt_disallocate(path);
6ea832a2
LP
223}
224
6af760f3
LP
225static bool is_terminal_input(ExecInput i) {
226 return IN_SET(i,
227 EXEC_INPUT_TTY,
228 EXEC_INPUT_TTY_FORCE,
229 EXEC_INPUT_TTY_FAIL);
230}
231
3a1286b6 232static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
233 return IN_SET(o,
234 EXEC_OUTPUT_TTY,
6af760f3
LP
235 EXEC_OUTPUT_KMSG_AND_CONSOLE,
236 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
237}
238
aac8c0c3
LP
239static bool is_kmsg_output(ExecOutput o) {
240 return IN_SET(o,
241 EXEC_OUTPUT_KMSG,
242 EXEC_OUTPUT_KMSG_AND_CONSOLE);
243}
244
6af760f3
LP
245static bool exec_context_needs_term(const ExecContext *c) {
246 assert(c);
247
248 /* Return true if the execution context suggests we should set $TERM to something useful. */
249
250 if (is_terminal_input(c->std_input))
251 return true;
252
253 if (is_terminal_output(c->std_output))
254 return true;
255
256 if (is_terminal_output(c->std_error))
257 return true;
258
259 return !!c->tty_path;
3a1286b6
MS
260}
261
80876c20 262static int open_null_as(int flags, int nfd) {
046a82c1 263 int fd;
071830ff 264
80876c20 265 assert(nfd >= 0);
071830ff 266
613b411c
LP
267 fd = open("/dev/null", flags|O_NOCTTY);
268 if (fd < 0)
071830ff
LP
269 return -errno;
270
046a82c1 271 return move_fd(fd, nfd, false);
071830ff
LP
272}
273
91dd5f7c
LP
274static int connect_journal_socket(
275 int fd,
276 const char *log_namespace,
277 uid_t uid,
278 gid_t gid) {
279
524daa8c
ZJS
280 uid_t olduid = UID_INVALID;
281 gid_t oldgid = GID_INVALID;
91dd5f7c 282 const char *j;
524daa8c
ZJS
283 int r;
284
91dd5f7c
LP
285 j = log_namespace ?
286 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
287 "/run/systemd/journal/stdout";
91dd5f7c 288
cad93f29 289 if (gid_is_valid(gid)) {
524daa8c
ZJS
290 oldgid = getgid();
291
92a17af9 292 if (setegid(gid) < 0)
524daa8c
ZJS
293 return -errno;
294 }
295
cad93f29 296 if (uid_is_valid(uid)) {
524daa8c
ZJS
297 olduid = getuid();
298
92a17af9 299 if (seteuid(uid) < 0) {
524daa8c
ZJS
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
1861986a 305 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 306
1861986a
LP
307 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
308 an LSM interferes. */
524daa8c 309
cad93f29 310 if (uid_is_valid(uid))
524daa8c
ZJS
311 (void) seteuid(olduid);
312
313 restore_gid:
cad93f29 314 if (gid_is_valid(gid))
524daa8c
ZJS
315 (void) setegid(oldgid);
316
317 return r;
318}
319
fd1f9c89 320static int connect_logger_as(
34cf6c43 321 const Unit *unit,
fd1f9c89 322 const ExecContext *context,
af635cf3 323 const ExecParameters *params,
fd1f9c89
LP
324 ExecOutput output,
325 const char *ident,
fd1f9c89
LP
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
2ac1ff68
EV
330 _cleanup_close_ int fd = -1;
331 int r;
071830ff
LP
332
333 assert(context);
af635cf3 334 assert(params);
80876c20
LP
335 assert(output < _EXEC_OUTPUT_MAX);
336 assert(ident);
337 assert(nfd >= 0);
071830ff 338
54fe0cdb
LP
339 fd = socket(AF_UNIX, SOCK_STREAM, 0);
340 if (fd < 0)
80876c20 341 return -errno;
071830ff 342
91dd5f7c 343 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
344 if (r < 0)
345 return r;
071830ff 346
2ac1ff68 347 if (shutdown(fd, SHUT_RD) < 0)
80876c20 348 return -errno;
071830ff 349
fd1f9c89 350 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 351
2ac1ff68 352 if (dprintf(fd,
62bca2c6 353 "%s\n"
80876c20
LP
354 "%s\n"
355 "%i\n"
54fe0cdb
LP
356 "%i\n"
357 "%i\n"
358 "%i\n"
4f4a1dbf 359 "%i\n",
c867611e 360 context->syslog_identifier ?: ident,
af635cf3 361 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
362 context->syslog_priority,
363 !!context->syslog_level_prefix,
f3dc6af2 364 false,
aac8c0c3 365 is_kmsg_output(output),
2ac1ff68
EV
366 is_terminal_output(output)) < 0)
367 return -errno;
80876c20 368
2ac1ff68 369 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 370}
2ac1ff68 371
3a274a21 372static int open_terminal_as(const char *path, int flags, int nfd) {
046a82c1 373 int fd;
071830ff 374
80876c20
LP
375 assert(path);
376 assert(nfd >= 0);
fd1f9c89 377
3a274a21 378 fd = open_terminal(path, flags | O_NOCTTY);
3cc2aff1 379 if (fd < 0)
80876c20 380 return fd;
071830ff 381
046a82c1 382 return move_fd(fd, nfd, false);
80876c20 383}
071830ff 384
2038c3f5 385static int acquire_path(const char *path, int flags, mode_t mode) {
15a3e96f 386 _cleanup_close_ int fd = -1;
86fca584 387 int r;
071830ff 388
80876c20 389 assert(path);
071830ff 390
2038c3f5
LP
391 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
392 flags |= O_CREAT;
393
394 fd = open(path, flags|O_NOCTTY, mode);
395 if (fd >= 0)
15a3e96f 396 return TAKE_FD(fd);
071830ff 397
2038c3f5
LP
398 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
399 return -errno;
2038c3f5
LP
400
401 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
402
403 fd = socket(AF_UNIX, SOCK_STREAM, 0);
404 if (fd < 0)
405 return -errno;
406
1861986a
LP
407 r = connect_unix_path(fd, AT_FDCWD, path);
408 if (IN_SET(r, -ENOTSOCK, -EINVAL))
409 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
410 * wasn't an AF_UNIX socket after all */
411 return -ENXIO;
412 if (r < 0)
413 return r;
071830ff 414
2038c3f5
LP
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
86fca584 420 r = 0;
15a3e96f 421 if (r < 0)
2038c3f5 422 return -errno;
2038c3f5 423
15a3e96f 424 return TAKE_FD(fd);
80876c20 425}
071830ff 426
08f3be7a
LP
427static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
1e3ad081
LP
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
071830ff 440
03fd9c49 441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
442 return EXEC_INPUT_NULL;
443
08f3be7a
LP
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
03fd9c49 447 return std_input;
4f2d528d
LP
448}
449
7966a916 450static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 451
7966a916 452 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
453 return EXEC_OUTPUT_INHERIT;
454
7966a916 455 return output;
4f2d528d
LP
456}
457
a34ceba6
LP
458static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
52c239d7 461 int socket_fd,
2caa38e9 462 const int named_iofds[static 3]) {
a34ceba6 463
4f2d528d 464 ExecInput i;
51462135 465 int r;
4f2d528d
LP
466
467 assert(context);
a34ceba6 468 assert(params);
2caa38e9 469 assert(named_iofds);
a34ceba6
LP
470
471 if (params->stdin_fd >= 0) {
472 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
473 return -errno;
474
475 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
476 if (isatty(STDIN_FILENO)) {
477 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
478 (void) reset_terminal_fd(STDIN_FILENO, true);
51462135 479 (void) terminal_set_size_fd(STDIN_FILENO, NULL, context->tty_rows, context->tty_cols);
1fb0682e 480 }
a34ceba6
LP
481
482 return STDIN_FILENO;
483 }
4f2d528d 484
08f3be7a 485 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
486
487 switch (i) {
071830ff 488
80876c20
LP
489 case EXEC_INPUT_NULL:
490 return open_null_as(O_RDONLY, STDIN_FILENO);
491
492 case EXEC_INPUT_TTY:
493 case EXEC_INPUT_TTY_FORCE:
494 case EXEC_INPUT_TTY_FAIL: {
046a82c1 495 int fd;
071830ff 496
1e22b5cd 497 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
498 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
499 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
500 ACQUIRE_TERMINAL_WAIT,
3a43da28 501 USEC_INFINITY);
970edce6 502 if (fd < 0)
80876c20
LP
503 return fd;
504
51462135
DDM
505 r = terminal_set_size_fd(fd, exec_context_tty_path(context), context->tty_rows, context->tty_cols);
506 if (r < 0)
507 return r;
508
046a82c1 509 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
510 }
511
4f2d528d 512 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
513 assert(socket_fd >= 0);
514
7c248223 515 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 516
52c239d7 517 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
518 assert(named_iofds[STDIN_FILENO] >= 0);
519
52c239d7 520 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 521 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 522
08f3be7a
LP
523 case EXEC_INPUT_DATA: {
524 int fd;
525
526 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
527 if (fd < 0)
528 return fd;
529
530 return move_fd(fd, STDIN_FILENO, false);
531 }
532
2038c3f5
LP
533 case EXEC_INPUT_FILE: {
534 bool rw;
535 int fd;
536
537 assert(context->stdio_file[STDIN_FILENO]);
538
539 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
540 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
541
542 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
543 if (fd < 0)
544 return fd;
545
546 return move_fd(fd, STDIN_FILENO, false);
547 }
548
80876c20 549 default:
04499a70 550 assert_not_reached();
80876c20
LP
551 }
552}
553
41fc585a
LP
554static bool can_inherit_stderr_from_stdout(
555 const ExecContext *context,
556 ExecOutput o,
557 ExecOutput e) {
558
559 assert(context);
560
561 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
562 * stderr fd */
563
564 if (e == EXEC_OUTPUT_INHERIT)
565 return true;
566 if (e != o)
567 return false;
568
569 if (e == EXEC_OUTPUT_NAMED_FD)
570 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
571
8d7dab1f 572 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
573 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
574
575 return true;
576}
577
a34ceba6 578static int setup_output(
34cf6c43 579 const Unit *unit,
a34ceba6
LP
580 const ExecContext *context,
581 const ExecParameters *params,
582 int fileno,
583 int socket_fd,
2caa38e9 584 const int named_iofds[static 3],
a34ceba6 585 const char *ident,
7bce046b
LP
586 uid_t uid,
587 gid_t gid,
588 dev_t *journal_stream_dev,
589 ino_t *journal_stream_ino) {
a34ceba6 590
4f2d528d
LP
591 ExecOutput o;
592 ExecInput i;
47c1d80d 593 int r;
4f2d528d 594
f2341e0a 595 assert(unit);
80876c20 596 assert(context);
a34ceba6 597 assert(params);
80876c20 598 assert(ident);
7bce046b
LP
599 assert(journal_stream_dev);
600 assert(journal_stream_ino);
80876c20 601
a34ceba6
LP
602 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
603
604 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
605 return -errno;
606
607 return STDOUT_FILENO;
608 }
609
610 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
611 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
612 return -errno;
613
614 return STDERR_FILENO;
615 }
616
08f3be7a 617 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 618 o = fixup_output(context->std_output, socket_fd);
4f2d528d 619
eb17e935
MS
620 if (fileno == STDERR_FILENO) {
621 ExecOutput e;
622 e = fixup_output(context->std_error, socket_fd);
80876c20 623
eb17e935
MS
624 /* This expects the input and output are already set up */
625
626 /* Don't change the stderr file descriptor if we inherit all
627 * the way and are not on a tty */
628 if (e == EXEC_OUTPUT_INHERIT &&
629 o == EXEC_OUTPUT_INHERIT &&
630 i == EXEC_INPUT_NULL &&
631 !is_terminal_input(context->std_input) &&
7966a916 632 getppid() != 1)
eb17e935
MS
633 return fileno;
634
635 /* Duplicate from stdout if possible */
41fc585a 636 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 637 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 638
eb17e935 639 o = e;
80876c20 640
eb17e935 641 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
642 /* If input got downgraded, inherit the original value */
643 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 644 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 645
08f3be7a
LP
646 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
647 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 648 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 649
acb591e4
LP
650 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
651 if (getppid() != 1)
eb17e935 652 return fileno;
94f04347 653
eb17e935
MS
654 /* We need to open /dev/null here anew, to get the right access mode. */
655 return open_null_as(O_WRONLY, fileno);
071830ff 656 }
94f04347 657
eb17e935 658 switch (o) {
80876c20
LP
659
660 case EXEC_OUTPUT_NULL:
eb17e935 661 return open_null_as(O_WRONLY, fileno);
80876c20
LP
662
663 case EXEC_OUTPUT_TTY:
4f2d528d 664 if (is_terminal_input(i))
7c248223 665 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
666
667 /* We don't reset the terminal if this is just about output */
1e22b5cd 668 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 669
9a6bca7a 670 case EXEC_OUTPUT_KMSG:
28dbc1e8 671 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
672 case EXEC_OUTPUT_JOURNAL:
673 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 674 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 675 if (r < 0) {
7966a916
ZJS
676 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
677 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 678 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
679 } else {
680 struct stat st;
681
682 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
683 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
684 * services to detect whether they are connected to the journal or not.
685 *
686 * If both stdout and stderr are connected to a stream then let's make sure to store the data
687 * about STDERR as that's usually the best way to do logging. */
7bce046b 688
ab2116b1
LP
689 if (fstat(fileno, &st) >= 0 &&
690 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
691 *journal_stream_dev = st.st_dev;
692 *journal_stream_ino = st.st_ino;
693 }
47c1d80d
MS
694 }
695 return r;
4f2d528d
LP
696
697 case EXEC_OUTPUT_SOCKET:
698 assert(socket_fd >= 0);
e75a9ed1 699
7c248223 700 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 701
52c239d7 702 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
703 assert(named_iofds[fileno] >= 0);
704
52c239d7 705 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 706 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 707
566b7d23 708 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
709 case EXEC_OUTPUT_FILE_APPEND:
710 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 711 bool rw;
566b7d23 712 int fd, flags;
2038c3f5
LP
713
714 assert(context->stdio_file[fileno]);
715
716 rw = context->std_input == EXEC_INPUT_FILE &&
717 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
718
719 if (rw)
7c248223 720 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 721
566b7d23
ZD
722 flags = O_WRONLY;
723 if (o == EXEC_OUTPUT_FILE_APPEND)
724 flags |= O_APPEND;
8d7dab1f
LW
725 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
726 flags |= O_TRUNC;
566b7d23
ZD
727
728 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
729 if (fd < 0)
730 return fd;
731
566b7d23 732 return move_fd(fd, fileno, 0);
2038c3f5
LP
733 }
734
94f04347 735 default:
04499a70 736 assert_not_reached();
94f04347 737 }
071830ff
LP
738}
739
02a51aba 740static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 741 int r;
02a51aba
LP
742
743 assert(fd >= 0);
02a51aba 744
1ff74fb6 745 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
746 if (isatty(fd) < 1) {
747 if (IN_SET(errno, EINVAL, ENOTTY))
748 return 0; /* not a tty */
1ff74fb6 749
02a51aba 750 return -errno;
4b3b5bc7 751 }
02a51aba 752
4b3b5bc7 753 /* This might fail. What matters are the results. */
f2df231f 754 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
755 if (r < 0)
756 return r;
02a51aba 757
4b3b5bc7 758 return 1;
02a51aba
LP
759}
760
aedec452 761static int setup_confirm_stdio(
51462135 762 const ExecContext *context,
aedec452
LP
763 const char *vc,
764 int *ret_saved_stdin,
765 int *ret_saved_stdout) {
766
3d18b167
LP
767 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
768 int r;
80876c20 769
aedec452
LP
770 assert(ret_saved_stdin);
771 assert(ret_saved_stdout);
80876c20 772
af6da548
LP
773 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
774 if (saved_stdin < 0)
775 return -errno;
80876c20 776
af6da548 777 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
778 if (saved_stdout < 0)
779 return -errno;
80876c20 780
8854d795 781 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
782 if (fd < 0)
783 return fd;
80876c20 784
af6da548
LP
785 r = chown_terminal(fd, getuid());
786 if (r < 0)
3d18b167 787 return r;
02a51aba 788
3d18b167
LP
789 r = reset_terminal_fd(fd, true);
790 if (r < 0)
791 return r;
80876c20 792
51462135
DDM
793 r = terminal_set_size_fd(fd, vc, context->tty_rows, context->tty_cols);
794 if (r < 0)
795 return r;
796
aedec452
LP
797 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
798 TAKE_FD(fd);
2b33ab09
LP
799 if (r < 0)
800 return r;
80876c20 801
aedec452
LP
802 *ret_saved_stdin = TAKE_FD(saved_stdin);
803 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 804 return 0;
80876c20
LP
805}
806
63d77c92 807static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
808 assert(err < 0);
809
810 if (err == -ETIMEDOUT)
63d77c92 811 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
812 else {
813 errno = -err;
63d77c92 814 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
815 }
816}
817
63d77c92 818static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 819 _cleanup_close_ int fd = -1;
80876c20 820
3b20f877 821 assert(vc);
80876c20 822
7d5ceb64 823 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 824 if (fd < 0)
3b20f877 825 return;
80876c20 826
63d77c92 827 write_confirm_error_fd(err, fd, u);
af6da548 828}
80876c20 829
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved stdin/stdout back in place (where
 * valid) and closes the saved fds, resetting the caller's variables. Returns 0 on success, or the
 * negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
851
3b20f877
FB
/* Possible outcomes of ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* do not execute the command, and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* do not execute the command, and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
857
51462135 858static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 859 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 860 _cleanup_free_ char *e = NULL;
3b20f877 861 char c;
af6da548 862
3b20f877 863 /* For any internal errors, assume a positive response. */
51462135 864 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 865 if (r < 0) {
63d77c92 866 write_confirm_error(r, vc, u);
3b20f877
FB
867 return CONFIRM_EXECUTE;
868 }
af6da548 869
b0eb2944
FB
870 /* confirm_spawn might have been disabled while we were sleeping. */
871 if (manager_is_confirm_spawn_disabled(u->manager)) {
872 r = 1;
873 goto restore_stdio;
874 }
af6da548 875
2bcd3c26
FB
876 e = ellipsize(cmdline, 60, 100);
877 if (!e) {
878 log_oom();
879 r = CONFIRM_EXECUTE;
880 goto restore_stdio;
881 }
af6da548 882
d172b175 883 for (;;) {
539622bd 884 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 885 if (r < 0) {
63d77c92 886 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
887 r = CONFIRM_EXECUTE;
888 goto restore_stdio;
889 }
af6da548 890
d172b175 891 switch (c) {
b0eb2944
FB
892 case 'c':
893 printf("Resuming normal execution.\n");
894 manager_disable_confirm_spawn();
895 r = 1;
896 break;
dd6f9ac0
FB
897 case 'D':
898 unit_dump(u, stdout, " ");
899 continue; /* ask again */
d172b175
FB
900 case 'f':
901 printf("Failing execution.\n");
902 r = CONFIRM_PRETEND_FAILURE;
903 break;
904 case 'h':
b0eb2944
FB
905 printf(" c - continue, proceed without asking anymore\n"
906 " D - dump, show the state of the unit\n"
dd6f9ac0 907 " f - fail, don't execute the command and pretend it failed\n"
d172b175 908 " h - help\n"
eedf223a 909 " i - info, show a short summary of the unit\n"
56fde33a 910 " j - jobs, show jobs that are in progress\n"
d172b175
FB
911 " s - skip, don't execute the command and pretend it succeeded\n"
912 " y - yes, execute the command\n");
dd6f9ac0 913 continue; /* ask again */
eedf223a
FB
914 case 'i':
915 printf(" Description: %s\n"
916 " Unit: %s\n"
917 " Command: %s\n",
918 u->id, u->description, cmdline);
919 continue; /* ask again */
56fde33a
FB
920 case 'j':
921 manager_dump_jobs(u->manager, stdout, " ");
922 continue; /* ask again */
539622bd
FB
923 case 'n':
924 /* 'n' was removed in favor of 'f'. */
925 printf("Didn't understand 'n', did you mean 'f'?\n");
926 continue; /* ask again */
d172b175
FB
927 case 's':
928 printf("Skipping execution.\n");
929 r = CONFIRM_PRETEND_SUCCESS;
930 break;
931 case 'y':
932 r = CONFIRM_EXECUTE;
933 break;
934 default:
04499a70 935 assert_not_reached();
d172b175 936 }
3b20f877 937 break;
3b20f877 938 }
af6da548 939
3b20f877 940restore_stdio:
af6da548 941 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 942 return r;
80876c20
LP
943}
944
4d885bd3
DH
945static int get_fixed_user(const ExecContext *c, const char **user,
946 uid_t *uid, gid_t *gid,
947 const char **home, const char **shell) {
81a2b7ce 948 int r;
4d885bd3 949 const char *name;
81a2b7ce 950
4d885bd3 951 assert(c);
81a2b7ce 952
23deef88
LP
953 if (!c->user)
954 return 0;
955
4d885bd3
DH
956 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
957 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 958
23deef88 959 name = c->user;
fafff8f1 960 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
961 if (r < 0)
962 return r;
81a2b7ce 963
4d885bd3
DH
964 *user = name;
965 return 0;
966}
967
968static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
969 int r;
970 const char *name;
971
972 assert(c);
973
974 if (!c->group)
975 return 0;
976
977 name = c->group;
fafff8f1 978 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
979 if (r < 0)
980 return r;
981
982 *group = name;
983 return 0;
984}
985
cdc5d5c5
DH
986static int get_supplementary_groups(const ExecContext *c, const char *user,
987 const char *group, gid_t gid,
988 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
989 int r, k = 0;
990 int ngroups_max;
991 bool keep_groups = false;
992 gid_t *groups = NULL;
993 _cleanup_free_ gid_t *l_gids = NULL;
994
995 assert(c);
996
bbeea271
DH
997 /*
998 * If user is given, then lookup GID and supplementary groups list.
999 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1000 * here and as early as possible so we keep the list of supplementary
1001 * groups of the caller.
bbeea271
DH
1002 */
1003 if (user && gid_is_valid(gid) && gid != 0) {
1004 /* First step, initialize groups from /etc/groups */
1005 if (initgroups(user, gid) < 0)
1006 return -errno;
1007
1008 keep_groups = true;
1009 }
1010
ac6e8be6 1011 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1012 return 0;
1013
366ddd25
DH
1014 /*
1015 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1016 * be positive, otherwise fail.
1017 */
1018 errno = 0;
1019 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1020 if (ngroups_max <= 0)
1021 return errno_or_else(EOPNOTSUPP);
366ddd25 1022
4d885bd3
DH
1023 l_gids = new(gid_t, ngroups_max);
1024 if (!l_gids)
1025 return -ENOMEM;
81a2b7ce 1026
4d885bd3
DH
1027 if (keep_groups) {
1028 /*
1029 * Lookup the list of groups that the user belongs to, we
1030 * avoid NSS lookups here too for gid=0.
1031 */
1032 k = ngroups_max;
1033 if (getgrouplist(user, gid, l_gids, &k) < 0)
1034 return -EINVAL;
1035 } else
1036 k = 0;
81a2b7ce 1037
4d885bd3
DH
1038 STRV_FOREACH(i, c->supplementary_groups) {
1039 const char *g;
81a2b7ce 1040
4d885bd3
DH
1041 if (k >= ngroups_max)
1042 return -E2BIG;
81a2b7ce 1043
4d885bd3 1044 g = *i;
fafff8f1 1045 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1046 if (r < 0)
1047 return r;
81a2b7ce 1048
4d885bd3
DH
1049 k++;
1050 }
81a2b7ce 1051
4d885bd3
DH
1052 /*
1053 * Sets ngids to zero to drop all supplementary groups, happens
1054 * when we are under root and SupplementaryGroups= is empty.
1055 */
1056 if (k == 0) {
1057 *ngids = 0;
1058 return 0;
1059 }
81a2b7ce 1060
4d885bd3
DH
1061 /* Otherwise get the final list of supplementary groups */
1062 groups = memdup(l_gids, sizeof(gid_t) * k);
1063 if (!groups)
1064 return -ENOMEM;
1065
1066 *supplementary_gids = groups;
1067 *ngids = k;
1068
1069 groups = NULL;
1070
1071 return 0;
1072}
1073
/* Applies group credentials to the calling process: first installs the supplementary group list (only if
 * it is non-empty), then sets real, effective and saved GID to 'gid' (only if it is valid). Returns 0 on
 * success, a negative errno-style value on failure. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* No valid GID to switch to? Then there is nothing more to do. */
        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1092
dbdc4098
TK
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then 'bits' are set.
 * Returns 0 if the securebits already had the requested value (nothing is written in that case), 1 if
 * they were changed, and a negative errno-style value if reading or writing them failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        after = (before & ~mask) | bits;
        if (after == before)
                return 0;

        return prctl(PR_SET_SECUREBITS, after) < 0 ? -errno : 1;
}
1106
81a2b7ce 1107static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1108 assert(context);
dbdc4098 1109 int r;
81a2b7ce 1110
4d885bd3
DH
1111 if (!uid_is_valid(uid))
1112 return 0;
1113
479050b3 1114 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1115 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1116 * required, so we also need keep-caps in this case.
1117 */
81a2b7ce 1118
dbdc4098 1119 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1120
1121 /* First step: If we need to keep capabilities but
1122 * drop privileges we need to make sure we keep our
cbb21cca 1123 * caps, while we drop privileges. */
693ced48 1124 if (uid != 0) {
dbdc4098
TK
1125 /* Add KEEP_CAPS to the securebits */
1126 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1127 if (r < 0)
1128 return r;
693ced48 1129 }
81a2b7ce
LP
1130 }
1131
479050b3 1132 /* Second step: actually set the uids */
81a2b7ce
LP
1133 if (setresuid(uid, uid, uid) < 0)
1134 return -errno;
1135
1136 /* At this point we should have all necessary capabilities but
1137 are otherwise a normal user. However, the caps might got
1138 corrupted due to the setresuid() so we need clean them up
1139 later. This is done outside of this call. */
1140
1141 return 0;
1142}
1143
349cc4a5 1144#if HAVE_PAM
5b6319dc
LP
1145
1146static int null_conv(
1147 int num_msg,
1148 const struct pam_message **msg,
1149 struct pam_response **resp,
1150 void *appdata_ptr) {
1151
1152 /* We don't support conversations */
1153
1154 return PAM_CONV_ERR;
1155}
1156
cefc33ae
LP
1157#endif
1158
5b6319dc
LP
/* Opens a PAM session for the PAM service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") that tears the session down again.
 *
 * name/user   PAM service name and user name passed to pam_start(); must not be NULL.
 * uid/gid     Credentials the helper process switches to before it starts waiting.
 * tty         TTY to report to PAM; if NULL, the TTY name of stdin is used if one can be determined.
 * env         In: environment exported into the PAM handle via pam_putenv(). Out (success only):
 *             replaced by the environment list returned by pam_getenvlist().
 * fds/n_fds   File descriptors that are closed in the helper process.
 *
 * On success returns the result of strv_free_and_replace(). On failure returns a negative errno-style
 * value; PAM errors are reported as -EPERM. If compiled without PAM support this is a no-op that
 * returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child will then stay around until killed
         * via PR_SET_PDEATHSIG or systemd via the cgroup logic. It will then remove the PAM session
         * again. The parent process will exec() the actual daemon. We do things this way to ensure that
         * the main PID of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY.
                 * Let's figure out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment assembled so far into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child exists that would wait for SIGTERM, hence restore the signal mask we
                 * changed above before bailing out, so that the caller gets it back unmodified. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise
                 * only those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect
                 * sd-pam threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding
                 * dropping privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying if the wait is interrupted */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the
                 * module know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the
         * handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't
         * want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        /* Only reached in the parent: undo whatever PAM state we managed to set up */
        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1375
5d6b1584
LP
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of 'path'.
 * Only the final 8 characters of the component are used if it is longer than that; if the component is
 * empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11]; /* '(' + up to 8 characters + ')' + NUL */
        const char *base;
        size_t len;

        /* The resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty
         * in /bin/ps */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        len = strlen(base);
        if (len > 8) {
                /* The end of the process name is usually more interesting, since the first bit might
                 * just be "systemd-" */
                base += len - 8;
                len = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, base, len);
        buf[len + 1] = ')';
        buf[len + 2] = 0;

        rename_process(buf);
}
1406
469830d1
LP
1407static bool context_has_address_families(const ExecContext *c) {
1408 assert(c);
1409
6b000af4 1410 return c->address_families_allow_list ||
469830d1
LP
1411 !set_isempty(c->address_families);
1412}
1413
1414static bool context_has_syscall_filters(const ExecContext *c) {
1415 assert(c);
1416
6b000af4 1417 return c->syscall_allow_list ||
8cfa775f 1418 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1419}
1420
9df2cdd8
TM
1421static bool context_has_syscall_logs(const ExecContext *c) {
1422 assert(c);
1423
1424 return c->syscall_log_allow_list ||
1425 !hashmap_isempty(c->syscall_log);
1426}
1427
469830d1
LP
1428static bool context_has_no_new_privileges(const ExecContext *c) {
1429 assert(c);
1430
1431 if (c->no_new_privileges)
1432 return true;
1433
1434 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1435 return false;
1436
1437 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1438 return c->lock_personality ||
469830d1 1439 c->memory_deny_write_execute ||
0538d2a8 1440 c->private_devices ||
fc64760d 1441 c->protect_clock ||
0538d2a8 1442 c->protect_hostname ||
469830d1
LP
1443 c->protect_kernel_tunables ||
1444 c->protect_kernel_modules ||
84703040 1445 c->protect_kernel_logs ||
0538d2a8
YW
1446 context_has_address_families(c) ||
1447 exec_context_restrict_namespaces_set(c) ||
1448 c->restrict_realtime ||
1449 c->restrict_suid_sgid ||
78e864e5 1450 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1451 context_has_syscall_filters(c) ||
1452 context_has_syscall_logs(c);
469830d1
LP
1453}
1454
bb0c0d6f
LP
1455static bool exec_context_has_credentials(const ExecContext *context) {
1456
1457 assert(context);
1458
1459 return !hashmap_isempty(context->set_credentials) ||
43144be4 1460 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1461}
1462
349cc4a5 1463#if HAVE_SECCOMP
17df7223 1464
83f12b27 1465static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1466
1467 if (is_seccomp_available())
1468 return false;
1469
f673b62d 1470 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1471 return true;
83f12b27
FS
1472}
1473
165a31c0 1474static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1475 uint32_t negative_action, default_action, action;
165a31c0 1476 int r;
8351ceae 1477
469830d1 1478 assert(u);
c0467cf3 1479 assert(c);
8351ceae 1480
469830d1 1481 if (!context_has_syscall_filters(c))
83f12b27
FS
1482 return 0;
1483
469830d1
LP
1484 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1485 return 0;
e9642be2 1486
005bfaf1 1487 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1488
6b000af4 1489 if (c->syscall_allow_list) {
469830d1
LP
1490 default_action = negative_action;
1491 action = SCMP_ACT_ALLOW;
7c66bae2 1492 } else {
469830d1
LP
1493 default_action = SCMP_ACT_ALLOW;
1494 action = negative_action;
57183d11 1495 }
8351ceae 1496
165a31c0 1497 if (needs_ambient_hack) {
6b000af4 1498 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1499 if (r < 0)
1500 return r;
1501 }
1502
b54f36c6 1503 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1504}
1505
9df2cdd8
TM
1506static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1507#ifdef SCMP_ACT_LOG
1508 uint32_t default_action, action;
1509#endif
1510
1511 assert(u);
1512 assert(c);
1513
1514 if (!context_has_syscall_logs(c))
1515 return 0;
1516
1517#ifdef SCMP_ACT_LOG
1518 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1519 return 0;
1520
1521 if (c->syscall_log_allow_list) {
1522 /* Log nothing but the ones listed */
1523 default_action = SCMP_ACT_ALLOW;
1524 action = SCMP_ACT_LOG;
1525 } else {
1526 /* Log everything but the ones listed */
1527 default_action = SCMP_ACT_LOG;
1528 action = SCMP_ACT_ALLOW;
1529 }
1530
1531 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1532#else
1533 /* old libseccomp */
1534 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1535 return 0;
1536#endif
1537}
1538
469830d1
LP
1539static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1540 assert(u);
4298d0b5
LP
1541 assert(c);
1542
469830d1 1543 if (set_isempty(c->syscall_archs))
83f12b27
FS
1544 return 0;
1545
469830d1
LP
1546 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1547 return 0;
4298d0b5 1548
469830d1
LP
1549 return seccomp_restrict_archs(c->syscall_archs);
1550}
4298d0b5 1551
469830d1
LP
1552static int apply_address_families(const Unit* u, const ExecContext *c) {
1553 assert(u);
1554 assert(c);
4298d0b5 1555
469830d1
LP
1556 if (!context_has_address_families(c))
1557 return 0;
4298d0b5 1558
469830d1
LP
1559 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1560 return 0;
4298d0b5 1561
6b000af4 1562 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1563}
4298d0b5 1564
83f12b27 1565static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1566 assert(u);
f3e43635
TM
1567 assert(c);
1568
469830d1 1569 if (!c->memory_deny_write_execute)
83f12b27
FS
1570 return 0;
1571
469830d1
LP
1572 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1573 return 0;
f3e43635 1574
469830d1 1575 return seccomp_memory_deny_write_execute();
f3e43635
TM
1576}
1577
83f12b27 1578static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1579 assert(u);
f4170c67
LP
1580 assert(c);
1581
469830d1 1582 if (!c->restrict_realtime)
83f12b27
FS
1583 return 0;
1584
469830d1
LP
1585 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1586 return 0;
f4170c67 1587
469830d1 1588 return seccomp_restrict_realtime();
f4170c67
LP
1589}
1590
f69567cb
LP
1591static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1592 assert(u);
1593 assert(c);
1594
1595 if (!c->restrict_suid_sgid)
1596 return 0;
1597
1598 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1599 return 0;
1600
1601 return seccomp_restrict_suid_sgid();
1602}
1603
59e856c7 1604static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1605 assert(u);
59eeb84b
LP
1606 assert(c);
1607
1608 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1609 * let's protect even those systems where this is left on in the kernel. */
1610
469830d1 1611 if (!c->protect_kernel_tunables)
59eeb84b
LP
1612 return 0;
1613
469830d1
LP
1614 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1615 return 0;
59eeb84b 1616
469830d1 1617 return seccomp_protect_sysctl();
59eeb84b
LP
1618}
1619
59e856c7 1620static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1621 assert(u);
502d704e
DH
1622 assert(c);
1623
25a8d8a0 1624 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1625
469830d1
LP
1626 if (!c->protect_kernel_modules)
1627 return 0;
1628
502d704e
DH
1629 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1630 return 0;
1631
b54f36c6 1632 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1633}
1634
84703040
KK
1635static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1636 assert(u);
1637 assert(c);
1638
1639 if (!c->protect_kernel_logs)
1640 return 0;
1641
1642 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1643 return 0;
1644
1645 return seccomp_protect_syslog();
1646}
1647
daf8f72b 1648static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1649 assert(u);
1650 assert(c);
1651
1652 if (!c->protect_clock)
1653 return 0;
1654
1655 if (skip_seccomp_unavailable(u, "ProtectClock="))
1656 return 0;
1657
1658 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1659}
1660
59e856c7 1661static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1662 assert(u);
ba128bb8
LP
1663 assert(c);
1664
8f81a5f6 1665 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1666
469830d1
LP
1667 if (!c->private_devices)
1668 return 0;
1669
ba128bb8
LP
1670 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1671 return 0;
1672
b54f36c6 1673 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1674}
1675
34cf6c43 1676static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1677 assert(u);
add00535
LP
1678 assert(c);
1679
1680 if (!exec_context_restrict_namespaces_set(c))
1681 return 0;
1682
1683 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1684 return 0;
1685
1686 return seccomp_restrict_namespaces(c->restrict_namespaces);
1687}
1688
78e864e5 1689static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1690 unsigned long personality;
1691 int r;
78e864e5
TM
1692
1693 assert(u);
1694 assert(c);
1695
1696 if (!c->lock_personality)
1697 return 0;
1698
1699 if (skip_seccomp_unavailable(u, "LockPersonality="))
1700 return 0;
1701
e8132d63
LP
1702 personality = c->personality;
1703
1704 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1705 if (personality == PERSONALITY_INVALID) {
1706
1707 r = opinionated_personality(&personality);
1708 if (r < 0)
1709 return r;
1710 }
78e864e5
TM
1711
1712 return seccomp_lock_personality(personality);
1713}
1714
c0467cf3 1715#endif
8351ceae 1716
7a8288f6 1717#if HAVE_LIBBPF
7a8288f6
DM
1718static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1719 assert(u);
1720 assert(c);
1721
1722 if (!exec_context_restrict_filesystems_set(c))
1723 return 0;
1724
46004616
ZJS
1725 if (!u->manager->restrict_fs) {
1726 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1727 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1728 return 0;
46004616 1729 }
7a8288f6
DM
1730
1731 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1732}
1733#endif
1734
daf8f72b 1735static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1736 assert(u);
1737 assert(c);
1738
1739 if (!c->protect_hostname)
1740 return 0;
1741
1742 if (ns_type_supported(NAMESPACE_UTS)) {
1743 if (unshare(CLONE_NEWUTS) < 0) {
1744 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1745 *ret_exit_status = EXIT_NAMESPACE;
1746 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1747 }
1748
1749 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1750 }
1751 } else
1752 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1753
1754#if HAVE_SECCOMP
8f3e342f
ZJS
1755 int r;
1756
daf8f72b
LP
1757 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1758 return 0;
1759
1760 r = seccomp_protect_hostname();
1761 if (r < 0) {
1762 *ret_exit_status = EXIT_SECCOMP;
1763 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1764 }
1765#endif
1766
1767 return 0;
1768}
1769
3042bbeb 1770static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1771 assert(idle_pipe);
1772
54eb2300
LP
1773 idle_pipe[1] = safe_close(idle_pipe[1]);
1774 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1775
1776 if (idle_pipe[0] >= 0) {
1777 int r;
1778
1779 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1780
1781 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1782 ssize_t n;
1783
31a7eb86 1784 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1785 n = write(idle_pipe[3], "x", 1);
1786 if (n > 0)
cd972d69 1787 /* Wait for systemd to react to the signal above. */
54756dce 1788 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1789 }
1790
54eb2300 1791 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1792
1793 }
1794
54eb2300 1795 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1796}
1797
fb2042dd
YW
1798static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1799
7cae38c4 1800static int build_environment(
34cf6c43 1801 const Unit *u,
9fa95f85 1802 const ExecContext *c,
1e22b5cd 1803 const ExecParameters *p,
da6053d0 1804 size_t n_fds,
7cae38c4
LP
1805 const char *home,
1806 const char *username,
1807 const char *shell,
7bce046b
LP
1808 dev_t journal_stream_dev,
1809 ino_t journal_stream_ino,
7cae38c4
LP
1810 char ***ret) {
1811
1812 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1813 size_t n_env = 0;
7cae38c4
LP
1814 char *x;
1815
4b58153d 1816 assert(u);
7cae38c4 1817 assert(c);
7c1cb6f1 1818 assert(p);
7cae38c4
LP
1819 assert(ret);
1820
dc4e2940 1821#define N_ENV_VARS 17
8d5bb13d 1822 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1823 if (!our_env)
1824 return -ENOMEM;
1825
1826 if (n_fds > 0) {
8dd4c05b
LP
1827 _cleanup_free_ char *joined = NULL;
1828
df0ff127 1829 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1830 return -ENOMEM;
1831 our_env[n_env++] = x;
1832
da6053d0 1833 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1834 return -ENOMEM;
1835 our_env[n_env++] = x;
8dd4c05b 1836
1e22b5cd 1837 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1838 if (!joined)
1839 return -ENOMEM;
1840
605405c6 1841 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1842 if (!x)
1843 return -ENOMEM;
1844 our_env[n_env++] = x;
7cae38c4
LP
1845 }
1846
b08af3b1 1847 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1848 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1849 return -ENOMEM;
1850 our_env[n_env++] = x;
1851
1e22b5cd 1852 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1853 return -ENOMEM;
1854 our_env[n_env++] = x;
1855 }
1856
de90700f
LP
1857 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1858 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1859 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1860 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1861 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1862 if (!x)
1863 return -ENOMEM;
1864 our_env[n_env++] = x;
1865 }
1866
7cae38c4 1867 if (home) {
b910cc72 1868 x = strjoin("HOME=", home);
7cae38c4
LP
1869 if (!x)
1870 return -ENOMEM;
7bbead1d 1871
4ff361cc 1872 path_simplify(x + 5);
7cae38c4
LP
1873 our_env[n_env++] = x;
1874 }
1875
1876 if (username) {
b910cc72 1877 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1878 if (!x)
1879 return -ENOMEM;
1880 our_env[n_env++] = x;
1881
b910cc72 1882 x = strjoin("USER=", username);
7cae38c4
LP
1883 if (!x)
1884 return -ENOMEM;
1885 our_env[n_env++] = x;
1886 }
1887
1888 if (shell) {
b910cc72 1889 x = strjoin("SHELL=", shell);
7cae38c4
LP
1890 if (!x)
1891 return -ENOMEM;
7bbead1d 1892
4ff361cc 1893 path_simplify(x + 6);
7cae38c4
LP
1894 our_env[n_env++] = x;
1895 }
1896
4b58153d
LP
1897 if (!sd_id128_is_null(u->invocation_id)) {
1898 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1899 return -ENOMEM;
1900
1901 our_env[n_env++] = x;
1902 }
1903
6af760f3
LP
1904 if (exec_context_needs_term(c)) {
1905 const char *tty_path, *term = NULL;
1906
1907 tty_path = exec_context_tty_path(c);
1908
e8cf09b2
LP
1909 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1910 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1911 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1912
e8cf09b2 1913 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1914 term = getenv("TERM");
e8cf09b2 1915
6af760f3
LP
1916 if (!term)
1917 term = default_term_for_tty(tty_path);
7cae38c4 1918
b910cc72 1919 x = strjoin("TERM=", term);
7cae38c4
LP
1920 if (!x)
1921 return -ENOMEM;
1922 our_env[n_env++] = x;
1923 }
1924
7bce046b
LP
1925 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1926 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1927 return -ENOMEM;
1928
1929 our_env[n_env++] = x;
1930 }
1931
91dd5f7c
LP
1932 if (c->log_namespace) {
1933 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1934 if (!x)
1935 return -ENOMEM;
1936
1937 our_env[n_env++] = x;
1938 }
1939
5b10116e 1940 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 1941 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
1942 const char *n;
1943
1944 if (!p->prefix[t])
1945 continue;
1946
211a3d87 1947 if (c->directories[t].n_items == 0)
fb2042dd
YW
1948 continue;
1949
1950 n = exec_directory_env_name_to_string(t);
1951 if (!n)
1952 continue;
1953
211a3d87
LB
1954 for (size_t i = 0; i < c->directories[t].n_items; i++) {
1955 _cleanup_free_ char *prefixed = NULL;
fb2042dd 1956
211a3d87
LB
1957 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
1958 if (!prefixed)
1959 return -ENOMEM;
1960
1961 if (!strextend_with_separator(&joined, ":", prefixed))
1962 return -ENOMEM;
1963 }
fb2042dd
YW
1964
1965 x = strjoin(n, "=", joined);
1966 if (!x)
1967 return -ENOMEM;
1968
1969 our_env[n_env++] = x;
1970 }
1971
bb0c0d6f
LP
1972 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1973 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1974 if (!x)
1975 return -ENOMEM;
1976
1977 our_env[n_env++] = x;
1978 }
1979
dc4e2940
YW
1980 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1981 return -ENOMEM;
1982
1983 our_env[n_env++] = x;
1984
7cae38c4 1985 our_env[n_env++] = NULL;
8d5bb13d
LP
1986 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1987#undef N_ENV_VARS
7cae38c4 1988
ae2a15bc 1989 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1990
1991 return 0;
1992}
1993
b4c14404
FB
1994static int build_pass_environment(const ExecContext *c, char ***ret) {
1995 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1996 size_t n_env = 0;
b4c14404
FB
1997
1998 STRV_FOREACH(i, c->pass_environment) {
1999 _cleanup_free_ char *x = NULL;
2000 char *v;
2001
2002 v = getenv(*i);
2003 if (!v)
2004 continue;
605405c6 2005 x = strjoin(*i, "=", v);
b4c14404
FB
2006 if (!x)
2007 return -ENOMEM;
00819cc1 2008
319a4f4b 2009 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2010 return -ENOMEM;
00819cc1 2011
1cc6c93a 2012 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2013 pass_env[n_env] = NULL;
b4c14404
FB
2014 }
2015
ae2a15bc 2016 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2017
2018 return 0;
2019}
2020
5e8deb94 2021bool exec_needs_mount_namespace(
8b44a3d2
LP
2022 const ExecContext *context,
2023 const ExecParameters *params,
4657abb5 2024 const ExecRuntime *runtime) {
8b44a3d2
LP
2025
2026 assert(context);
8b44a3d2 2027
915e6d16
LP
2028 if (context->root_image)
2029 return true;
2030
2a624c36
AP
2031 if (!strv_isempty(context->read_write_paths) ||
2032 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2033 !strv_isempty(context->inaccessible_paths) ||
2034 !strv_isempty(context->exec_paths) ||
2035 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2036 return true;
2037
42b1d8e0 2038 if (context->n_bind_mounts > 0)
d2d6c096
LP
2039 return true;
2040
2abd4e38
YW
2041 if (context->n_temporary_filesystems > 0)
2042 return true;
2043
b3d13314
LB
2044 if (context->n_mount_images > 0)
2045 return true;
2046
93f59701
LB
2047 if (context->n_extension_images > 0)
2048 return true;
2049
a07b9926
LB
2050 if (!strv_isempty(context->extension_directories))
2051 return true;
2052
37ed15d7 2053 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2054 return true;
2055
2056 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2057 return true;
2058
8b44a3d2 2059 if (context->private_devices ||
228af36f 2060 context->private_mounts ||
8b44a3d2 2061 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2062 context->protect_home != PROTECT_HOME_NO ||
2063 context->protect_kernel_tunables ||
c575770b 2064 context->protect_kernel_modules ||
94a7b275 2065 context->protect_kernel_logs ||
4e399953
LP
2066 context->protect_control_groups ||
2067 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2068 context->proc_subset != PROC_SUBSET_ALL ||
2069 context->private_ipc ||
2070 context->ipc_namespace_path)
8b44a3d2
LP
2071 return true;
2072
37c56f89 2073 if (context->root_directory) {
5e98086d 2074 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2075 return true;
2076
5b10116e 2077 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2078 if (params && !params->prefix[t])
37c56f89
YW
2079 continue;
2080
211a3d87 2081 if (context->directories[t].n_items > 0)
37c56f89
YW
2082 return true;
2083 }
2084 }
5d997827 2085
42b1d8e0 2086 if (context->dynamic_user &&
211a3d87
LB
2087 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2088 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2089 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2090 return true;
2091
91dd5f7c
LP
2092 if (context->log_namespace)
2093 return true;
2094
8b44a3d2
LP
2095 return false;
2096}
2097
5749f855 2098static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2099 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2100 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2101 _cleanup_close_ int unshare_ready_fd = -1;
2102 _cleanup_(sigkill_waitp) pid_t pid = 0;
2103 uint64_t c = 1;
d251207d
LP
2104 ssize_t n;
2105 int r;
2106
5749f855
AZ
2107 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2108 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2109 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2110 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2111 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2112 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2113 * continues execution normally.
2114 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2115 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2116
5749f855
AZ
2117 /* Can only set up multiple mappings with CAP_SETUID. */
2118 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2119 r = asprintf(&uid_map,
5749f855 2120 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2121 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2122 ouid, ouid, uid, uid);
2123 else
2124 r = asprintf(&uid_map,
2125 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2126 ouid, ouid);
d251207d 2127
5749f855
AZ
2128 if (r < 0)
2129 return -ENOMEM;
2130
2131 /* Can only set up multiple mappings with CAP_SETGID. */
2132 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2133 r = asprintf(&gid_map,
5749f855 2134 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2135 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2136 ogid, ogid, gid, gid);
2137 else
2138 r = asprintf(&gid_map,
2139 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2140 ogid, ogid);
2141
2142 if (r < 0)
2143 return -ENOMEM;
d251207d
LP
2144
2145 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2146 * namespace. */
2147 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2148 if (unshare_ready_fd < 0)
2149 return -errno;
2150
2151 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2152 * failed. */
2153 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2154 return -errno;
2155
4c253ed1
LP
2156 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2157 if (r < 0)
2158 return r;
2159 if (r == 0) {
d251207d
LP
2160 _cleanup_close_ int fd = -1;
2161 const char *a;
2162 pid_t ppid;
2163
2164 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2165 * here, after the parent opened its own user namespace. */
2166
2167 ppid = getppid();
2168 errno_pipe[0] = safe_close(errno_pipe[0]);
2169
2170 /* Wait until the parent unshared the user namespace */
2171 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2172 r = -errno;
2173 goto child_fail;
2174 }
2175
2176 /* Disable the setgroups() system call in the child user namespace, for good. */
2177 a = procfs_file_alloca(ppid, "setgroups");
2178 fd = open(a, O_WRONLY|O_CLOEXEC);
2179 if (fd < 0) {
2180 if (errno != ENOENT) {
2181 r = -errno;
2182 goto child_fail;
2183 }
2184
2185 /* If the file is missing the kernel is too old, let's continue anyway. */
2186 } else {
2187 if (write(fd, "deny\n", 5) < 0) {
2188 r = -errno;
2189 goto child_fail;
2190 }
2191
2192 fd = safe_close(fd);
2193 }
2194
2195 /* First write the GID map */
2196 a = procfs_file_alloca(ppid, "gid_map");
2197 fd = open(a, O_WRONLY|O_CLOEXEC);
2198 if (fd < 0) {
2199 r = -errno;
2200 goto child_fail;
2201 }
2202 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2203 r = -errno;
2204 goto child_fail;
2205 }
2206 fd = safe_close(fd);
2207
2208 /* The write the UID map */
2209 a = procfs_file_alloca(ppid, "uid_map");
2210 fd = open(a, O_WRONLY|O_CLOEXEC);
2211 if (fd < 0) {
2212 r = -errno;
2213 goto child_fail;
2214 }
2215 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2216 r = -errno;
2217 goto child_fail;
2218 }
2219
2220 _exit(EXIT_SUCCESS);
2221
2222 child_fail:
2223 (void) write(errno_pipe[1], &r, sizeof(r));
2224 _exit(EXIT_FAILURE);
2225 }
2226
2227 errno_pipe[1] = safe_close(errno_pipe[1]);
2228
2229 if (unshare(CLONE_NEWUSER) < 0)
2230 return -errno;
2231
2232 /* Let the child know that the namespace is ready now */
2233 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2234 return -errno;
2235
2236 /* Try to read an error code from the child */
2237 n = read(errno_pipe[0], &r, sizeof(r));
2238 if (n < 0)
2239 return -errno;
2240 if (n == sizeof(r)) { /* an error code was sent to us */
2241 if (r < 0)
2242 return r;
2243 return -EIO;
2244 }
2245 if (n != 0) /* on success we should have read 0 bytes */
2246 return -EIO;
2247
8f03de53 2248 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2249 if (r < 0)
2250 return r;
2e87a1fd 2251 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2252 return -EIO;
2253
2254 return 0;
2255}
2256
494d0247
YW
2257static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2258 if (!context->dynamic_user)
2259 return false;
2260
2261 if (type == EXEC_DIRECTORY_CONFIGURATION)
2262 return false;
2263
2264 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2265 return false;
2266
2267 return true;
2268}
2269
211a3d87
LB
2270static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2271 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2272 int r;
2273
2274 assert(source);
2275
2276 src_abs = path_join(root, source);
2277 if (!src_abs)
2278 return -ENOMEM;
2279
2280 STRV_FOREACH(dst, symlinks) {
2281 _cleanup_free_ char *dst_abs = NULL;
2282
2283 dst_abs = path_join(root, *dst);
2284 if (!dst_abs)
2285 return -ENOMEM;
2286
2287 r = mkdir_parents_label(dst_abs, 0755);
2288 if (r < 0)
2289 return r;
2290
2291 r = symlink_idempotent(src_abs, dst_abs, true);
2292 if (r < 0)
2293 return r;
2294 }
2295
2296 return 0;
2297}
2298
3536f49e 2299static int setup_exec_directory(
07689d5d
LP
2300 const ExecContext *context,
2301 const ExecParameters *params,
2302 uid_t uid,
3536f49e 2303 gid_t gid,
3536f49e 2304 ExecDirectoryType type,
211a3d87 2305 bool needs_mount_namespace,
3536f49e 2306 int *exit_status) {
07689d5d 2307
72fd1768 2308 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2309 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2310 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2311 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2312 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2313 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2314 };
07689d5d
LP
2315 int r;
2316
2317 assert(context);
2318 assert(params);
72fd1768 2319 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2320 assert(exit_status);
07689d5d 2321
3536f49e
YW
2322 if (!params->prefix[type])
2323 return 0;
2324
8679efde 2325 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2326 if (!uid_is_valid(uid))
2327 uid = 0;
2328 if (!gid_is_valid(gid))
2329 gid = 0;
2330 }
2331
211a3d87 2332 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2333 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2334
211a3d87 2335 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2336 if (!p) {
2337 r = -ENOMEM;
2338 goto fail;
2339 }
07689d5d 2340
23a7448e
YW
2341 r = mkdir_parents_label(p, 0755);
2342 if (r < 0)
3536f49e 2343 goto fail;
23a7448e 2344
494d0247 2345 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2346 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2347 * case we want to avoid leaving a directory around fully accessible that is owned by
2348 * a dynamic user whose UID is later on reused. To lock this down we use the same
2349 * trick used by container managers to prohibit host users to get access to files of
2350 * the same UID in containers: we place everything inside a directory that has an
2351 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2352 * for unprivileged host code. We then use fs namespacing to make this directory
2353 * permeable for the service itself.
6c47cd7d 2354 *
3f5b1508
LP
2355 * Specifically: for a service which wants a special directory "foo/" we first create
2356 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2357 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2358 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2359 * unprivileged host users can't look into it. Inside of the namespace of the unit
2360 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2361 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2362 * for the service and making sure it only gets access to the dirs it needs but no
2363 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2364 *
3f5b1508
LP
2365 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2366 * to be owned by the service itself.
2367 *
2368 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2369 * for sharing files or sockets with other services. */
6c47cd7d 2370
4ede9802
LP
2371 pp = path_join(params->prefix[type], "private");
2372 if (!pp) {
6c47cd7d
LP
2373 r = -ENOMEM;
2374 goto fail;
2375 }
2376
2377 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2378 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2379 if (r < 0)
2380 goto fail;
2381
211a3d87 2382 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2383 r = -ENOMEM;
2384 goto fail;
2385 }
2386
2387 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2388 r = mkdir_parents_label(pp, 0755);
2389 if (r < 0)
2390 goto fail;
2391
949befd3
LP
2392 if (is_dir(p, false) > 0 &&
2393 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2394
2395 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2396 * it over. Most likely the service has been upgraded from one that didn't use
2397 * DynamicUser=1, to one that does. */
2398
cf52c45d
LP
2399 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2400 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2401 exec_directory_type_to_string(type), p, pp);
2402
949befd3
LP
2403 if (rename(p, pp) < 0) {
2404 r = -errno;
2405 goto fail;
2406 }
2407 } else {
2408 /* Otherwise, create the actual directory for the service */
2409
2410 r = mkdir_label(pp, context->directories[type].mode);
2411 if (r < 0 && r != -EEXIST)
2412 goto fail;
2413 }
6c47cd7d 2414
df61e79a
LB
2415 /* And link it up from the original place. Note that if a mount namespace is going to be
2416 * used, then this symlink remains on the host, and a new one for the child namespace will
2417 * be created later. */
6c9c51e5 2418 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2419 if (r < 0)
2420 goto fail;
2421
6c47cd7d 2422 } else {
5c6d40d1
LP
2423 _cleanup_free_ char *target = NULL;
2424
2425 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2426 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2427 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2428
2429 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2430 * by DynamicUser=1 (see above)?
2431 *
2432 * We do this for all directory types except for ConfigurationDirectory=,
2433 * since they all support the private/ symlink logic at least in some
2434 * configurations, see above. */
5c6d40d1 2435
578dc69f
YW
2436 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2437 if (r < 0)
2438 goto fail;
2439
211a3d87 2440 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2441 if (!q) {
2442 r = -ENOMEM;
2443 goto fail;
2444 }
2445
578dc69f
YW
2446 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2447 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2448 if (r < 0)
2449 goto fail;
2450
2451 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2452
2453 /* Hmm, apparently DynamicUser= was once turned on for this service,
2454 * but is no longer. Let's move the directory back up. */
2455
cf52c45d
LP
2456 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2457 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2458 exec_directory_type_to_string(type), q, p);
2459
5c6d40d1
LP
2460 if (unlink(p) < 0) {
2461 r = -errno;
2462 goto fail;
2463 }
2464
2465 if (rename(q, p) < 0) {
2466 r = -errno;
2467 goto fail;
2468 }
2469 }
2470 }
2471
6c47cd7d 2472 r = mkdir_label(p, context->directories[type].mode);
d484580c 2473 if (r < 0) {
d484580c
LP
2474 if (r != -EEXIST)
2475 goto fail;
2476
206e9864
LP
2477 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2478 struct stat st;
2479
2480 /* Don't change the owner/access mode of the configuration directory,
2481 * as in the common case it is not written to by a service, and shall
2482 * not be writable. */
2483
2484 if (stat(p, &st) < 0) {
2485 r = -errno;
2486 goto fail;
2487 }
2488
2489 /* Still complain if the access mode doesn't match */
2490 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2491 log_warning("%s \'%s\' already exists but the mode is different. "
2492 "(File system: %o %sMode: %o)",
211a3d87 2493 exec_directory_type_to_string(type), context->directories[type].items[i].path,
206e9864
LP
2494 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2495
6cff72eb 2496 continue;
206e9864 2497 }
6cff72eb 2498 }
a1164ae3 2499 }
07689d5d 2500
206e9864 2501 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2502 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2503 * current UID/GID ownership.) */
2504 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2505 if (r < 0)
2506 goto fail;
c71b2eb7 2507
607b358e
LP
2508 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2509 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2510 * assignments to exist. */
607b358e 2511 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2512 if (r < 0)
3536f49e 2513 goto fail;
07689d5d
LP
2514 }
2515
211a3d87
LB
2516 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2517 * they are set up later, to allow configuring empty var/run/etc. */
2518 if (!needs_mount_namespace)
2519 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2520 r = create_many_symlinks(params->prefix[type],
2521 context->directories[type].items[i].path,
2522 context->directories[type].items[i].symlinks);
2523 if (r < 0)
2524 goto fail;
2525 }
2526
07689d5d 2527 return 0;
3536f49e
YW
2528
2529fail:
2530 *exit_status = exit_status_table[type];
3536f49e 2531 return r;
07689d5d
LP
2532}
2533
bb0c0d6f
LP
2534static int write_credential(
2535 int dfd,
2536 const char *id,
2537 const void *data,
2538 size_t size,
2539 uid_t uid,
2540 bool ownership_ok) {
2541
2542 _cleanup_(unlink_and_freep) char *tmp = NULL;
2543 _cleanup_close_ int fd = -1;
2544 int r;
2545
2546 r = tempfn_random_child("", "cred", &tmp);
2547 if (r < 0)
2548 return r;
2549
2550 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2551 if (fd < 0) {
2552 tmp = mfree(tmp);
2553 return -errno;
2554 }
2555
43144be4 2556 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2557 if (r < 0)
2558 return r;
2559
2560 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2561 return -errno;
2562
2563 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2564 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2565 if (r < 0) {
2566 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2567 return r;
2568
2569 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2570 * to express: that the user gets read access and nothing
2571 * else. But if the backing fs can't support that (e.g. ramfs)
2572 * then we can use file ownership instead. But that's only safe if
2573 * we can then re-mount the whole thing read-only, so that the
2574 * user can no longer chmod() the file to gain write access. */
2575 return r;
2576
f5fbe71d 2577 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2578 return -errno;
2579 }
2580 }
2581
2582 if (renameat(dfd, tmp, dfd, id) < 0)
2583 return -errno;
2584
2585 tmp = mfree(tmp);
2586 return 0;
2587}
2588
2ad591a3
LP
2589static char **credential_search_path(
2590 const ExecParameters *params,
2591 bool encrypted) {
2592
2593 _cleanup_strv_free_ char **l = NULL;
2594
2595 assert(params);
2596
2597 /* Assemble a search path to find credentials in. We'll look in /etc/credstore/ (and similar
2598 * directories in /usr/lib/ + /run/) for all types of credentials. If we are looking for encrypted
2599 * credentials, also look in /etc/credstore.encrypted/ (and similar dirs). */
2600
2601 if (encrypted) {
2602 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
2603 return NULL;
2604
2605 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
2606 return NULL;
2607 }
2608
2609 if (params->received_credentials_directory)
2610 if (strv_extend(&l, params->received_credentials_directory) < 0)
2611 return NULL;
2612
2613 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
2614 return NULL;
2615
2616 if (DEBUG_LOGGING) {
2617 _cleanup_free_ char *t = strv_join(l, ":");
2618
2619 log_debug("Credential search path is: %s", t);
2620 }
2621
2622 return TAKE_PTR(l);
2623}
2624
3989bdc1
AB
2625static int load_credential(
2626 const ExecContext *context,
2627 const ExecParameters *params,
10b44e1d
LP
2628 const char *id,
2629 const char *path,
2630 bool encrypted,
3989bdc1
AB
2631 const char *unit,
2632 int read_dfd,
2633 int write_dfd,
2634 uid_t uid,
2635 bool ownership_ok,
2636 uint64_t *left) {
2637
3989bdc1 2638 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
2ad591a3 2639 _cleanup_strv_free_ char **search_path = NULL;
3989bdc1 2640 _cleanup_(erase_and_freep) char *data = NULL;
2ad591a3
LP
2641 _cleanup_free_ char *bindname = NULL;
2642 const char *source = NULL;
3989bdc1 2643 bool missing_ok = true;
2ad591a3 2644 size_t size, add, maxsz;
3989bdc1
AB
2645 int r;
2646
10b44e1d
LP
2647 assert(context);
2648 assert(params);
2649 assert(id);
2650 assert(path);
2651 assert(unit);
2652 assert(write_dfd >= 0);
2653 assert(left);
2654
2ad591a3
LP
2655 if (read_dfd >= 0) {
2656 /* If a directory fd is specified, then read the file directly from that dir. In this case we
2657 * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
2658 * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
2659 * open it. */
2660
2661 if (!filename_is_valid(path)) /* safety check */
2662 return -EINVAL;
2663
2664 missing_ok = true;
10b44e1d 2665 source = path;
2ad591a3
LP
2666
2667 } else if (path_is_absolute(path)) {
2668 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
2669 * sockets */
2670
2671 if (!path_is_valid(path)) /* safety check */
2672 return -EINVAL;
2673
3989bdc1
AB
2674 flags |= READ_FULL_FILE_CONNECT_SOCKET;
2675
2676 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2677 * via the source socket address in case we read off an AF_UNIX socket. */
10b44e1d 2678 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
3989bdc1
AB
2679 return -ENOMEM;
2680
2681 missing_ok = false;
2ad591a3 2682 source = path;
3989bdc1 2683
2ad591a3
LP
2684 } else if (credential_name_valid(path)) {
2685 /* If this is a relative path, take it as credential name relative to the credentials
2686 * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
2687 * are operating on a credential store, i.e. this is guaranteed to be regular files. */
2688
2689 search_path = credential_search_path(params, encrypted);
2690 if (!search_path)
3989bdc1
AB
2691 return -ENOMEM;
2692
2ad591a3 2693 missing_ok = true;
3989bdc1
AB
2694 } else
2695 source = NULL;
2696
2ad591a3
LP
2697 if (encrypted)
2698 flags |= READ_FULL_FILE_UNBASE64;
2699
2700 maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
2701
2702 if (search_path) {
2703 STRV_FOREACH(d, search_path) {
2704 _cleanup_free_ char *j = NULL;
2705
2706 j = path_join(*d, path);
2707 if (!j)
2708 return -ENOMEM;
2709
2710 r = read_full_file_full(
2711 AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
2712 UINT64_MAX,
2713 maxsz,
2714 flags,
2715 NULL,
2716 &data, &size);
2717 if (r != -ENOENT)
2718 break;
2719 }
2720 } else if (source)
3989bdc1
AB
2721 r = read_full_file_full(
2722 read_dfd, source,
2723 UINT64_MAX,
2ad591a3
LP
2724 maxsz,
2725 flags,
3989bdc1
AB
2726 bindname,
2727 &data, &size);
2728 else
2729 r = -ENOENT;
2730
10b44e1d 2731 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
3989bdc1
AB
2732 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2733 * will get clear errors if we don't pass such a missing credential on as they
2734 * themselves will get ENOENT when trying to read them, which should not be much
2735 * worse than when we handle the error here and make it fatal.
2736 *
2737 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2738 * we are fine, too. */
10b44e1d 2739 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
3989bdc1
AB
2740 return 0;
2741 }
2742 if (r < 0)
10b44e1d 2743 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
3989bdc1 2744
10b44e1d 2745 if (encrypted) {
3989bdc1
AB
2746 _cleanup_free_ void *plaintext = NULL;
2747 size_t plaintext_size = 0;
2748
6a0779cb 2749 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size, &plaintext, &plaintext_size);
3989bdc1
AB
2750 if (r < 0)
2751 return r;
2752
2753 free_and_replace(data, plaintext);
2754 size = plaintext_size;
2755 }
2756
10b44e1d 2757 add = strlen(id) + size;
3989bdc1
AB
2758 if (add > *left)
2759 return -E2BIG;
2760
10b44e1d 2761 r = write_credential(write_dfd, id, data, size, uid, ownership_ok);
3989bdc1 2762 if (r < 0)
94602bff 2763 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
3989bdc1
AB
2764
2765 *left -= add;
2766 return 0;
2767}
2768
2769struct load_cred_args {
3989bdc1
AB
2770 const ExecContext *context;
2771 const ExecParameters *params;
461345a1 2772 bool encrypted;
3989bdc1
AB
2773 const char *unit;
2774 int dfd;
2775 uid_t uid;
2776 bool ownership_ok;
2777 uint64_t *left;
2778};
2779
2780static int load_cred_recurse_dir_cb(
2781 RecurseDirEvent event,
2782 const char *path,
2783 int dir_fd,
2784 int inode_fd,
2785 const struct dirent *de,
2786 const struct statx *sx,
2787 void *userdata) {
2788
6394e5cd 2789 struct load_cred_args *args = ASSERT_PTR(userdata);
11348386 2790 _cleanup_free_ char *sub_id = NULL;
3989bdc1
AB
2791 int r;
2792
2793 if (event != RECURSE_DIR_ENTRY)
2794 return RECURSE_DIR_CONTINUE;
2795
2796 if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
2797 return RECURSE_DIR_CONTINUE;
2798
11348386 2799 sub_id = strreplace(path, "/", "_");
3989bdc1
AB
2800 if (!sub_id)
2801 return -ENOMEM;
2802
2803 if (!credential_name_valid(sub_id))
1451435c 2804 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
3989bdc1 2805
5bec447a 2806 if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
3989bdc1
AB
2807 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
2808 return RECURSE_DIR_CONTINUE;
2809 }
5bec447a
LP
2810 if (errno != ENOENT)
2811 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
3989bdc1 2812
10b44e1d
LP
2813 r = load_credential(
2814 args->context,
2815 args->params,
2816 sub_id,
2817 de->d_name,
461345a1 2818 args->encrypted,
10b44e1d
LP
2819 args->unit,
2820 dir_fd,
2821 args->dfd,
2822 args->uid,
2823 args->ownership_ok,
2824 args->left);
3989bdc1
AB
2825 if (r < 0)
2826 return r;
2827
2828 return RECURSE_DIR_CONTINUE;
2829}
2830
bb0c0d6f
LP
2831static int acquire_credentials(
2832 const ExecContext *context,
2833 const ExecParameters *params,
d3dcf4e3 2834 const char *unit,
bb0c0d6f
LP
2835 const char *p,
2836 uid_t uid,
2837 bool ownership_ok) {
2838
43144be4 2839 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2840 _cleanup_close_ int dfd = -1;
43144be4 2841 ExecLoadCredential *lc;
bb0c0d6f 2842 ExecSetCredential *sc;
bb0c0d6f
LP
2843 int r;
2844
2845 assert(context);
2846 assert(p);
2847
2848 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2849 if (dfd < 0)
2850 return -errno;
2851
43144be4
LP
2852 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2853 HASHMAP_FOREACH(lc, context->load_credentials) {
3989bdc1 2854 _cleanup_close_ int sub_fd = -1;
d3dcf4e3 2855
f344f7fd
LP
2856 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
2857 * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
2858 * a regular file. Finally, if it's a relative path we will use it as a credential name to
2859 * propagate a credential passed to us from further up. */
43144be4 2860
f344f7fd
LP
2861 if (path_is_absolute(lc->path)) {
2862 sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
1d68a2e1
LP
2863 if (sub_fd < 0 && !IN_SET(errno,
2864 ENOTDIR, /* Not a directory */
2865 ENOENT)) /* Doesn't exist? */
2866 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
f344f7fd 2867 }
43144be4 2868
61c5a49e 2869 if (sub_fd < 0)
f344f7fd 2870 /* Regular file (incl. a credential passed in from higher up) */
10b44e1d
LP
2871 r = load_credential(
2872 context,
2873 params,
2874 lc->id,
2875 lc->path,
2876 lc->encrypted,
2877 unit,
2878 -1,
2879 dfd,
2880 uid,
2881 ownership_ok,
2882 &left);
61c5a49e 2883 else
10b44e1d 2884 /* Directory */
3989bdc1
AB
2885 r = recurse_dir(
2886 sub_fd,
11348386 2887 /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
3989bdc1
AB
2888 /* statx_mask= */ 0,
2889 /* n_depth_max= */ UINT_MAX,
9883cbb2 2890 RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
3989bdc1
AB
2891 load_cred_recurse_dir_cb,
2892 &(struct load_cred_args) {
3989bdc1
AB
2893 .context = context,
2894 .params = params,
461345a1 2895 .encrypted = lc->encrypted,
3989bdc1
AB
2896 .unit = unit,
2897 .dfd = dfd,
2898 .uid = uid,
2899 .ownership_ok = ownership_ok,
2900 .left = &left,
2901 });
61c5a49e
LP
2902 if (r < 0)
2903 return r;
bb0c0d6f
LP
2904 }
2905
9e6e9d61
LP
2906 /* Second, we add in literally specified credentials. If the credentials already exist, we'll not add
2907 * them, so that they can act as a "default" if the same credential is specified multiple times. */
43144be4
LP
2908 HASHMAP_FOREACH(sc, context->set_credentials) {
2909 _cleanup_(erase_and_freep) void *plaintext = NULL;
2910 const char *data;
2911 size_t size, add;
2912
9e6e9d61
LP
2913 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
2914 * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
2915 * slow and involved, hence it's nice to be able to skip that if the credential already
2916 * exists anyway. */
43144be4
LP
2917 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2918 continue;
2919 if (errno != ENOENT)
2920 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2921
2922 if (sc->encrypted) {
6a0779cb 2923 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
43144be4
LP
2924 if (r < 0)
2925 return r;
2926
2927 data = plaintext;
2928 } else {
2929 data = sc->data;
2930 size = sc->size;
2931 }
2932
2933 add = strlen(sc->id) + size;
2934 if (add > left)
2935 return -E2BIG;
2936
2937 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2938 if (r < 0)
2939 return r;
2940
43144be4
LP
2941 left -= add;
2942 }
2943
bb0c0d6f
LP
2944 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2945 return -errno;
2946
2947 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2948 * accessible */
2949
2950 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2951 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2952 if (r < 0) {
2953 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2954 return r;
2955
2956 if (!ownership_ok)
2957 return r;
2958
f5fbe71d 2959 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2960 return -errno;
2961 }
2962 }
2963
2964 return 0;
2965}
2966
2967static int setup_credentials_internal(
2968 const ExecContext *context,
2969 const ExecParameters *params,
d3dcf4e3 2970 const char *unit,
bb0c0d6f
LP
2971 const char *final, /* This is where the credential store shall eventually end up at */
2972 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2973 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2974 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2975 uid_t uid) {
2976
2977 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2978 * if we mounted something; false if we definitely can't mount anything */
2979 bool final_mounted;
2980 const char *where;
2981
2982 assert(context);
2983 assert(final);
2984 assert(workspace);
2985
2986 if (reuse_workspace) {
2987 r = path_is_mount_point(workspace, NULL, 0);
2988 if (r < 0)
2989 return r;
2990 if (r > 0)
2991 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2992 else
2993 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2994 } else
2995 workspace_mounted = -1; /* ditto */
2996
2997 r = path_is_mount_point(final, NULL, 0);
2998 if (r < 0)
2999 return r;
3000 if (r > 0) {
3001 /* If the final place already has something mounted, we use that. If the workspace also has
3002 * something mounted we assume it's actually the same mount (but with MS_RDONLY
3003 * different). */
3004 final_mounted = true;
3005
3006 if (workspace_mounted < 0) {
3007 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
3008 * the final version to the workspace, and make it writable, so that we can make
3009 * changes */
3010
21935150
LP
3011 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3012 if (r < 0)
3013 return r;
bb0c0d6f 3014
21935150
LP
3015 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3016 if (r < 0)
3017 return r;
bb0c0d6f
LP
3018
3019 workspace_mounted = true;
3020 }
3021 } else
3022 final_mounted = false;
3023
3024 if (workspace_mounted < 0) {
3025 /* Nothing is mounted on the workspace yet, let's try to mount something now */
3026 for (int try = 0;; try++) {
3027
3028 if (try == 0) {
3029 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
3030 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
3031 if (r >= 0) {
bb0c0d6f
LP
3032 workspace_mounted = true;
3033 break;
3034 }
3035
3036 } else if (try == 1) {
3037 _cleanup_free_ char *opts = NULL;
3038
43144be4 3039 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
3040 return -ENOMEM;
3041
3042 /* Fall back to "tmpfs" otherwise */
21935150
LP
3043 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
3044 if (r >= 0) {
bb0c0d6f
LP
3045 workspace_mounted = true;
3046 break;
3047 }
3048
3049 } else {
3050 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
3051 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
3052 if (r < 0) {
3053 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
3054 return r;
bb0c0d6f
LP
3055
3056 if (must_mount) /* If we it's not OK to use the plain directory
3057 * fallback, propagate all errors too */
21935150 3058 return r;
bb0c0d6f
LP
3059
3060 /* If we lack privileges to bind mount stuff, then let's gracefully
3061 * proceed for compat with container envs, and just use the final dir
3062 * as is. */
3063
3064 workspace_mounted = false;
3065 break;
3066 }
3067
3068 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
3069 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3070 if (r < 0)
3071 return r;
bb0c0d6f
LP
3072
3073 workspace_mounted = true;
3074 break;
3075 }
3076 }
3077 }
3078
3079 assert(!must_mount || workspace_mounted > 0);
3080 where = workspace_mounted ? workspace : final;
3081
03bc11d1 3082 (void) label_fix_full(AT_FDCWD, where, final, 0);
e3a0a862 3083
d3dcf4e3 3084 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
3085 if (r < 0)
3086 return r;
3087
3088 if (workspace_mounted) {
3089 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
3090 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
3091 if (r < 0)
3092 return r;
bb0c0d6f
LP
3093
3094 /* And mount it to the final place, read-only */
21935150
LP
3095 if (final_mounted)
3096 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
3097 else
3098 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
3099 if (r < 0)
3100 return r;
bb0c0d6f
LP
3101 } else {
3102 _cleanup_free_ char *parent = NULL;
3103
3104 /* If we do not have our own mount put used the plain directory fallback, then we need to
3105 * open access to the top-level credential directory and the per-service directory now */
3106
45519d13
LP
3107 r = path_extract_directory(final, &parent);
3108 if (r < 0)
3109 return r;
bb0c0d6f
LP
3110 if (chmod(parent, 0755) < 0)
3111 return -errno;
3112 }
3113
3114 return 0;
3115}
3116
3117static int setup_credentials(
3118 const ExecContext *context,
3119 const ExecParameters *params,
3120 const char *unit,
3121 uid_t uid) {
3122
3123 _cleanup_free_ char *p = NULL, *q = NULL;
bb0c0d6f
LP
3124 int r;
3125
3126 assert(context);
3127 assert(params);
3128
3129 if (!exec_context_has_credentials(context))
3130 return 0;
3131
3132 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
3133 return -EINVAL;
3134
3135 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
3136 * and the subdir we mount over with a read-only file system readable by the service's user */
3137 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
3138 if (!q)
3139 return -ENOMEM;
3140
3141 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
3142 if (r < 0 && r != -EEXIST)
3143 return r;
3144
3145 p = path_join(q, unit);
3146 if (!p)
3147 return -ENOMEM;
3148
3149 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
3150 if (r < 0 && r != -EEXIST)
3151 return r;
3152
3153 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
3154 if (r < 0) {
3155 _cleanup_free_ char *t = NULL, *u = NULL;
3156
3157 /* If this is not a privilege or support issue then propagate the error */
3158 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
3159 return r;
3160
3161 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
3162 * it into place, so that users can't access half-initialized credential stores. */
3163 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
3164 if (!t)
3165 return -ENOMEM;
3166
3167 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
3168 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
3169 * after it is fully set up */
3170 u = path_join(t, unit);
3171 if (!u)
3172 return -ENOMEM;
3173
3174 FOREACH_STRING(i, t, u) {
3175 r = mkdir_label(i, 0700);
3176 if (r < 0 && r != -EEXIST)
3177 return r;
3178 }
3179
3180 r = setup_credentials_internal(
3181 context,
3182 params,
d3dcf4e3 3183 unit,
bb0c0d6f
LP
3184 p, /* final mount point */
3185 u, /* temporary workspace to overmount */
3186 true, /* reuse the workspace if it is already a mount */
3187 false, /* it's OK to fall back to a plain directory if we can't mount anything */
3188 uid);
3189
3190 (void) rmdir(u); /* remove the workspace again if we can. */
3191
3192 if (r < 0)
3193 return r;
3194
3195 } else if (r == 0) {
3196
3197 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
3198 * we can use the same directory for all cases, after turning off propagation. Question
3199 * though is: where do we turn off propagation exactly, and where do we place the workspace
3200 * directory? We need some place that is guaranteed to be a mount point in the host, and
3201 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
3202 * since we ultimately want to move the resulting file system there, i.e. we need propagation
3203 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
3204 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
3205 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
3206 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
3207 * propagation on the former, and then overmount the latter.
3208 *
3209 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
3210 * for this purpose, but there are few other candidates that work equally well for us, and
3211 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 3212 * that no one else sees this should be OK to do. */
bb0c0d6f 3213
21935150
LP
3214 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
3215 if (r < 0)
bb0c0d6f
LP
3216 goto child_fail;
3217
3218 r = setup_credentials_internal(
3219 context,
3220 params,
d3dcf4e3 3221 unit,
bb0c0d6f
LP
3222 p, /* final mount point */
3223 "/dev/shm", /* temporary workspace to overmount */
3224 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
3225 true, /* insist that something is mounted, do not allow fallback to plain directory */
3226 uid);
3227 if (r < 0)
3228 goto child_fail;
3229
3230 _exit(EXIT_SUCCESS);
3231
3232 child_fail:
3233 _exit(EXIT_FAILURE);
3234 }
3235
3236 return 0;
3237}
3238
#if ENABLE_SMACK
/* Applies a SMACK process label to the calling process (PID 0 is passed to mac_smack_apply_pid()).
 *
 * If the context carries an explicit process label, that one is applied. Otherwise, if the manager has
 * a default process label, the SMACK exec label read from executable_fd is applied if there is one,
 * with the manager's default as fallback. If neither label is configured nothing is done.
 *
 * Returns 0 on success, negative errno-style code on failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {
        int r;

        assert(manager); /* dereferenced below, just like context */
        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;
        } else if (manager->default_smack_process_label) {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing attribute or lack of support is not an error here: exec_label then stays
                 * NULL and the manager default is used instead. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : manager->default_smack_process_label);
                if (r < 0)
                        return r;
        }

        return 0;
}
#endif
cefc33ae 3268
6c47cd7d
LP
3269static int compile_bind_mounts(
3270 const ExecContext *context,
3271 const ExecParameters *params,
3272 BindMount **ret_bind_mounts,
da6053d0 3273 size_t *ret_n_bind_mounts,
6c47cd7d
LP
3274 char ***ret_empty_directories) {
3275
3276 _cleanup_strv_free_ char **empty_directories = NULL;
3277 BindMount *bind_mounts;
5b10116e 3278 size_t n, h = 0;
6c47cd7d
LP
3279 int r;
3280
3281 assert(context);
3282 assert(params);
3283 assert(ret_bind_mounts);
3284 assert(ret_n_bind_mounts);
3285 assert(ret_empty_directories);
3286
3287 n = context->n_bind_mounts;
5b10116e 3288 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3289 if (!params->prefix[t])
3290 continue;
3291
211a3d87 3292 n += context->directories[t].n_items;
6c47cd7d
LP
3293 }
3294
3295 if (n <= 0) {
3296 *ret_bind_mounts = NULL;
3297 *ret_n_bind_mounts = 0;
3298 *ret_empty_directories = NULL;
3299 return 0;
3300 }
3301
3302 bind_mounts = new(BindMount, n);
3303 if (!bind_mounts)
3304 return -ENOMEM;
3305
5b10116e 3306 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3307 BindMount *item = context->bind_mounts + i;
3308 char *s, *d;
3309
3310 s = strdup(item->source);
3311 if (!s) {
3312 r = -ENOMEM;
3313 goto finish;
3314 }
3315
3316 d = strdup(item->destination);
3317 if (!d) {
3318 free(s);
3319 r = -ENOMEM;
3320 goto finish;
3321 }
3322
3323 bind_mounts[h++] = (BindMount) {
3324 .source = s,
3325 .destination = d,
3326 .read_only = item->read_only,
3327 .recursive = item->recursive,
3328 .ignore_enoent = item->ignore_enoent,
3329 };
3330 }
3331
5b10116e 3332 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3333 if (!params->prefix[t])
3334 continue;
3335
211a3d87 3336 if (context->directories[t].n_items == 0)
6c47cd7d
LP
3337 continue;
3338
494d0247 3339 if (exec_directory_is_private(context, t) &&
74e12520 3340 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3341 char *private_root;
3342
3343 /* So this is for a dynamic user, and we need to make sure the process can access its own
3344 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3345 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3346
657ee2d8 3347 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3348 if (!private_root) {
3349 r = -ENOMEM;
3350 goto finish;
3351 }
3352
3353 r = strv_consume(&empty_directories, private_root);
a635a7ae 3354 if (r < 0)
6c47cd7d 3355 goto finish;
6c47cd7d
LP
3356 }
3357
211a3d87 3358 for (size_t i = 0; i < context->directories[t].n_items; i++) {
6c47cd7d
LP
3359 char *s, *d;
3360
494d0247 3361 if (exec_directory_is_private(context, t))
211a3d87 3362 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 3363 else
211a3d87 3364 s = path_join(params->prefix[t], context->directories[t].items[i].path);
6c47cd7d
LP
3365 if (!s) {
3366 r = -ENOMEM;
3367 goto finish;
3368 }
3369
494d0247 3370 if (exec_directory_is_private(context, t) &&
74e12520 3371 exec_context_with_rootfs(context))
5609f688
YW
3372 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3373 * directory is not created on the root directory. So, let's bind-mount the directory
3374 * on the 'non-private' place. */
211a3d87 3375 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
3376 else
3377 d = strdup(s);
6c47cd7d
LP
3378 if (!d) {
3379 free(s);
3380 r = -ENOMEM;
3381 goto finish;
3382 }
3383
3384 bind_mounts[h++] = (BindMount) {
3385 .source = s,
3386 .destination = d,
3387 .read_only = false,
9ce4e4b0 3388 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3389 .recursive = true,
3390 .ignore_enoent = false,
3391 };
3392 }
3393 }
3394
3395 assert(h == n);
3396
3397 *ret_bind_mounts = bind_mounts;
3398 *ret_n_bind_mounts = n;
ae2a15bc 3399 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3400
3401 return (int) n;
3402
3403finish:
3404 bind_mount_free_many(bind_mounts, h);
3405 return r;
3406}
3407
df61e79a
LB
3408/* ret_symlinks will contain a list of pairs src:dest that describes
3409 * the symlinks to create later on. For example, the symlinks needed
3410 * to safely give private directories to DynamicUser=1 users. */
3411static int compile_symlinks(
3412 const ExecContext *context,
3413 const ExecParameters *params,
3414 char ***ret_symlinks) {
3415
3416 _cleanup_strv_free_ char **symlinks = NULL;
3417 int r;
3418
3419 assert(context);
3420 assert(params);
3421 assert(ret_symlinks);
3422
3423 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
3424 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
3425 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 3426
211a3d87
LB
3427 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
3428 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 3429
211a3d87
LB
3430 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
3431 dst_abs = path_join(params->prefix[dt], *symlink);
3432 if (!src_abs || !dst_abs)
3433 return -ENOMEM;
df61e79a 3434
211a3d87
LB
3435 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
3436 if (r < 0)
3437 return r;
3438 }
3439
3fa80e5e 3440 if (!exec_directory_is_private(context, dt) || exec_context_with_rootfs(context))
211a3d87
LB
3441 continue;
3442
3443 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
3444 if (!private_path)
3445 return -ENOMEM;
3446
211a3d87 3447 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
3448 if (!path)
3449 return -ENOMEM;
3450
3451 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
3452 if (r < 0)
3453 return r;
3454 }
3455 }
3456
3457 *ret_symlinks = TAKE_PTR(symlinks);
3458
3459 return 0;
3460}
3461
4e677599
LP
3462static bool insist_on_sandboxing(
3463 const ExecContext *context,
3464 const char *root_dir,
3465 const char *root_image,
3466 const BindMount *bind_mounts,
3467 size_t n_bind_mounts) {
3468
4e677599
LP
3469 assert(context);
3470 assert(n_bind_mounts == 0 || bind_mounts);
3471
3472 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3473 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3474 * rearrange stuff in a way we cannot ignore gracefully. */
3475
3476 if (context->n_temporary_filesystems > 0)
3477 return true;
3478
3479 if (root_dir || root_image)
3480 return true;
3481
b3d13314
LB
3482 if (context->n_mount_images > 0)
3483 return true;
3484
4e677599
LP
3485 if (context->dynamic_user)
3486 return true;
3487
4355c04f
LB
3488 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3489 return true;
3490
4e677599
LP
3491 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3492 * essential. */
5b10116e 3493 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3494 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3495 return true;
3496
91dd5f7c
LP
3497 if (context->log_namespace)
3498 return true;
3499
4e677599
LP
3500 return false;
3501}
3502
6818c54c 3503static int apply_mount_namespace(
34cf6c43 3504 const Unit *u,
9f71ba8d 3505 ExecCommandFlags command_flags,
6818c54c
LP
3506 const ExecContext *context,
3507 const ExecParameters *params,
7cc5ef5f
ZJS
3508 const ExecRuntime *runtime,
3509 char **error_path) {
6818c54c 3510
df61e79a 3511 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL;
56a13a49 3512 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3513 const char *root_dir = NULL, *root_image = NULL;
24759d8f
LB
3514 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
3515 *extension_dir = NULL;
228af36f 3516 NamespaceInfo ns_info;
165a31c0 3517 bool needs_sandboxing;
6c47cd7d 3518 BindMount *bind_mounts = NULL;
da6053d0 3519 size_t n_bind_mounts = 0;
6818c54c 3520 int r;
93c6bb51 3521
2b3c1b9e
DH
3522 assert(context);
3523
915e6d16
LP
3524 if (params->flags & EXEC_APPLY_CHROOT) {
3525 root_image = context->root_image;
3526
3527 if (!root_image)
3528 root_dir = context->root_directory;
3529 }
93c6bb51 3530
6c47cd7d
LP
3531 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3532 if (r < 0)
3533 return r;
3534
211a3d87 3535 /* Symlinks for exec dirs are set up after other mounts, before they are made read-only. */
df61e79a
LB
3536 r = compile_symlinks(context, params, &symlinks);
3537 if (r < 0)
41abd7f6 3538 goto finalize;
df61e79a 3539
9f71ba8d 3540 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3541 if (needs_sandboxing) {
3542 /* The runtime struct only contains the parent of the private /tmp,
3543 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3544 * that is sticky, and that's the one we want to use here.
3545 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3546
3547 if (context->private_tmp && runtime) {
56a13a49
ZJS
3548 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3549 tmp_dir = runtime->tmp_dir;
3550 else if (runtime->tmp_dir)
3551 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3552
3553 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3554 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3555 else if (runtime->var_tmp_dir)
56a13a49 3556 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3557 }
3558
b5a33299
YW
3559 ns_info = (NamespaceInfo) {
3560 .ignore_protect_paths = false,
3561 .private_dev = context->private_devices,
3562 .protect_control_groups = context->protect_control_groups,
3563 .protect_kernel_tunables = context->protect_kernel_tunables,
3564 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3565 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3566 .protect_hostname = context->protect_hostname,
5e98086d 3567 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3568 .private_mounts = context->private_mounts,
52b3d652
LP
3569 .protect_home = context->protect_home,
3570 .protect_system = context->protect_system,
4e399953
LP
3571 .protect_proc = context->protect_proc,
3572 .proc_subset = context->proc_subset,
80271a44 3573 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3574 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3575 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3576 };
ecf63c91 3577 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3578 /*
3579 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3580 * sandbox info, otherwise enforce it, don't ignore protected paths and
3581 * fail if we are enable to apply the sandbox inside the mount namespace.
3582 */
3583 ns_info = (NamespaceInfo) {
3584 .ignore_protect_paths = true,
3585 };
3586 else
3587 ns_info = (NamespaceInfo) {};
b5a33299 3588
37ed15d7
FB
3589 if (context->mount_flags == MS_SHARED)
3590 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3591
a631cbfa
LP
3592 if (exec_context_has_credentials(context) &&
3593 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3594 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3595 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3596 if (!creds_path) {
3597 r = -ENOMEM;
3598 goto finalize;
3599 }
bbb4e7f3
LP
3600 }
3601
5e8deb94
LB
3602 if (MANAGER_IS_SYSTEM(u->manager)) {
3603 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3604 if (!propagate_dir) {
3605 r = -ENOMEM;
3606 goto finalize;
3607 }
3608
5e8deb94 3609 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3610 if (!incoming_dir) {
3611 r = -ENOMEM;
3612 goto finalize;
3613 }
24759d8f
LB
3614
3615 extension_dir = strdup("/run/systemd/unit-extensions");
3616 if (!extension_dir) {
3617 r = -ENOMEM;
3618 goto finalize;
3619 }
3620 } else
3621 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0) {
3622 r = -ENOMEM;
3623 goto finalize;
3624 }
5e8deb94 3625
18d73705 3626 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3627 &ns_info, context->read_write_paths,
165a31c0
LP
3628 needs_sandboxing ? context->read_only_paths : NULL,
3629 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3630 needs_sandboxing ? context->exec_paths : NULL,
3631 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d 3632 empty_directories,
df61e79a 3633 symlinks,
6c47cd7d
LP
3634 bind_mounts,
3635 n_bind_mounts,
2abd4e38
YW
3636 context->temporary_filesystems,
3637 context->n_temporary_filesystems,
b3d13314
LB
3638 context->mount_images,
3639 context->n_mount_images,
56a13a49
ZJS
3640 tmp_dir,
3641 var_tmp_dir,
bbb4e7f3 3642 creds_path,
91dd5f7c 3643 context->log_namespace,
915e6d16 3644 context->mount_flags,
d4d55b0d
LB
3645 context->root_hash, context->root_hash_size, context->root_hash_path,
3646 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3647 context->root_verity,
93f59701
LB
3648 context->extension_images,
3649 context->n_extension_images,
a07b9926 3650 context->extension_directories,
5e8deb94
LB
3651 propagate_dir,
3652 incoming_dir,
24759d8f 3653 extension_dir,
3bdc25a4 3654 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3655 error_path);
93c6bb51 3656
1beab8b0 3657 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3658 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3659 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3660 * completely different execution environment. */
aca835ed 3661 if (r == -ENOANO) {
4e677599
LP
3662 if (insist_on_sandboxing(
3663 context,
3664 root_dir, root_image,
3665 bind_mounts,
3666 n_bind_mounts)) {
3667 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3668 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3669 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3670
3671 r = -EOPNOTSUPP;
3672 } else {
aca835ed 3673 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3674 r = 0;
aca835ed 3675 }
93c6bb51
DH
3676 }
3677
8062e643 3678finalize:
4e677599 3679 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3680 return r;
3681}
3682
915e6d16
LP
3683static int apply_working_directory(
3684 const ExecContext *context,
3685 const ExecParameters *params,
3686 const char *home,
376fecf6 3687 int *exit_status) {
915e6d16 3688
6732edab 3689 const char *d, *wd;
2b3c1b9e
DH
3690
3691 assert(context);
376fecf6 3692 assert(exit_status);
2b3c1b9e 3693
6732edab
LP
3694 if (context->working_directory_home) {
3695
376fecf6
LP
3696 if (!home) {
3697 *exit_status = EXIT_CHDIR;
6732edab 3698 return -ENXIO;
376fecf6 3699 }
6732edab 3700
2b3c1b9e 3701 wd = home;
6732edab 3702
14eb3285
LP
3703 } else
3704 wd = empty_to_root(context->working_directory);
e7f1e7c6 3705
fa97f630 3706 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3707 d = wd;
fa97f630 3708 else
3b0e5bb5 3709 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3710
376fecf6
LP
3711 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3712 *exit_status = EXIT_CHDIR;
2b3c1b9e 3713 return -errno;
376fecf6 3714 }
e7f1e7c6
DH
3715
3716 return 0;
3717}
3718
fa97f630
JB
3719static int apply_root_directory(
3720 const ExecContext *context,
3721 const ExecParameters *params,
3722 const bool needs_mount_ns,
3723 int *exit_status) {
3724
3725 assert(context);
3726 assert(exit_status);
3727
5b10116e 3728 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3729 if (!needs_mount_ns && context->root_directory)
3730 if (chroot(context->root_directory) < 0) {
3731 *exit_status = EXIT_CHROOT;
3732 return -errno;
3733 }
fa97f630
JB
3734
3735 return 0;
3736}
3737
b1edf445 3738static int setup_keyring(
34cf6c43 3739 const Unit *u,
b1edf445
LP
3740 const ExecContext *context,
3741 const ExecParameters *p,
3742 uid_t uid, gid_t gid) {
3743
74dd6b51 3744 key_serial_t keyring;
e64c2d0b
DJL
3745 int r = 0;
3746 uid_t saved_uid;
3747 gid_t saved_gid;
74dd6b51
LP
3748
3749 assert(u);
b1edf445 3750 assert(context);
74dd6b51
LP
3751 assert(p);
3752
3753 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3754 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3755 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3756 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3757 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3758 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3759
b1edf445
LP
3760 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3761 return 0;
3762
e64c2d0b
DJL
3763 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3764 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3765 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3766 * & group is just as nasty as acquiring a reference to the user keyring. */
3767
3768 saved_uid = getuid();
3769 saved_gid = getgid();
3770
3771 if (gid_is_valid(gid) && gid != saved_gid) {
3772 if (setregid(gid, -1) < 0)
3773 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3774 }
3775
3776 if (uid_is_valid(uid) && uid != saved_uid) {
3777 if (setreuid(uid, -1) < 0) {
3778 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3779 goto out;
3780 }
3781 }
3782
74dd6b51
LP
3783 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3784 if (keyring == -1) {
3785 if (errno == ENOSYS)
8002fb97 3786 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3787 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3788 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3789 else if (errno == EDQUOT)
8002fb97 3790 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3791 else
e64c2d0b 3792 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3793
e64c2d0b 3794 goto out;
74dd6b51
LP
3795 }
3796
e64c2d0b
DJL
3797 /* When requested link the user keyring into the session keyring. */
3798 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3799
3800 if (keyctl(KEYCTL_LINK,
3801 KEY_SPEC_USER_KEYRING,
3802 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3803 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3804 goto out;
3805 }
3806 }
3807
3808 /* Restore uid/gid back */
3809 if (uid_is_valid(uid) && uid != saved_uid) {
3810 if (setreuid(saved_uid, -1) < 0) {
3811 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3812 goto out;
3813 }
3814 }
3815
3816 if (gid_is_valid(gid) && gid != saved_gid) {
3817 if (setregid(saved_gid, -1) < 0)
3818 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3819 }
3820
3821 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3822 if (!sd_id128_is_null(u->invocation_id)) {
3823 key_serial_t key;
3824
3825 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3826 if (key == -1)
8002fb97 3827 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3828 else {
3829 if (keyctl(KEYCTL_SETPERM, key,
3830 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3831 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3832 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3833 }
3834 }
3835
e64c2d0b 3836out:
37b22b3b 3837 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3838 /* no extra logging, as only the first already reported error matters */
3839 if (getuid() != saved_uid)
3840 (void) setreuid(saved_uid, -1);
b1edf445 3841
e64c2d0b
DJL
3842 if (getgid() != saved_gid)
3843 (void) setregid(saved_gid, -1);
b1edf445 3844
e64c2d0b 3845 return r;
74dd6b51
LP
3846}
3847
/* Appends each non-negative fd of the two-element 'pair' to 'array', advancing the element counter '*n'
 * once per stored fd. Negative entries are skipped. The caller must provide room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3858
a34ceba6
LP
3859static int close_remaining_fds(
3860 const ExecParameters *params,
34cf6c43
YW
3861 const ExecRuntime *runtime,
3862 const DynamicCreds *dcreds,
00d9ef85 3863 int user_lookup_fd,
a34ceba6 3864 int socket_fd,
5b8d1f6b 3865 const int *fds, size_t n_fds) {
a34ceba6 3866
da6053d0 3867 size_t n_dont_close = 0;
00d9ef85 3868 int dont_close[n_fds + 12];
a34ceba6
LP
3869
3870 assert(params);
3871
3872 if (params->stdin_fd >= 0)
3873 dont_close[n_dont_close++] = params->stdin_fd;
3874 if (params->stdout_fd >= 0)
3875 dont_close[n_dont_close++] = params->stdout_fd;
3876 if (params->stderr_fd >= 0)
3877 dont_close[n_dont_close++] = params->stderr_fd;
3878
3879 if (socket_fd >= 0)
3880 dont_close[n_dont_close++] = socket_fd;
3881 if (n_fds > 0) {
3882 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3883 n_dont_close += n_fds;
3884 }
3885
a70581ff 3886 if (runtime) {
29206d46 3887 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3888 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3889 }
29206d46
LP
3890
3891 if (dcreds) {
3892 if (dcreds->user)
3893 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3894 if (dcreds->group)
3895 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3896 }
3897
00d9ef85
LP
3898 if (user_lookup_fd >= 0)
3899 dont_close[n_dont_close++] = user_lookup_fd;
3900
a34ceba6
LP
3901 return close_all_fds(dont_close, n_dont_close);
3902}
3903
00d9ef85
LP
3904static int send_user_lookup(
3905 Unit *unit,
3906 int user_lookup_fd,
3907 uid_t uid,
3908 gid_t gid) {
3909
3910 assert(unit);
3911
3912 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3913 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3914 * specified. */
3915
3916 if (user_lookup_fd < 0)
3917 return 0;
3918
3919 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3920 return 0;
3921
3922 if (writev(user_lookup_fd,
3923 (struct iovec[]) {
e6a7ec4b
LP
3924 IOVEC_INIT(&uid, sizeof(uid)),
3925 IOVEC_INIT(&gid, sizeof(gid)),
3926 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3927 return -errno;
3928
3929 return 0;
3930}
3931
6732edab
LP
3932static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3933 int r;
3934
3935 assert(c);
3936 assert(home);
3937 assert(buf);
3938
3939 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3940
3941 if (*home)
3942 return 0;
3943
3944 if (!c->working_directory_home)
3945 return 0;
3946
6732edab
LP
3947 r = get_home_dir(buf);
3948 if (r < 0)
3949 return r;
3950
3951 *home = *buf;
3952 return 1;
3953}
3954
da50b85a
LP
3955static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3956 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3957 int r;
3958
3959 assert(c);
3960 assert(p);
3961 assert(ret);
3962
3963 assert(c->dynamic_user);
3964
3965 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3966 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3967 * directories. */
3968
5b10116e 3969 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3970 if (t == EXEC_DIRECTORY_CONFIGURATION)
3971 continue;
3972
3973 if (!p->prefix[t])
3974 continue;
3975
211a3d87 3976 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3977 char *e;
3978
494d0247 3979 if (exec_directory_is_private(c, t))
211a3d87 3980 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3981 else
211a3d87 3982 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3983 if (!e)
3984 return -ENOMEM;
3985
3986 r = strv_consume(&list, e);
3987 if (r < 0)
3988 return r;
3989 }
3990 }
3991
ae2a15bc 3992 *ret = TAKE_PTR(list);
da50b85a
LP
3993
3994 return 0;
3995}
3996
78f93209
LP
3997static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3998 bool using_subcgroup;
3999 char *p;
4000
4001 assert(params);
4002 assert(ret);
4003
4004 if (!params->cgroup_path)
4005 return -EINVAL;
4006
4007 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
4008 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
4009 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
4010 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
4011 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
4012 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
4013 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
4014 * flag, which is only passed for the former statements, not for the latter. */
4015
4016 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
4017 if (using_subcgroup)
657ee2d8 4018 p = path_join(params->cgroup_path, ".control");
78f93209
LP
4019 else
4020 p = strdup(params->cgroup_path);
4021 if (!p)
4022 return -ENOMEM;
4023
4024 *ret = p;
4025 return using_subcgroup;
4026}
4027
e2b2fb7f
MS
4028static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
4029 _cleanup_(cpu_set_reset) CPUSet s = {};
4030 int r;
4031
4032 assert(c);
4033 assert(ret);
4034
4035 if (!c->numa_policy.nodes.set) {
4036 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
4037 return 0;
4038 }
4039
4040 r = numa_to_cpu_set(&c->numa_policy, &s);
4041 if (r < 0)
4042 return r;
4043
4044 cpu_set_reset(ret);
4045
4046 return cpu_set_add_all(ret, &s);
4047}
4048
4049bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
4050 assert(c);
4051
4052 return c->cpu_affinity_from_numa;
4053}
4054
1da37e58
ZJS
4055static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
4056 int r;
4057
4058 assert(fds);
4059 assert(n_fds);
4060 assert(*n_fds < fds_size);
4061 assert(ret_fd);
4062
4063 if (fd < 0) {
4064 *ret_fd = -1;
4065 return 0;
4066 }
4067
4068 if (fd < 3 + (int) *n_fds) {
4069 /* Let's move the fd up, so that it's outside of the fd range we will use to store
4070 * the fds we pass to the process (or which are closed only during execve). */
4071
4072 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
4073 if (r < 0)
4074 return -errno;
4075
4076 CLOSE_AND_REPLACE(fd, r);
4077 }
4078
4079 *ret_fd = fds[*n_fds] = fd;
4080 (*n_fds) ++;
4081 return 1;
4082}
4083
ff0af2a1 4084static int exec_child(
f2341e0a 4085 Unit *unit,
34cf6c43 4086 const ExecCommand *command,
ff0af2a1
LP
4087 const ExecContext *context,
4088 const ExecParameters *params,
4089 ExecRuntime *runtime,
29206d46 4090 DynamicCreds *dcreds,
ff0af2a1 4091 int socket_fd,
2caa38e9 4092 const int named_iofds[static 3],
4c47affc 4093 int *fds,
da6053d0 4094 size_t n_socket_fds,
25b583d7 4095 size_t n_storage_fds,
ff0af2a1 4096 char **files_env,
00d9ef85 4097 int user_lookup_fd,
12145637 4098 int *exit_status) {
d35fbf6b 4099
8c35c10d 4100 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 4101 int r, ngids = 0, exec_fd;
4d885bd3
DH
4102 _cleanup_free_ gid_t *supplementary_gids = NULL;
4103 const char *username = NULL, *groupname = NULL;
5686391b 4104 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 4105 const char *home = NULL, *shell = NULL;
7ca69792 4106 char **final_argv = NULL;
7bce046b
LP
4107 dev_t journal_stream_dev = 0;
4108 ino_t journal_stream_ino = 0;
5749f855 4109 bool userns_set_up = false;
165a31c0
LP
4110 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
4111 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
4112 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
4113 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 4114#if HAVE_SELINUX
7f59dd35 4115 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 4116 bool use_selinux = false;
ecfbc84f 4117#endif
f9fa32f0 4118#if ENABLE_SMACK
43b1f709 4119 bool use_smack = false;
ecfbc84f 4120#endif
349cc4a5 4121#if HAVE_APPARMOR
43b1f709 4122 bool use_apparmor = false;
ecfbc84f 4123#endif
5749f855
AZ
4124 uid_t saved_uid = getuid();
4125 gid_t saved_gid = getgid();
fed1e721
LP
4126 uid_t uid = UID_INVALID;
4127 gid_t gid = GID_INVALID;
1da37e58
ZJS
4128 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
4129 n_keep_fds; /* total number of fds not to close */
165a31c0 4130 int secure_bits;
afb11bf1
DG
4131 _cleanup_free_ gid_t *gids_after_pam = NULL;
4132 int ngids_after_pam = 0;
034c6ed7 4133
f2341e0a 4134 assert(unit);
5cb5a6ff
LP
4135 assert(command);
4136 assert(context);
d35fbf6b 4137 assert(params);
ff0af2a1 4138 assert(exit_status);
d35fbf6b 4139
69339ae9
LP
4140 /* Explicitly test for CVE-2021-4034 inspired invocations */
4141 assert(command->path);
4142 assert(!strv_isempty(command->argv));
4143
d35fbf6b
DM
4144 rename_process_from_path(command->path);
4145
9c274488
LP
4146 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4147 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4148 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4149 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4150 SIGNALS_IGNORE);
d35fbf6b
DM
4151
4152 if (context->ignore_sigpipe)
9c274488 4153 (void) ignore_signals(SIGPIPE);
d35fbf6b 4154
ff0af2a1
LP
4155 r = reset_signal_mask();
4156 if (r < 0) {
4157 *exit_status = EXIT_SIGNAL_MASK;
12145637 4158 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4159 }
034c6ed7 4160
d35fbf6b
DM
4161 if (params->idle_pipe)
4162 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4163
2c027c62
LP
4164 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4165 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4166 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4167 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4168
d35fbf6b 4169 log_forget_fds();
2c027c62 4170 log_set_open_when_needed(true);
4f2d528d 4171
40a80078
LP
4172 /* In case anything used libc syslog(), close this here, too */
4173 closelog();
4174
b1994387 4175 int keep_fds[n_fds + 3];
1da37e58
ZJS
4176 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4177 n_keep_fds = n_fds;
4178
4179 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4180 if (r < 0) {
4181 *exit_status = EXIT_FDS;
4182 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4183 }
4184
b1994387 4185#if HAVE_LIBBPF
46004616
ZJS
4186 if (unit->manager->restrict_fs) {
4187 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4188 if (bpf_map_fd < 0) {
4189 *exit_status = EXIT_FDS;
46004616 4190 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4191 }
4192
4193 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4194 if (r < 0) {
4195 *exit_status = EXIT_FDS;
4196 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4197 }
4198 }
4199#endif
4200
1da37e58 4201 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4202 if (r < 0) {
4203 *exit_status = EXIT_FDS;
12145637 4204 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4205 }
4206
0af07108
ZJS
4207 if (!context->same_pgrp &&
4208 setsid() < 0) {
4209 *exit_status = EXIT_SETSID;
4210 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4211 }
9e2f7c11 4212
1e22b5cd 4213 exec_context_tty_reset(context, params);
d35fbf6b 4214
c891efaf 4215 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4216 _cleanup_free_ char *cmdline = NULL;
4217
4ef15008 4218 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4219 if (!cmdline) {
0460aa5c 4220 *exit_status = EXIT_MEMORY;
12145637 4221 return log_oom();
3b20f877 4222 }
d35fbf6b 4223
4ef15008 4224 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4225 if (r != CONFIRM_EXECUTE) {
4226 if (r == CONFIRM_PRETEND_SUCCESS) {
4227 *exit_status = EXIT_SUCCESS;
4228 return 0;
4229 }
ff0af2a1 4230 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4231 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4232 "Execution cancelled by the user");
d35fbf6b
DM
4233 }
4234 }
1a63a750 4235
d521916d
LP
4236 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4237 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4238 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4239 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4240 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4241 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
4242 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
4243 *exit_status = EXIT_MEMORY;
4244 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4245 }
4246
29206d46 4247 if (context->dynamic_user && dcreds) {
da50b85a 4248 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4249
d521916d 4250 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4251 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4252 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4253 *exit_status = EXIT_USER;
12145637 4254 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4255 }
4256
da50b85a
LP
4257 r = compile_suggested_paths(context, params, &suggested_paths);
4258 if (r < 0) {
4259 *exit_status = EXIT_MEMORY;
4260 return log_oom();
4261 }
4262
4263 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4264 if (r < 0) {
4265 *exit_status = EXIT_USER;
d85ff944
YW
4266 if (r == -EILSEQ)
4267 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4268 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4269 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4270 }
524daa8c 4271
70dd455c 4272 if (!uid_is_valid(uid)) {
29206d46 4273 *exit_status = EXIT_USER;
d85ff944 4274 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4275 }
4276
4277 if (!gid_is_valid(gid)) {
4278 *exit_status = EXIT_USER;
d85ff944 4279 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4280 }
5bc7452b 4281
29206d46
LP
4282 if (dcreds->user)
4283 username = dcreds->user->name;
4284
4285 } else {
4d885bd3
DH
4286 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4287 if (r < 0) {
4288 *exit_status = EXIT_USER;
12145637 4289 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4290 }
5bc7452b 4291
4d885bd3
DH
4292 r = get_fixed_group(context, &groupname, &gid);
4293 if (r < 0) {
4294 *exit_status = EXIT_GROUP;
12145637 4295 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4296 }
cdc5d5c5 4297 }
29206d46 4298
cdc5d5c5
DH
4299 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4300 r = get_supplementary_groups(context, username, groupname, gid,
4301 &supplementary_gids, &ngids);
4302 if (r < 0) {
4303 *exit_status = EXIT_GROUP;
12145637 4304 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4305 }
5bc7452b 4306
00d9ef85
LP
4307 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4308 if (r < 0) {
4309 *exit_status = EXIT_USER;
12145637 4310 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4311 }
4312
4313 user_lookup_fd = safe_close(user_lookup_fd);
4314
6732edab
LP
4315 r = acquire_home(context, uid, &home, &home_buffer);
4316 if (r < 0) {
4317 *exit_status = EXIT_CHDIR;
12145637 4318 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4319 }
4320
d35fbf6b
DM
4321 /* If a socket is connected to STDIN/STDOUT/STDERR, we
4322 * must be sure to drop O_NONBLOCK */
4323 if (socket_fd >= 0)
a34ceba6 4324 (void) fd_nonblock(socket_fd, false);
acbb0225 4325
4c70a4a7
MS
4326 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4327 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4328 if (params->cgroup_path) {
4329 _cleanup_free_ char *p = NULL;
4330
4331 r = exec_parameters_get_cgroup_path(params, &p);
4332 if (r < 0) {
4333 *exit_status = EXIT_CGROUP;
4334 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4335 }
4336
4337 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4338 if (r == -EUCLEAN) {
4339 *exit_status = EXIT_CGROUP;
4340 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4341 "because the cgroup or one of its parents or "
4342 "siblings is in the threaded mode: %m", p);
4343 }
4c70a4a7
MS
4344 if (r < 0) {
4345 *exit_status = EXIT_CGROUP;
4346 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4347 }
4348 }
4349
a8d08f39 4350 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 4351 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4352 if (r < 0) {
4353 *exit_status = EXIT_NETWORK;
4354 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4355 }
4356 }
4357
a70581ff
XR
4358 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4359 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
4360 if (r < 0) {
4361 *exit_status = EXIT_NAMESPACE;
4362 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4363 }
4364 }
4365
52c239d7 4366 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4367 if (r < 0) {
4368 *exit_status = EXIT_STDIN;
12145637 4369 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4370 }
034c6ed7 4371
52c239d7 4372 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4373 if (r < 0) {
4374 *exit_status = EXIT_STDOUT;
12145637 4375 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4376 }
4377
52c239d7 4378 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4379 if (r < 0) {
4380 *exit_status = EXIT_STDERR;
12145637 4381 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4382 }
4383
d35fbf6b 4384 if (context->oom_score_adjust_set) {
9f8168eb
LP
4385 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4386 * prohibit write access to this file, and we shouldn't trip up over that. */
4387 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4388 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4389 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4390 else if (r < 0) {
ff0af2a1 4391 *exit_status = EXIT_OOM_ADJUST;
12145637 4392 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4393 }
d35fbf6b
DM
4394 }
4395
ad21e542
ZJS
4396 if (context->coredump_filter_set) {
4397 r = set_coredump_filter(context->coredump_filter);
4398 if (ERRNO_IS_PRIVILEGE(r))
4399 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4400 else if (r < 0)
4401 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4402 }
4403
39090201
DJL
4404 if (context->nice_set) {
4405 r = setpriority_closest(context->nice);
4406 if (r < 0)
4407 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4408 }
613b411c 4409
d35fbf6b
DM
4410 if (context->cpu_sched_set) {
4411 struct sched_param param = {
4412 .sched_priority = context->cpu_sched_priority,
4413 };
4414
ff0af2a1
LP
4415 r = sched_setscheduler(0,
4416 context->cpu_sched_policy |
4417 (context->cpu_sched_reset_on_fork ?
4418 SCHED_RESET_ON_FORK : 0),
4419 &param);
4420 if (r < 0) {
4421 *exit_status = EXIT_SETSCHEDULER;
12145637 4422 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4423 }
d35fbf6b 4424 }
fc9b2a84 4425
e2b2fb7f
MS
4426 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4427 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4428 const CPUSet *cpu_set;
4429
4430 if (context->cpu_affinity_from_numa) {
4431 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4432 if (r < 0) {
4433 *exit_status = EXIT_CPUAFFINITY;
4434 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4435 }
4436
4437 cpu_set = &converted_cpu_set;
4438 } else
4439 cpu_set = &context->cpu_set;
4440
4441 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4442 *exit_status = EXIT_CPUAFFINITY;
12145637 4443 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4444 }
e2b2fb7f 4445 }
034c6ed7 4446
b070c7c0
MS
4447 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4448 r = apply_numa_policy(&context->numa_policy);
4449 if (r == -EOPNOTSUPP)
33fe9e3f 4450 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4451 else if (r < 0) {
4452 *exit_status = EXIT_NUMA_POLICY;
4453 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4454 }
4455 }
4456
d35fbf6b
DM
4457 if (context->ioprio_set)
4458 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4459 *exit_status = EXIT_IOPRIO;
12145637 4460 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4461 }
da726a4d 4462
d35fbf6b
DM
4463 if (context->timer_slack_nsec != NSEC_INFINITY)
4464 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4465 *exit_status = EXIT_TIMERSLACK;
12145637 4466 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4467 }
9eba9da4 4468
21022b9d
LP
4469 if (context->personality != PERSONALITY_INVALID) {
4470 r = safe_personality(context->personality);
4471 if (r < 0) {
ff0af2a1 4472 *exit_status = EXIT_PERSONALITY;
12145637 4473 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4474 }
21022b9d 4475 }
94f04347 4476
33331d11
VB
4477 if (context->utmp_id) {
4478 const char *line = context->tty_path ?
4479 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4480 NULL;
df0ff127 4481 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4482 line,
023a4f67
LP
4483 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4484 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4485 USER_PROCESS,
6a93917d 4486 username);
33331d11 4487 }
d35fbf6b 4488
08f67696 4489 if (uid_is_valid(uid)) {
ff0af2a1
LP
4490 r = chown_terminal(STDIN_FILENO, uid);
4491 if (r < 0) {
4492 *exit_status = EXIT_STDIN;
12145637 4493 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4494 }
d35fbf6b 4495 }
8e274523 4496
4e1dfa45 4497 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4498 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4499 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4500 * touch a single hierarchy too. */
584b8688 4501 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4502 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4503 if (r < 0) {
4504 *exit_status = EXIT_CGROUP;
12145637 4505 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4506 }
d35fbf6b 4507 }
034c6ed7 4508
211a3d87
LB
4509 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4510
5b10116e 4511 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87 4512 r = setup_exec_directory(context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4513 if (r < 0)
4514 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4515 }
94f04347 4516
bb0c0d6f
LP
4517 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4518 r = setup_credentials(context, params, unit->id, uid);
4519 if (r < 0) {
4520 *exit_status = EXIT_CREDENTIALS;
4521 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4522 }
4523 }
4524
7bce046b 4525 r = build_environment(
fd63e712 4526 unit,
7bce046b
LP
4527 context,
4528 params,
4529 n_fds,
4530 home,
4531 username,
4532 shell,
4533 journal_stream_dev,
4534 journal_stream_ino,
4535 &our_env);
2065ca69
JW
4536 if (r < 0) {
4537 *exit_status = EXIT_MEMORY;
12145637 4538 return log_oom();
2065ca69
JW
4539 }
4540
4541 r = build_pass_environment(context, &pass_env);
4542 if (r < 0) {
4543 *exit_status = EXIT_MEMORY;
12145637 4544 return log_oom();
2065ca69
JW
4545 }
4546
adf769b0
ZJS
4547 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4548 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4549 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4550 if (!strv_isempty(context->exec_search_path)) {
4551 _cleanup_free_ char *joined = NULL;
4552
4553 joined = strv_join(context->exec_search_path, ":");
4554 if (!joined) {
4555 *exit_status = EXIT_MEMORY;
4556 return log_oom();
4557 }
4558
4559 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4560 if (r < 0) {
4561 *exit_status = EXIT_MEMORY;
4562 return log_oom();
4563 }
4564 }
4565
4ab3d29f 4566 accum_env = strv_env_merge(params->environment,
2065ca69 4567 our_env,
8c35c10d 4568 joined_exec_search_path,
2065ca69
JW
4569 pass_env,
4570 context->environment,
44e5d006 4571 files_env);
2065ca69
JW
4572 if (!accum_env) {
4573 *exit_status = EXIT_MEMORY;
12145637 4574 return log_oom();
2065ca69 4575 }
1280503b 4576 accum_env = strv_env_clean(accum_env);
2065ca69 4577
096424d1 4578 (void) umask(context->umask);
b213e1c1 4579
b1edf445 4580 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4581 if (r < 0) {
4582 *exit_status = EXIT_KEYRING;
12145637 4583 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4584 }
4585
adf769b0
ZJS
4586 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4587 * from it. */
1703fa41 4588 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4589
adf769b0
ZJS
4590 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4591 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4592 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4593
adf769b0
ZJS
4594 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4595 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4596 * desired. */
165a31c0
LP
4597 if (needs_ambient_hack)
4598 needs_setuid = false;
4599 else
4600 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4601
4602 if (needs_sandboxing) {
adf769b0
ZJS
4603 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4604 * /sys being present. The actual MAC context application will happen later, as late as
4605 * possible, to avoid impacting our own code paths. */
7f18ef0a 4606
349cc4a5 4607#if HAVE_SELINUX
43b1f709 4608 use_selinux = mac_selinux_use();
7f18ef0a 4609#endif
f9fa32f0 4610#if ENABLE_SMACK
43b1f709 4611 use_smack = mac_smack_use();
7f18ef0a 4612#endif
349cc4a5 4613#if HAVE_APPARMOR
43b1f709 4614 use_apparmor = mac_apparmor_use();
7f18ef0a 4615#endif
165a31c0 4616 }
7f18ef0a 4617
ce932d2d
LP
4618 if (needs_sandboxing) {
4619 int which_failed;
4620
4621 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4622 * is set here. (See below.) */
4623
4624 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4625 if (r < 0) {
4626 *exit_status = EXIT_LIMITS;
4627 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4628 }
4629 }
4630
0af07108 4631 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4632 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4633 * wins here. (See above.) */
4634
1da37e58 4635 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4636 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4637 if (r < 0) {
4638 *exit_status = EXIT_PAM;
4639 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4640 }
ac45f971 4641
0af07108
ZJS
4642 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4643 if (ngids_after_pam < 0) {
4644 *exit_status = EXIT_MEMORY;
4645 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4646 }
b213e1c1 4647 }
5749f855 4648
0af07108 4649 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4650 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4651 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4652 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4653
4654 userns_set_up = true;
4655 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4656 if (r < 0) {
4657 *exit_status = EXIT_USER;
4658 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4659 }
4660 }
4661
a8d08f39
LP
4662 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4663
6e2d7c4f 4664 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4665 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4666 if (r == -EPERM)
4667 log_unit_warning_errno(unit, r,
4668 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4669 else if (r < 0) {
6e2d7c4f
MS
4670 *exit_status = EXIT_NETWORK;
4671 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4672 }
a8d08f39
LP
4673 } else if (context->network_namespace_path) {
4674 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4675 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4676 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4677 } else
4678 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4679 }
169c1bda 4680
a70581ff
XR
4681 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4682
4683 if (ns_type_supported(NAMESPACE_IPC)) {
4684 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4685 if (r == -EPERM)
4686 log_unit_warning_errno(unit, r,
4687 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4688 else if (r < 0) {
4689 *exit_status = EXIT_NAMESPACE;
4690 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4691 }
4692 } else if (context->ipc_namespace_path) {
4693 *exit_status = EXIT_NAMESPACE;
4694 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4695 "IPCNamespacePath= is not supported, refusing.");
4696 } else
4697 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4698 }
4699
ee818b89 4700 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4701 _cleanup_free_ char *error_path = NULL;
4702
9f71ba8d 4703 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4704 if (r < 0) {
4705 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4706 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4707 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4708 }
d35fbf6b 4709 }
81a2b7ce 4710
daf8f72b
LP
4711 if (needs_sandboxing) {
4712 r = apply_protect_hostname(unit, context, exit_status);
4713 if (r < 0)
4714 return r;
aecd5ac6
TM
4715 }
4716
5749f855
AZ
4717 /* Drop groups as early as possible.
4718 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4719 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4720 if (needs_setuid) {
afb11bf1
DG
4721 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4722 int ngids_to_enforce = 0;
4723
4724 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4725 ngids,
4726 gids_after_pam,
4727 ngids_after_pam,
4728 &gids_to_enforce);
4729 if (ngids_to_enforce < 0) {
4730 *exit_status = EXIT_MEMORY;
4731 return log_unit_error_errno(unit,
4732 ngids_to_enforce,
4733 "Failed to merge group lists. Group membership might be incorrect: %m");
4734 }
4735
4736 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4737 if (r < 0) {
4738 *exit_status = EXIT_GROUP;
12145637 4739 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4740 }
165a31c0 4741 }
096424d1 4742
5749f855
AZ
4743 /* If the user namespace was not set up above, try to do it now.
4744 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4745 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4746 * case of mount namespaces being less privileged when the mount point list is copied from a
4747 * different user namespace). */
9008e1ac 4748
5749f855
AZ
4749 if (needs_sandboxing && context->private_users && !userns_set_up) {
4750 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4751 if (r < 0) {
4752 *exit_status = EXIT_USER;
4753 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4754 }
4755 }
4756
9f71ba8d
ZJS
4757 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4758 * shall execute. */
4759
4760 _cleanup_free_ char *executable = NULL;
b83d5050 4761 _cleanup_close_ int executable_fd = -1;
8c35c10d 4762 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4763 if (r < 0) {
4764 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4765 log_unit_struct_errno(unit, LOG_INFO, r,
4766 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4767 LOG_UNIT_INVOCATION_ID(unit),
4768 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4769 command->path),
4770 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4771 return 0;
4772 }
4773
4774 *exit_status = EXIT_EXEC;
c2503e35
RH
4775
4776 return log_unit_struct_errno(unit, LOG_INFO, r,
4777 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4778 LOG_UNIT_INVOCATION_ID(unit),
4779 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4780 command->path),
4781 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4782 }
4783
b83d5050
ZJS
4784 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4785 if (r < 0) {
4786 *exit_status = EXIT_FDS;
4787 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4788 }
4789
9f71ba8d 4790#if HAVE_SELINUX
49590d67
MS
4791 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4792 int fd = -1;
4793
4794 if (socket_fd >= 0)
4795 fd = socket_fd;
4796 else if (params->n_socket_fds == 1)
4797 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4798 * use context from that fd to compute the label. */
4799 fd = params->fds[0];
4800
4801 if (fd >= 0) {
4802 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4803 if (r < 0) {
4804 if (!context->selinux_context_ignore) {
4805 *exit_status = EXIT_SELINUX_CONTEXT;
4806 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4807 }
4808 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4809 }
9f71ba8d
ZJS
4810 }
4811 }
4812#endif
4813
165a31c0 4814 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4815 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4816 * however if we have it as we want to keep it open until the final execve(). */
4817
1da37e58 4818 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4819 if (r >= 0)
4820 r = shift_fds(fds, n_fds);
4821 if (r >= 0)
25b583d7 4822 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4823 if (r < 0) {
4824 *exit_status = EXIT_FDS;
12145637 4825 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4826 }
e66cf1a3 4827
5686391b
LP
4828 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4829 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4830 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4831 * came this far. */
4832
165a31c0 4833 secure_bits = context->secure_bits;
e66cf1a3 4834
165a31c0
LP
4835 if (needs_sandboxing) {
4836 uint64_t bset;
e66cf1a3 4837
ce932d2d
LP
4838 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4839 * requested. (Note this is placed after the general resource limit initialization, see
4840 * above, in order to take precedence.) */
f4170c67
LP
4841 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4842 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4843 *exit_status = EXIT_LIMITS;
12145637 4844 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4845 }
4846 }
4847
37ac2744
JB
4848#if ENABLE_SMACK
4849 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4850 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4851 if (use_smack) {
aa5ae971 4852 r = setup_smack(unit->manager, context, executable_fd);
29ff6247 4853 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4854 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4855 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4856 }
4857 }
4858#endif
4859
165a31c0
LP
4860 bset = context->capability_bounding_set;
4861 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4862 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4863 * instead of us doing that */
4864 if (needs_ambient_hack)
4865 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4866 (UINT64_C(1) << CAP_SETUID) |
4867 (UINT64_C(1) << CAP_SETGID);
4868
4869 if (!cap_test_all(bset)) {
4870 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4871 if (r < 0) {
4872 *exit_status = EXIT_CAPABILITIES;
12145637 4873 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4874 }
4c2630eb 4875 }
3b8bddde 4876
16fcb191
TK
4877 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4878 * keep-caps set.
4879 * To be able to raise the ambient capabilities after setresuid() they have to be
4880 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4881 * After setresuid() the ambient capabilities can be raised as they are present in
4882 * the permitted and inheritable set. However it is possible that someone wants to
4883 * set ambient capabilities without changing the user, so we also set the ambient
4884 * capabilities here.
4885 * The requested ambient capabilities are raised in the inheritable set if the
4886 * second argument is true. */
943800f4 4887 if (!needs_ambient_hack) {
755d4b67
IP
4888 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4889 if (r < 0) {
4890 *exit_status = EXIT_CAPABILITIES;
12145637 4891 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4892 }
755d4b67 4893 }
165a31c0 4894 }
755d4b67 4895
fa97f630
JB
4896 /* chroot to root directory first, before we lose the ability to chroot */
4897 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4898 if (r < 0)
4899 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4900
165a31c0 4901 if (needs_setuid) {
08f67696 4902 if (uid_is_valid(uid)) {
ff0af2a1
LP
4903 r = enforce_user(context, uid);
4904 if (r < 0) {
4905 *exit_status = EXIT_USER;
12145637 4906 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4907 }
165a31c0
LP
4908
4909 if (!needs_ambient_hack &&
4910 context->capability_ambient_set != 0) {
755d4b67 4911
16fcb191 4912 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4913 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4914 if (r < 0) {
4915 *exit_status = EXIT_CAPABILITIES;
12145637 4916 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4917 }
755d4b67 4918 }
5b6319dc 4919 }
165a31c0 4920 }
d35fbf6b 4921
56ef8db9
JB
4922 /* Apply working directory here, because the working directory might be on NFS and only the user running
4923 * this service might have the correct privilege to change to the working directory */
fa97f630 4924 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4925 if (r < 0)
4926 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4927
165a31c0 4928 if (needs_sandboxing) {
37ac2744 4929 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4930 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4931 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4932 * are restricted. */
4933
349cc4a5 4934#if HAVE_SELINUX
43b1f709 4935 if (use_selinux) {
5cd9cd35
LP
4936 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4937
4938 if (exec_context) {
4939 r = setexeccon(exec_context);
006d1864
TM
4940 if (r < 0) {
4941 if (!context->selinux_context_ignore) {
4942 *exit_status = EXIT_SELINUX_CONTEXT;
4943 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4944 }
4945 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4946 }
4947 }
4948 }
4949#endif
4950
349cc4a5 4951#if HAVE_APPARMOR
43b1f709 4952 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4953 r = aa_change_onexec(context->apparmor_profile);
4954 if (r < 0 && !context->apparmor_profile_ignore) {
4955 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4956 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4957 }
4958 }
4959#endif
4960
165a31c0 4961 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4962 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4963 * CAP_SETPCAP. */
4964 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4965 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4966 * effective set here.
4967 * The effective set is overwritten during execve with the following values:
4968 * - ambient set (for non-root processes)
4969 * - (inheritable | bounding) set (for root processes)
4970 *
4971 * Hence there is no security impact to raise it in the effective set before execve
4972 */
4973 r = capability_gain_cap_setpcap(NULL);
4974 if (r < 0) {
4975 *exit_status = EXIT_CAPABILITIES;
4976 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4977 }
755d4b67 4978 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4979 *exit_status = EXIT_SECUREBITS;
12145637 4980 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4981 }
dbdc4098 4982 }
5b6319dc 4983
59eeb84b 4984 if (context_has_no_new_privileges(context))
d35fbf6b 4985 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4986 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4987 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4988 }
4989
349cc4a5 4990#if HAVE_SECCOMP
469830d1
LP
4991 r = apply_address_families(unit, context);
4992 if (r < 0) {
4993 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4994 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4995 }
04aa0cb9 4996
469830d1
LP
4997 r = apply_memory_deny_write_execute(unit, context);
4998 if (r < 0) {
4999 *exit_status = EXIT_SECCOMP;
12145637 5000 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5001 }
f4170c67 5002
469830d1
LP
5003 r = apply_restrict_realtime(unit, context);
5004 if (r < 0) {
5005 *exit_status = EXIT_SECCOMP;
12145637 5006 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5007 }
5008
f69567cb
LP
5009 r = apply_restrict_suid_sgid(unit, context);
5010 if (r < 0) {
5011 *exit_status = EXIT_SECCOMP;
5012 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5013 }
5014
add00535
LP
5015 r = apply_restrict_namespaces(unit, context);
5016 if (r < 0) {
5017 *exit_status = EXIT_SECCOMP;
12145637 5018 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5019 }
5020
469830d1
LP
5021 r = apply_protect_sysctl(unit, context);
5022 if (r < 0) {
5023 *exit_status = EXIT_SECCOMP;
12145637 5024 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5025 }
5026
469830d1
LP
5027 r = apply_protect_kernel_modules(unit, context);
5028 if (r < 0) {
5029 *exit_status = EXIT_SECCOMP;
12145637 5030 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5031 }
5032
84703040
KK
5033 r = apply_protect_kernel_logs(unit, context);
5034 if (r < 0) {
5035 *exit_status = EXIT_SECCOMP;
5036 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5037 }
5038
fc64760d
KK
5039 r = apply_protect_clock(unit, context);
5040 if (r < 0) {
5041 *exit_status = EXIT_SECCOMP;
5042 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5043 }
5044
469830d1
LP
5045 r = apply_private_devices(unit, context);
5046 if (r < 0) {
5047 *exit_status = EXIT_SECCOMP;
12145637 5048 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5049 }
5050
5051 r = apply_syscall_archs(unit, context);
5052 if (r < 0) {
5053 *exit_status = EXIT_SECCOMP;
12145637 5054 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5055 }
5056
78e864e5
TM
5057 r = apply_lock_personality(unit, context);
5058 if (r < 0) {
5059 *exit_status = EXIT_SECCOMP;
12145637 5060 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5061 }
5062
9df2cdd8
TM
5063 r = apply_syscall_log(unit, context);
5064 if (r < 0) {
5065 *exit_status = EXIT_SECCOMP;
5066 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5067 }
5068
5cd9cd35
LP
5069 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5070 * by the filter as little as possible. */
165a31c0 5071 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5072 if (r < 0) {
5073 *exit_status = EXIT_SECCOMP;
12145637 5074 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5075 }
5076#endif
b1994387
ILG
5077
5078#if HAVE_LIBBPF
5079 r = apply_restrict_filesystems(unit, context);
5080 if (r < 0) {
5081 *exit_status = EXIT_BPF;
5082 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5083 }
5084#endif
5085
d35fbf6b 5086 }
034c6ed7 5087
00819cc1
LP
5088 if (!strv_isempty(context->unset_environment)) {
5089 char **ee = NULL;
5090
5091 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5092 if (!ee) {
5093 *exit_status = EXIT_MEMORY;
12145637 5094 return log_oom();
00819cc1
LP
5095 }
5096
130d3d22 5097 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5098 }
5099
7ca69792
AZ
5100 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
5101 replaced_argv = replace_env_argv(command->argv, accum_env);
5102 if (!replaced_argv) {
5103 *exit_status = EXIT_MEMORY;
5104 return log_oom();
5105 }
5106 final_argv = replaced_argv;
5107 } else
5108 final_argv = command->argv;
034c6ed7 5109
f1d34068 5110 if (DEBUG_LOGGING) {
c2b2df60 5111 _cleanup_free_ char *line = NULL;
81a2b7ce 5112
4ef15008 5113 line = quote_command_line(final_argv, SHELL_ESCAPE_EMPTY);
8a62620e
ZJS
5114 if (!line) {
5115 *exit_status = EXIT_MEMORY;
5116 return log_oom();
5117 }
5118
5119 log_unit_struct(unit, LOG_DEBUG,
5120 "EXECUTABLE=%s", executable,
5121 LOG_UNIT_MESSAGE(unit, "Executing: %s", line));
d35fbf6b 5122 }
dd305ec9 5123
5686391b
LP
5124 if (exec_fd >= 0) {
5125 uint8_t hot = 1;
5126
5127 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5128 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5129
5130 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5131 *exit_status = EXIT_EXEC;
5132 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5133 }
5134 }
5135
a6d9111c 5136 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5137
5138 if (exec_fd >= 0) {
5139 uint8_t hot = 0;
5140
5141 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5142 * that POLLHUP on it no longer means execve() succeeded. */
5143
5144 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5145 *exit_status = EXIT_EXEC;
5146 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5147 }
5148 }
12145637 5149
ff0af2a1 5150 *exit_status = EXIT_EXEC;
9f71ba8d 5151 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5152}
81a2b7ce 5153
34cf6c43 5154static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5155static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5156
f2341e0a
LP
/* Forks a child process for 'command' and runs exec_child() in it.
 *
 * Before forking, the parent resolves the named stdio fds, loads the environment files, logs the
 * quoted command line and, if params->cgroup_path is set and a sub-cgroup is in use, creates that
 * sub-cgroup. After forking, the parent attaches the child to the sub-cgroup (if any), records the
 * start in command->exec_status and stores the child's PID in *ret.
 *
 * Returns 0 on success. On failure returns whatever the logging helper used at the failing step
 * (log_unit_error_errno() / log_oom()) returns -- presumably a negative errno-style value; confirm
 * against those helpers. */
5157int exec_spawn(Unit *unit,
5158 ExecCommand *command,
d35fbf6b
DM
5159 const ExecContext *context,
5160 const ExecParameters *params,
5161 ExecRuntime *runtime,
29206d46 5162 DynamicCreds *dcreds,
d35fbf6b 5163 pid_t *ret) {
8351ceae 5164
ee39ca20 5165 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5166 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5167 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5168 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 5169 _cleanup_free_ char *line = NULL;
d35fbf6b 5170 pid_t pid;
8351ceae 5171
f2341e0a 5172 assert(unit);
d35fbf6b
DM
5173 assert(command);
5174 assert(context);
5175 assert(ret);
5176 assert(params);
25b583d7 5177 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5178
/* If any of stdin/stdout/stderr is configured as a socket, exactly one socket fd must have been
 * passed: it becomes socket_fd, while 'fds' stays NULL and both fd counts stay 0. Otherwise the
 * caller's fd array and counts are forwarded unchanged and socket_fd is -1. */
d35fbf6b
DM
5179 if (context->std_input == EXEC_INPUT_SOCKET ||
5180 context->std_output == EXEC_OUTPUT_SOCKET ||
5181 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5182
d85ff944
YW
5183 if (params->n_socket_fds > 1)
5184 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5185
d85ff944
YW
5186 if (params->n_socket_fds == 0)
5187 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5188
d35fbf6b
DM
5189 socket_fd = params->fds[0];
5190 } else {
5191 socket_fd = -1;
5192 fds = params->fds;
9b141911 5193 n_socket_fds = params->n_socket_fds;
25b583d7 5194 n_storage_fds = params->n_storage_fds;
d35fbf6b 5195 }
94f04347 5196
34cf6c43 5197 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5198 if (r < 0)
5199 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5200
f2341e0a 5201 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5202 if (r < 0)
f2341e0a 5203 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5204
/* The quoted command line is built from the unexpanded command->argv and is only used for the
 * "About to execute" log message below. */
4ef15008 5205 line = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
d35fbf6b
DM
5206 if (!line)
5207 return log_oom();
fab56fc5 5208
9f71ba8d
ZJS
5209 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5210 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5211 mac_selinux_maybe_reload();
5212
c2503e35
RH
5213 log_unit_struct(unit, LOG_DEBUG,
5214 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
5215 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
5216 the mount namespace in the child, but we want to log
5217 from the parent, so we need to use the (possibly
5218 inaccurate) path here. */
5219 LOG_UNIT_INVOCATION_ID(unit));
12145637 5220
78f93209
LP
5221 if (params->cgroup_path) {
5222 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
5223 if (r < 0)
5224 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
5225 if (r > 0) { /* We are using a child cgroup */
5226 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5227 if (r < 0)
5228 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
5229
5230 /* Normally we would not propagate the oomd xattrs to children but since we created this
5231 * sub-cgroup internally we should do it. */
5232 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
5233 }
5234 }
5235
d35fbf6b
DM
5236 pid = fork();
5237 if (pid < 0)
74129a12 5238 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5239
/* Child branch: never returns to the caller of exec_spawn(); it always ends in _exit() below. */
5240 if (pid == 0) {
12145637 5241 int exit_status = EXIT_SUCCESS;
ff0af2a1 5242
f2341e0a
LP
5243 r = exec_child(unit,
5244 command,
ff0af2a1
LP
5245 context,
5246 params,
5247 runtime,
29206d46 5248 dcreds,
ff0af2a1 5249 socket_fd,
52c239d7 5250 named_iofds,
4c47affc 5251 fds,
9b141911 5252 n_socket_fds,
25b583d7 5253 n_storage_fds,
ff0af2a1 5254 files_env,
00d9ef85 5255 unit->manager->user_lookup_fds[1],
12145637
LP
5256 &exit_status);
5257
/* A negative return means exec_child() failed; exit_status then names the step that failed
 * and is turned into a string for the structured log entry. */
e1714f02
ZJS
5258 if (r < 0) {
5259 const char *status =
5260 exit_status_to_string(exit_status,
e04ed6db 5261 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 5262
c2503e35
RH
5263 log_unit_struct_errno(unit, LOG_ERR, r,
5264 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5265 LOG_UNIT_INVOCATION_ID(unit),
5266 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5267 status, command->path),
5268 "EXECUTABLE=%s", command->path);
e1714f02 5269 }
4c2630eb 5270
/* Reached whenever exec_child() returned at all; a non-negative return exits with
 * exit_status without logging a failure. */
ff0af2a1 5271 _exit(exit_status);
034c6ed7
LP
5272 }
5273
f2341e0a 5274 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5275
78f93209
LP
5276 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5277 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5278 * process will be killed too). */
/* The result of cg_attach() is deliberately discarded here (note the (void) cast). */
5279 if (subcgroup_path)
5280 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5281
b58b4116 5282 exec_status_start(&command->exec_status, pid);
9fb86720 5283
034c6ed7 5284 *ret = pid;
5cb5a6ff
LP
5285 return 0;
5286}
5287
034c6ed7
LP
/* Assigns the non-zero default values of an ExecContext.
 *
 * Only the fields listed below are written; every other field is left as the caller provided it.
 * NOTE(review): presumably the caller passes zero-initialized memory -- confirm at the call sites. */
5288void exec_context_init(ExecContext *c) {
5289 assert(c);
5290
4c12626c 5291 c->umask = 0022;
0692548c 5292 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5293 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5294 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5295 c->syslog_level_prefix = true;
353e12c2 5296 c->ignore_sigpipe = true;
3a43da28 5297 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5298 c->personality = PERSONALITY_INVALID;
/* Every exec directory type starts out with mode 0755. */
5b10116e
ZJS
5299 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5300 c->directories[t].mode = 0755;
12213aed 5301 c->timeout_clean_usec = USEC_INFINITY;
a103496c 5302 c->capability_bounding_set = CAP_ALL;
/* Compile-time check that the initial namespace flag value is distinct from the "all" value. */
aa9d574d
YW
5303 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5304 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5305 c->log_level_max = -1;
005bfaf1
TM
5306#if HAVE_SECCOMP
5307 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5308#endif
51462135
DDM
5309 c->tty_rows = UINT_MAX;
5310 c->tty_cols = UINT_MAX;
b070c7c0 5311 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
5312}
5313
/* Releases the resources referenced by the fields of an ExecContext and resets those fields.
 *
 * Freed pointer fields are overwritten with the result of the respective free helper (or set to
 * NULL explicitly), and the associated size/count fields are zeroed. The ExecContext structure
 * itself is not freed here. */
613b411c 5314void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5315 assert(c);
5316
6796073e
LP
5317 c->environment = strv_free(c->environment);
5318 c->environment_files = strv_free(c->environment_files);
b4c14404 5319 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5320 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5321
31ce987c 5322 rlimit_free_all(c->rlimit);
034c6ed7 5323
/* One name/file slot per standard stream (three entries each). */
5b10116e 5324 for (size_t l = 0; l < 3; l++) {
52c239d7 5325 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5326 c->stdio_file[l] = mfree(c->stdio_file[l]);
5327 }
52c239d7 5328
a1e58e8e
LP
5329 c->working_directory = mfree(c->working_directory);
5330 c->root_directory = mfree(c->root_directory);
915e6d16 5331 c->root_image = mfree(c->root_image);
18d73705 5332 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5333 c->root_hash = mfree(c->root_hash);
5334 c->root_hash_size = 0;
5335 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5336 c->root_hash_sig = mfree(c->root_hash_sig);
5337 c->root_hash_sig_size = 0;
5338 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5339 c->root_verity = mfree(c->root_verity);
93f59701 5340 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5341 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5342 c->tty_path = mfree(c->tty_path);
5343 c->syslog_identifier = mfree(c->syslog_identifier);
5344 c->user = mfree(c->user);
5345 c->group = mfree(c->group);
034c6ed7 5346
6796073e 5347 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5348
a1e58e8e 5349 c->pam_name = mfree(c->pam_name);
5b6319dc 5350
2a624c36
AP
5351 c->read_only_paths = strv_free(c->read_only_paths);
5352 c->read_write_paths = strv_free(c->read_write_paths);
5353 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5354 c->exec_paths = strv_free(c->exec_paths);
5355 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5356 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5357
/* The results of the next two *_free_many() calls are not assigned, so the pointer and count
 * fields are reset by hand right after each call. */
d2d6c096 5358 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5359 c->bind_mounts = NULL;
5360 c->n_bind_mounts = 0;
2abd4e38
YW
5361 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5362 c->temporary_filesystems = NULL;
5363 c->n_temporary_filesystems = 0;
b3d13314 5364 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5365
0985c7c4 5366 cpu_set_reset(&c->cpu_set);
b070c7c0 5367 numa_policy_reset(&c->numa_policy);
86a3475b 5368
a1e58e8e
LP
5369 c->utmp_id = mfree(c->utmp_id);
5370 c->selinux_context = mfree(c->selinux_context);
5371 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5372 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5373
b1994387
ILG
5374 c->restrict_filesystems = set_free(c->restrict_filesystems);
5375
8cfa775f 5376 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5377 c->syscall_archs = set_free(c->syscall_archs);
5378 c->address_families = set_free(c->address_families);
e66cf1a3 5379
5b10116e 5380 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5381 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5382
/* Back to -1, the same value exec_context_init() assigns to this field. */
5383 c->log_level_max = -1;
5384
5385 exec_context_free_log_extra_fields(c);
08f3be7a 5386
5ac1530e
ZJS
5387 c->log_ratelimit_interval_usec = 0;
5388 c->log_ratelimit_burst = 0;
90fc172e 5389
08f3be7a
LP
5390 c->stdin_data = mfree(c->stdin_data);
5391 c->stdin_data_size = 0;
a8d08f39
LP
5392
5393 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5394 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5395
5396 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5397
43144be4 5398 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5399 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
5400}
5401
34cf6c43 5402int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5403 assert(c);
5404
5405 if (!runtime_prefix)
5406 return 0;
5407
211a3d87 5408 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5409 _cleanup_free_ char *p = NULL;
e66cf1a3 5410
494d0247 5411 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5412 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5413 else
211a3d87 5414 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5415 if (!p)
5416 return -ENOMEM;
5417
7bc4bf4a
LP
5418 /* We execute this synchronously, since we need to be sure this is gone when we start the
5419 * service next. */
c6878637 5420 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5421
211a3d87
LB
5422 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5423 _cleanup_free_ char *symlink_abs = NULL;
5424
5425 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5426 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5427 else
5428 symlink_abs = path_join(runtime_prefix, *symlink);
5429 if (!symlink_abs)
5430 return -ENOMEM;
5431
5432 (void) unlink(symlink_abs);
5433 }
5434
e66cf1a3
LP
5435 }
5436
5437 return 0;
5cb5a6ff
LP
5438}
5439
bb0c0d6f
LP
5440int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
5441 _cleanup_free_ char *p = NULL;
5442
5443 assert(c);
5444
5445 if (!runtime_prefix || !unit)
5446 return 0;
5447
5448 p = path_join(runtime_prefix, "credentials", unit);
5449 if (!p)
5450 return -ENOMEM;
5451
5452 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5453 * unmount it, and afterwards remove the mount point */
5454 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5455 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5456
5457 return 0;
5458}
5459
34cf6c43 5460static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5461 assert(c);
5462
a1e58e8e 5463 c->path = mfree(c->path);
6796073e 5464 c->argv = strv_free(c->argv);
43d0fcbd
LP
5465}
5466
da6053d0 5467void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5468 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5469 exec_command_done(c+i);
5470}
5471
f1acf85a 5472ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5473 ExecCommand *i;
5474
5475 while ((i = c)) {
71fda00f 5476 LIST_REMOVE(command, c, i);
43d0fcbd 5477 exec_command_done(i);
5cb5a6ff
LP
5478 free(i);
5479 }
f1acf85a
ZJS
5480
5481 return NULL;
5cb5a6ff
LP
5482}
5483
da6053d0 5484void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5485 for (size_t i = 0; i < n; i++)
f1acf85a 5486 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5487}
5488
6a1d4d9f 5489void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5490 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5491 exec_status_reset(&c[i].exec_status);
5492}
5493
5494void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5495 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5496 LIST_FOREACH(command, z, c[i])
5497 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5498}
5499
039f0e70 5500typedef struct InvalidEnvInfo {
34cf6c43 5501 const Unit *unit;
039f0e70
LP
5502 const char *path;
5503} InvalidEnvInfo;
5504
5505static void invalid_env(const char *p, void *userdata) {
5506 InvalidEnvInfo *info = userdata;
5507
f2341e0a 5508 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5509}
5510
52c239d7
LB
5511const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5512 assert(c);
5513
5514 switch (fd_index) {
5073ff6b 5515
52c239d7
LB
5516 case STDIN_FILENO:
5517 if (c->std_input != EXEC_INPUT_NAMED_FD)
5518 return NULL;
5073ff6b 5519
52c239d7 5520 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5521
52c239d7
LB
5522 case STDOUT_FILENO:
5523 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5524 return NULL;
5073ff6b 5525
52c239d7 5526 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5527
52c239d7
LB
5528 case STDERR_FILENO:
5529 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5530 return NULL;
5073ff6b 5531
52c239d7 5532 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5533
52c239d7
LB
5534 default:
5535 return NULL;
5536 }
5537}
5538
2caa38e9
LP
5539static int exec_context_named_iofds(
5540 const ExecContext *c,
5541 const ExecParameters *p,
5542 int named_iofds[static 3]) {
5543
5b10116e 5544 size_t targets;
56fbd561 5545 const char* stdio_fdname[3];
da6053d0 5546 size_t n_fds;
52c239d7
LB
5547
5548 assert(c);
5549 assert(p);
2caa38e9 5550 assert(named_iofds);
52c239d7
LB
5551
5552 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5553 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5554 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5555
5b10116e 5556 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5557 stdio_fdname[i] = exec_context_fdname(c, i);
5558
4c47affc
FB
5559 n_fds = p->n_storage_fds + p->n_socket_fds;
5560
5b10116e 5561 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5562 if (named_iofds[STDIN_FILENO] < 0 &&
5563 c->std_input == EXEC_INPUT_NAMED_FD &&
5564 stdio_fdname[STDIN_FILENO] &&
5565 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5566
52c239d7
LB
5567 named_iofds[STDIN_FILENO] = p->fds[i];
5568 targets--;
56fbd561
ZJS
5569
5570 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5571 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5572 stdio_fdname[STDOUT_FILENO] &&
5573 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5574
52c239d7
LB
5575 named_iofds[STDOUT_FILENO] = p->fds[i];
5576 targets--;
56fbd561
ZJS
5577
5578 } else if (named_iofds[STDERR_FILENO] < 0 &&
5579 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5580 stdio_fdname[STDERR_FILENO] &&
5581 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5582
52c239d7
LB
5583 named_iofds[STDERR_FILENO] = p->fds[i];
5584 targets--;
5585 }
5586
56fbd561 5587 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5588}
5589
398a5009
ZJS
5590static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5591 _cleanup_strv_free_ char **v = NULL;
398a5009 5592 int r;
8c7be95e
LP
5593
5594 assert(c);
398a5009 5595 assert(ret);
8c7be95e
LP
5596
5597 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5598 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5599 bool ignore = false;
5600 char *fn = *i;
8c7be95e
LP
5601
5602 if (fn[0] == '-') {
5603 ignore = true;
313cefa1 5604 fn++;
8c7be95e
LP
5605 }
5606
5607 if (!path_is_absolute(fn)) {
8c7be95e
LP
5608 if (ignore)
5609 continue;
8c7be95e
LP
5610 return -EINVAL;
5611 }
5612
2bef10ab 5613 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5614 r = safe_glob(fn, 0, &pglob);
5615 if (r < 0) {
2bef10ab
PL
5616 if (ignore)
5617 continue;
398a5009 5618 return r;
2bef10ab 5619 }
8c7be95e 5620
d8c92e8b
ZJS
5621 /* When we don't match anything, -ENOENT should be returned */
5622 assert(pglob.gl_pathc > 0);
5623
5b10116e 5624 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5625 _cleanup_strv_free_ char **p = NULL;
5626
5627 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5628 if (r < 0) {
2bef10ab
PL
5629 if (ignore)
5630 continue;
398a5009 5631 return r;
e9c1ea9d 5632 }
398a5009 5633
ebc05a09 5634 /* Log invalid environment variables with filename */
039f0e70
LP
5635 if (p) {
5636 InvalidEnvInfo info = {
f2341e0a 5637 .unit = unit,
039f0e70
LP
5638 .path = pglob.gl_pathv[n]
5639 };
5640
5641 p = strv_env_clean_with_callback(p, invalid_env, &info);
5642 }
8c7be95e 5643
398a5009
ZJS
5644 if (!v)
5645 v = TAKE_PTR(p);
2bef10ab 5646 else {
398a5009 5647 char **m = strv_env_merge(v, p);
c84a9488 5648 if (!m)
2bef10ab 5649 return -ENOMEM;
2bef10ab 5650
398a5009 5651 strv_free_and_replace(v, m);
2bef10ab 5652 }
8c7be95e
LP
5653 }
5654 }
5655
398a5009 5656 *ret = TAKE_PTR(v);
8c7be95e
LP
5657
5658 return 0;
5659}
5660
6ac8fdc9 5661static bool tty_may_match_dev_console(const char *tty) {
7b912648 5662 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5663
1e22b5cd
LP
5664 if (!tty)
5665 return true;
5666
a119ec7c 5667 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5668
5669 /* trivial identity? */
5670 if (streq(tty, "console"))
5671 return true;
5672
7b912648
LP
5673 if (resolve_dev_console(&resolved) < 0)
5674 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5675
5676 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5677 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5678}
5679
6c0ae739
LP
5680static bool exec_context_may_touch_tty(const ExecContext *ec) {
5681 assert(ec);
1e22b5cd 5682
6c0ae739 5683 return ec->tty_reset ||
1e22b5cd
LP
5684 ec->tty_vhangup ||
5685 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5686 is_terminal_input(ec->std_input) ||
5687 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5688 is_terminal_output(ec->std_error);
5689}
5690
5691bool exec_context_may_touch_console(const ExecContext *ec) {
5692
5693 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5694 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5695}
5696
15ae422b 5697static void strv_fprintf(FILE *f, char **l) {
15ae422b
LP
5698 assert(f);
5699
5700 STRV_FOREACH(g, l)
5701 fprintf(f, " %s", *g);
5702}
5703
ddc155b2
TM
/* Writes one line of the form "<prefix><name>: <item> <item> ..." to 'f'. Nothing is written if the
 * string vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5715
34cf6c43 5716void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5717 int r;
9eba9da4 5718
5cb5a6ff
LP
5719 assert(c);
5720 assert(f);
5721
4ad49000 5722 prefix = strempty(prefix);
5cb5a6ff
LP
5723
5724 fprintf(f,
94f04347
LP
5725 "%sUMask: %04o\n"
5726 "%sWorkingDirectory: %s\n"
451a074f 5727 "%sRootDirectory: %s\n"
15ae422b 5728 "%sNonBlocking: %s\n"
64747e2d 5729 "%sPrivateTmp: %s\n"
7f112f50 5730 "%sPrivateDevices: %s\n"
59eeb84b 5731 "%sProtectKernelTunables: %s\n"
e66a2f65 5732 "%sProtectKernelModules: %s\n"
84703040 5733 "%sProtectKernelLogs: %s\n"
fc64760d 5734 "%sProtectClock: %s\n"
59eeb84b 5735 "%sProtectControlGroups: %s\n"
d251207d
LP
5736 "%sPrivateNetwork: %s\n"
5737 "%sPrivateUsers: %s\n"
1b8689f9
LP
5738 "%sProtectHome: %s\n"
5739 "%sProtectSystem: %s\n"
5d997827 5740 "%sMountAPIVFS: %s\n"
f3e43635 5741 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5742 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5743 "%sRestrictRealtime: %s\n"
f69567cb 5744 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5745 "%sKeyringMode: %s\n"
4e399953
LP
5746 "%sProtectHostname: %s\n"
5747 "%sProtectProc: %s\n"
5748 "%sProcSubset: %s\n",
5cb5a6ff 5749 prefix, c->umask,
14eb3285
LP
5750 prefix, empty_to_root(c->working_directory),
5751 prefix, empty_to_root(c->root_directory),
15ae422b 5752 prefix, yes_no(c->non_blocking),
64747e2d 5753 prefix, yes_no(c->private_tmp),
7f112f50 5754 prefix, yes_no(c->private_devices),
59eeb84b 5755 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5756 prefix, yes_no(c->protect_kernel_modules),
84703040 5757 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5758 prefix, yes_no(c->protect_clock),
59eeb84b 5759 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5760 prefix, yes_no(c->private_network),
5761 prefix, yes_no(c->private_users),
1b8689f9
LP
5762 prefix, protect_home_to_string(c->protect_home),
5763 prefix, protect_system_to_string(c->protect_system),
5e98086d 5764 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5765 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5766 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5767 prefix, yes_no(c->restrict_realtime),
f69567cb 5768 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5769 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5770 prefix, yes_no(c->protect_hostname),
5771 prefix, protect_proc_to_string(c->protect_proc),
5772 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5773
915e6d16
LP
5774 if (c->root_image)
5775 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5776
18d73705 5777 if (c->root_image_options) {
18d73705
LB
5778 fprintf(f, "%sRootImageOptions:", prefix);
5779 LIST_FOREACH(mount_options, o, c->root_image_options)
5780 if (!isempty(o->options))
9ece6444
LB
5781 fprintf(f, " %s:%s",
5782 partition_designator_to_string(o->partition_designator),
5783 o->options);
18d73705
LB
5784 fprintf(f, "\n");
5785 }
5786
0389f4fa
LB
5787 if (c->root_hash) {
5788 _cleanup_free_ char *encoded = NULL;
5789 encoded = hexmem(c->root_hash, c->root_hash_size);
5790 if (encoded)
5791 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5792 }
5793
5794 if (c->root_hash_path)
5795 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5796
d4d55b0d
LB
5797 if (c->root_hash_sig) {
5798 _cleanup_free_ char *encoded = NULL;
5799 ssize_t len;
5800 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5801 if (len)
5802 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5803 }
5804
5805 if (c->root_hash_sig_path)
5806 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5807
0389f4fa
LB
5808 if (c->root_verity)
5809 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5810
8c7be95e
LP
5811 STRV_FOREACH(e, c->environment)
5812 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5813
5814 STRV_FOREACH(e, c->environment_files)
5815 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5816
b4c14404
FB
5817 STRV_FOREACH(e, c->pass_environment)
5818 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5819
00819cc1
LP
5820 STRV_FOREACH(e, c->unset_environment)
5821 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5822
53f47dfc
YW
5823 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5824
5b10116e 5825 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5826 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5827
211a3d87
LB
5828 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5829 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5830
5831 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5832 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5833 }
3536f49e 5834 }
c2bbd90b 5835
5291f26d 5836 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5837
fb33a393 5838 if (c->nice_set)
5291f26d 5839 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5840
dd6c17b1 5841 if (c->oom_score_adjust_set)
5291f26d 5842 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5843
ad21e542 5844 if (c->coredump_filter_set)
5291f26d 5845 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5846
5b10116e 5847 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5848 if (c->rlimit[i]) {
4c3a2b84 5849 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5850 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5851 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5852 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5853 }
94f04347 5854
f8b69d1d 5855 if (c->ioprio_set) {
1756a011 5856 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5857
5bead76e 5858 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5859 if (r >= 0)
5860 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5861
5bead76e 5862 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5863 }
94f04347 5864
f8b69d1d 5865 if (c->cpu_sched_set) {
1756a011 5866 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5867
837df140
YW
5868 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5869 if (r >= 0)
5870 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5871
94f04347 5872 fprintf(f,
38b48754
LP
5873 "%sCPUSchedulingPriority: %i\n"
5874 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5875 prefix, c->cpu_sched_priority,
5876 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5877 }
94f04347 5878
0985c7c4 5879 if (c->cpu_set.set) {
e7fca352
MS
5880 _cleanup_free_ char *affinity = NULL;
5881
5882 affinity = cpu_set_to_range_string(&c->cpu_set);
5883 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5884 }
5885
b070c7c0
MS
5886 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5887 _cleanup_free_ char *nodes = NULL;
5888
5889 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5890 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5891 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5892 }
5893
3a43da28 5894 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5895 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5896
5897 fprintf(f,
80876c20
LP
5898 "%sStandardInput: %s\n"
5899 "%sStandardOutput: %s\n"
5900 "%sStandardError: %s\n",
5901 prefix, exec_input_to_string(c->std_input),
5902 prefix, exec_output_to_string(c->std_output),
5903 prefix, exec_output_to_string(c->std_error));
5904
befc4a80
LP
5905 if (c->std_input == EXEC_INPUT_NAMED_FD)
5906 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5907 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5908 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5909 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5910 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5911
5912 if (c->std_input == EXEC_INPUT_FILE)
5913 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5914 if (c->std_output == EXEC_OUTPUT_FILE)
5915 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5916 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5917 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5918 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5919 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5920 if (c->std_error == EXEC_OUTPUT_FILE)
5921 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5922 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5923 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5924 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5925 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5926
80876c20
LP
5927 if (c->tty_path)
5928 fprintf(f,
6ea832a2
LP
5929 "%sTTYPath: %s\n"
5930 "%sTTYReset: %s\n"
5931 "%sTTYVHangup: %s\n"
51462135
DDM
5932 "%sTTYVTDisallocate: %s\n"
5933 "%sTTYRows: %u\n"
5934 "%sTTYColumns: %u\n",
6ea832a2
LP
5935 prefix, c->tty_path,
5936 prefix, yes_no(c->tty_reset),
5937 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5938 prefix, yes_no(c->tty_vt_disallocate),
5939 prefix, c->tty_rows,
5940 prefix, c->tty_cols);
94f04347 5941
9f6444eb 5942 if (IN_SET(c->std_output,
9f6444eb
LP
5943 EXEC_OUTPUT_KMSG,
5944 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5945 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5946 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5947 IN_SET(c->std_error,
9f6444eb
LP
5948 EXEC_OUTPUT_KMSG,
5949 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5950 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5951 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5952
5ce70e5b 5953 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5954
837df140
YW
5955 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5956 if (r >= 0)
5957 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5958
837df140
YW
5959 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5960 if (r >= 0)
5961 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5962 }
94f04347 5963
d3070fbd
LP
5964 if (c->log_level_max >= 0) {
5965 _cleanup_free_ char *t = NULL;
5966
5967 (void) log_level_to_string_alloc(c->log_level_max, &t);
5968
5969 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5970 }
5971
5291f26d 5972 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5973 fprintf(f,
5974 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5975 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5976
5ac1530e
ZJS
5977 if (c->log_ratelimit_burst > 0)
5978 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5979
5b10116e
ZJS
5980 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5981 fprintf(f, "%sLogExtraFields: ", prefix);
5982 fwrite(c->log_extra_fields[j].iov_base,
5983 1, c->log_extra_fields[j].iov_len,
5984 f);
5985 fputc('\n', f);
d3070fbd
LP
5986 }
5987
91dd5f7c
LP
5988 if (c->log_namespace)
5989 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5990
07d46372
YW
5991 if (c->secure_bits) {
5992 _cleanup_free_ char *str = NULL;
5993
5994 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5995 if (r >= 0)
5996 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5997 }
94f04347 5998
a103496c 5999 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 6000 _cleanup_free_ char *str = NULL;
94f04347 6001
dd1f5bd0
YW
6002 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
6003 if (r >= 0)
6004 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6005 }
6006
6007 if (c->capability_ambient_set != 0) {
dd1f5bd0 6008 _cleanup_free_ char *str = NULL;
755d4b67 6009
dd1f5bd0
YW
6010 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
6011 if (r >= 0)
6012 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6013 }
6014
6015 if (c->user)
f2d3769a 6016 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6017 if (c->group)
f2d3769a 6018 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6019
29206d46
LP
6020 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6021
ddc155b2 6022 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6023
5b6319dc 6024 if (c->pam_name)
f2d3769a 6025 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6026
ddc155b2
TM
6027 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6028 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6029 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6030 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6031 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6032 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6033
5b10116e
ZJS
6034 for (size_t i = 0; i < c->n_bind_mounts; i++)
6035 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6036 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6037 c->bind_mounts[i].ignore_enoent ? "-": "",
6038 c->bind_mounts[i].source,
6039 c->bind_mounts[i].destination,
6040 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6041
5b10116e
ZJS
6042 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6043 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6044
5b10116e
ZJS
6045 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6046 t->path,
6047 isempty(t->options) ? "" : ":",
6048 strempty(t->options));
6049 }
2abd4e38 6050
169c1bda
LP
6051 if (c->utmp_id)
6052 fprintf(f,
6053 "%sUtmpIdentifier: %s\n",
6054 prefix, c->utmp_id);
7b52a628
MS
6055
6056 if (c->selinux_context)
6057 fprintf(f,
5f8640fb
LP
6058 "%sSELinuxContext: %s%s\n",
6059 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6060
80c21aea
WC
6061 if (c->apparmor_profile)
6062 fprintf(f,
6063 "%sAppArmorProfile: %s%s\n",
6064 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6065
6066 if (c->smack_process_label)
6067 fprintf(f,
6068 "%sSmackProcessLabel: %s%s\n",
6069 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6070
050f7277 6071 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6072 fprintf(f,
6073 "%sPersonality: %s\n",
6074 prefix, strna(personality_to_string(c->personality)));
6075
78e864e5
TM
6076 fprintf(f,
6077 "%sLockPersonality: %s\n",
6078 prefix, yes_no(c->lock_personality));
6079
17df7223 6080 if (c->syscall_filter) {
17df7223 6081 fprintf(f,
57183d11 6082 "%sSystemCallFilter: ",
17df7223
LP
6083 prefix);
6084
6b000af4 6085 if (!c->syscall_allow_list)
17df7223
LP
6086 fputc('~', f);
6087
349cc4a5 6088#if HAVE_SECCOMP
d5a99b7c
JJ
6089 void *id, *val;
6090 bool first = true;
90e74a66 6091 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6092 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6093 const char *errno_name = NULL;
6094 int num = PTR_TO_INT(val);
17df7223
LP
6095
6096 if (first)
6097 first = false;
6098 else
6099 fputc(' ', f);
6100
57183d11 6101 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6102 fputs(strna(name), f);
8cfa775f
YW
6103
6104 if (num >= 0) {
005bfaf1 6105 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6106 if (errno_name)
6107 fprintf(f, ":%s", errno_name);
6108 else
6109 fprintf(f, ":%d", num);
6110 }
17df7223 6111 }
351a19b1 6112#endif
17df7223
LP
6113
6114 fputc('\n', f);
6115 }
6116
57183d11 6117 if (c->syscall_archs) {
57183d11
LP
6118 fprintf(f,
6119 "%sSystemCallArchitectures:",
6120 prefix);
6121
349cc4a5 6122#if HAVE_SECCOMP
d5a99b7c 6123 void *id;
90e74a66 6124 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6125 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6126#endif
6127 fputc('\n', f);
6128 }
6129
add00535
LP
6130 if (exec_context_restrict_namespaces_set(c)) {
6131 _cleanup_free_ char *s = NULL;
6132
86c2a9f1 6133 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6134 if (r >= 0)
6135 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6136 prefix, strna(s));
add00535
LP
6137 }
6138
b1994387 6139#if HAVE_LIBBPF
8fe84dc8
YW
6140 if (exec_context_restrict_filesystems_set(c)) {
6141 char *fs;
6142 SET_FOREACH(fs, c->restrict_filesystems)
6143 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6144 }
b1994387
ILG
6145#endif
6146
a8d08f39
LP
6147 if (c->network_namespace_path)
6148 fprintf(f,
6149 "%sNetworkNamespacePath: %s\n",
6150 prefix, c->network_namespace_path);
6151
3df90f24 6152 if (c->syscall_errno > 0) {
3df90f24
YW
6153 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6154
005bfaf1 6155#if HAVE_SECCOMP
d5a99b7c 6156 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6157 if (errno_name)
005bfaf1 6158 fputs(errno_name, f);
3df90f24 6159 else
005bfaf1
TM
6160 fprintf(f, "%d", c->syscall_errno);
6161#endif
6162 fputc('\n', f);
3df90f24 6163 }
b3d13314 6164
5b10116e 6165 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6166 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6167 c->mount_images[i].ignore_enoent ? "-": "",
6168 c->mount_images[i].source,
79e20ceb 6169 c->mount_images[i].destination);
427353f6 6170 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6171 fprintf(f, ":%s:%s",
427353f6 6172 partition_designator_to_string(o->partition_designator),
79e20ceb 6173 strempty(o->options));
427353f6
LB
6174 fprintf(f, "\n");
6175 }
93f59701
LB
6176
6177 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6178 fprintf(f, "%sExtensionImages: %s%s", prefix,
6179 c->extension_images[i].ignore_enoent ? "-": "",
6180 c->extension_images[i].source);
6181 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6182 fprintf(f, ":%s:%s",
6183 partition_designator_to_string(o->partition_designator),
6184 strempty(o->options));
6185 fprintf(f, "\n");
6186 }
a07b9926
LB
6187
6188 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6189}
6190
34cf6c43 6191bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6192 assert(c);
6193
61233823 6194 /* Returns true if the process forked off would run under
a931ad47
LP
6195 * an unchanged UID or as root. */
6196
6197 if (!c->user)
6198 return true;
6199
6200 if (streq(c->user, "root") || streq(c->user, "0"))
6201 return true;
6202
6203 return false;
6204}
6205
34cf6c43 6206int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6207 int p;
6208
6209 assert(c);
6210
6211 if (c->ioprio_set)
6212 return c->ioprio;
6213
6214 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6215 if (p < 0)
0692548c 6216 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6217
8b330d7d 6218 return ioprio_normalize(p);
7f452159
LP
6219}
6220
5e98086d
ZJS
6221bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6222 assert(c);
6223
61198784 6224 /* Explicit setting wins */
5e98086d
ZJS
6225 if (c->mount_apivfs_set)
6226 return c->mount_apivfs;
6227
61198784 6228 /* Default to "yes" if root directory or image are specified */
74e12520 6229 if (exec_context_with_rootfs(c))
61198784
ZJS
6230 return true;
6231
5e98086d
ZJS
6232 return false;
6233}
6234
d3070fbd 6235void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6236 assert(c);
6237
5b10116e 6238 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6239 free(c->log_extra_fields[l].iov_base);
6240 c->log_extra_fields = mfree(c->log_extra_fields);
6241 c->n_log_extra_fields = 0;
6242}
6243
6f765baf 6244void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
6245 _cleanup_close_ int fd = -1;
6246 const char *path;
6247 struct stat st;
6f765baf
LP
6248 int r;
6249
6250 assert(c);
6251
6252 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6253 exec_context_tty_reset(c, NULL);
6254
6255 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6256 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6257 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6258 if (!exec_context_may_touch_tty(c))
6259 return;
6f765baf 6260
0ba976e8
LP
6261 path = exec_context_tty_path(c);
6262 if (!path)
6263 return;
6f765baf 6264
0ba976e8
LP
6265 fd = open(path, O_PATH|O_CLOEXEC);
6266 if (fd < 0)
6267 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6268 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6269 path);
6270
6271 if (fstat(fd, &st) < 0)
6272 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6273
6274 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6275 * if things are a character device, since a proper check either means we'd have to open the TTY and
6276 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6277 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6278 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6279 if (!S_ISCHR(st.st_mode))
6280 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6281
6282 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6283 if (r < 0)
6284 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6285}
6286
4c2f5842
LP
6287int exec_context_get_clean_directories(
6288 ExecContext *c,
6289 char **prefix,
6290 ExecCleanMask mask,
6291 char ***ret) {
6292
6293 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6294 int r;
6295
6296 assert(c);
6297 assert(prefix);
6298 assert(ret);
6299
5b10116e 6300 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6301 if (!FLAGS_SET(mask, 1U << t))
6302 continue;
6303
6304 if (!prefix[t])
6305 continue;
6306
211a3d87 6307 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6308 char *j;
6309
211a3d87 6310 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6311 if (!j)
6312 return -ENOMEM;
6313
6314 r = strv_consume(&l, j);
6315 if (r < 0)
6316 return r;
7f622a19
YW
6317
6318 /* Also remove private directories unconditionally. */
6319 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6320 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6321 if (!j)
6322 return -ENOMEM;
6323
6324 r = strv_consume(&l, j);
6325 if (r < 0)
6326 return r;
6327 }
6328
211a3d87
LB
6329 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6330 j = path_join(prefix[t], *symlink);
7f622a19
YW
6331 if (!j)
6332 return -ENOMEM;
6333
6334 r = strv_consume(&l, j);
6335 if (r < 0)
6336 return r;
6337 }
4c2f5842
LP
6338 }
6339 }
6340
6341 *ret = TAKE_PTR(l);
6342 return 0;
6343}
6344
6345int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6346 ExecCleanMask mask = 0;
6347
6348 assert(c);
6349 assert(ret);
6350
6351 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6352 if (c->directories[t].n_items > 0)
4c2f5842
LP
6353 mask |= 1U << t;
6354
6355 *ret = mask;
6356 return 0;
6357}
6358
b58b4116 6359void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6360 assert(s);
5cb5a6ff 6361
2ed26ed0
LP
6362 *s = (ExecStatus) {
6363 .pid = pid,
6364 };
6365
b58b4116
LP
6366 dual_timestamp_get(&s->start_timestamp);
6367}
6368
34cf6c43 6369void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6370 assert(s);
6371
d46b79bb 6372 if (s->pid != pid)
2ed26ed0
LP
6373 *s = (ExecStatus) {
6374 .pid = pid,
6375 };
b58b4116 6376
63983207 6377 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6378
034c6ed7
LP
6379 s->code = code;
6380 s->status = status;
169c1bda 6381
6f765baf
LP
6382 if (context && context->utmp_id)
6383 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6384}
6385
6a1d4d9f
LP
6386void exec_status_reset(ExecStatus *s) {
6387 assert(s);
6388
6389 *s = (ExecStatus) {};
6390}
6391
34cf6c43 6392void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6393 assert(s);
6394 assert(f);
6395
9fb86720
LP
6396 if (s->pid <= 0)
6397 return;
6398
4c940960
LP
6399 prefix = strempty(prefix);
6400
9fb86720 6401 fprintf(f,
ccd06097
ZJS
6402 "%sPID: "PID_FMT"\n",
6403 prefix, s->pid);
9fb86720 6404
af9d16e1 6405 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6406 fprintf(f,
6407 "%sStart Timestamp: %s\n",
04f5c018 6408 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6409
af9d16e1 6410 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6411 fprintf(f,
6412 "%sExit Timestamp: %s\n"
6413 "%sExit Code: %s\n"
6414 "%sExit Status: %i\n",
04f5c018 6415 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6416 prefix, sigchld_code_to_string(s->code),
6417 prefix, s->status);
5cb5a6ff 6418}
44d8db9e 6419
34cf6c43 6420static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6421 _cleanup_free_ char *cmd = NULL;
4c940960 6422 const char *prefix2;
44d8db9e
LP
6423
6424 assert(c);
6425 assert(f);
6426
4c940960 6427 prefix = strempty(prefix);
63c372cb 6428 prefix2 = strjoina(prefix, "\t");
44d8db9e 6429
4ef15008 6430 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
44d8db9e
LP
6431 fprintf(f,
6432 "%sCommand Line: %s\n",
7c248223 6433 prefix, cmd ?: strerror_safe(ENOMEM));
44d8db9e 6434
9fb86720 6435 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6436}
6437
6438void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6439 assert(f);
6440
4c940960 6441 prefix = strempty(prefix);
44d8db9e 6442
03677889
YW
6443 LIST_FOREACH(command, i, c)
6444 exec_command_dump(i, f, prefix);
44d8db9e 6445}
94f04347 6446
a6a80b4f
LP
6447void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6448 ExecCommand *end;
6449
6450 assert(l);
6451 assert(e);
6452
6453 if (*l) {
35b8ca3a 6454 /* It's kind of important, that we keep the order here */
71fda00f
LP
6455 LIST_FIND_TAIL(command, *l, end);
6456 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6457 } else
6458 *l = e;
6459}
6460
26fd040d
LP
6461int exec_command_set(ExecCommand *c, const char *path, ...) {
6462 va_list ap;
6463 char **l, *p;
6464
6465 assert(c);
6466 assert(path);
6467
6468 va_start(ap, path);
6469 l = strv_new_ap(path, ap);
6470 va_end(ap);
6471
6472 if (!l)
6473 return -ENOMEM;
6474
250a918d
LP
6475 p = strdup(path);
6476 if (!p) {
26fd040d
LP
6477 strv_free(l);
6478 return -ENOMEM;
6479 }
6480
6897dfe8 6481 free_and_replace(c->path, p);
26fd040d 6482
130d3d22 6483 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6484}
6485
86b23b07 6486int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6487 _cleanup_strv_free_ char **l = NULL;
86b23b07 6488 va_list ap;
86b23b07
JS
6489 int r;
6490
6491 assert(c);
6492 assert(path);
6493
6494 va_start(ap, path);
6495 l = strv_new_ap(path, ap);
6496 va_end(ap);
6497
6498 if (!l)
6499 return -ENOMEM;
6500
e287086b 6501 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6502 if (r < 0)
86b23b07 6503 return r;
86b23b07
JS
6504
6505 return 0;
6506}
6507
e8a565cb
YW
6508static void *remove_tmpdir_thread(void *p) {
6509 _cleanup_free_ char *path = p;
86b23b07 6510
e8a565cb
YW
6511 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6512 return NULL;
6513}
6514
6515static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6516 int r;
6517
6518 if (!rt)
6519 return NULL;
6520
6521 if (rt->manager)
6522 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6523
6524 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6525
6526 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6527 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6528
6529 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6530 if (r < 0)
e8a565cb 6531 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6532 else
6533 rt->tmp_dir = NULL;
e8a565cb 6534 }
613b411c 6535
56a13a49 6536 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6537 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6538
6539 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6540 if (r < 0)
e8a565cb 6541 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6542 else
6543 rt->var_tmp_dir = NULL;
e8a565cb
YW
6544 }
6545
6546 rt->id = mfree(rt->id);
6547 rt->tmp_dir = mfree(rt->tmp_dir);
6548 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6549 safe_close_pair(rt->netns_storage_socket);
a70581ff 6550 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6551 return mfree(rt);
6552}
6553
6554static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6555 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6556}
6557
56a13a49
ZJS
6558static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6559 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6560 ExecRuntime *n;
613b411c 6561
8e8009dc 6562 assert(ret);
613b411c 6563
56a13a49
ZJS
6564 id_copy = strdup(id);
6565 if (!id_copy)
6566 return -ENOMEM;
6567
8e8009dc
LP
6568 n = new(ExecRuntime, 1);
6569 if (!n)
613b411c
LP
6570 return -ENOMEM;
6571
8e8009dc 6572 *n = (ExecRuntime) {
56a13a49 6573 .id = TAKE_PTR(id_copy),
8e8009dc 6574 .netns_storage_socket = { -1, -1 },
a70581ff 6575 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6576 };
6577
6578 *ret = n;
613b411c
LP
6579 return 0;
6580}
6581
e8a565cb
YW
6582static int exec_runtime_add(
6583 Manager *m,
6584 const char *id,
56a13a49
ZJS
6585 char **tmp_dir,
6586 char **var_tmp_dir,
6587 int netns_storage_socket[2],
a70581ff 6588 int ipcns_storage_socket[2],
e8a565cb
YW
6589 ExecRuntime **ret) {
6590
6591 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6592 int r;
6593
e8a565cb 6594 assert(m);
613b411c
LP
6595 assert(id);
6596
a70581ff 6597 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6598
56a13a49 6599 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6600 if (r < 0)
6601 return r;
6602
63083706 6603 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6604 if (r < 0)
6605 return r;
e8a565cb 6606
56a13a49
ZJS
6607 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6608 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6609 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6610
6611 if (netns_storage_socket) {
56a13a49
ZJS
6612 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6613 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6614 }
6615
a70581ff
XR
6616 if (ipcns_storage_socket) {
6617 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6618 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6619 }
6620
e8a565cb
YW
6621 rt->manager = m;
6622
6623 if (ret)
6624 *ret = rt;
e8a565cb 6625 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6626 TAKE_PTR(rt);
e8a565cb
YW
6627 return 0;
6628}
6629
74aaf59b
LP
6630static int exec_runtime_make(
6631 Manager *m,
6632 const ExecContext *c,
6633 const char *id,
6634 ExecRuntime **ret) {
6635
56a13a49 6636 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6637 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6638 int r;
6639
6640 assert(m);
6641 assert(c);
6642 assert(id);
6643
6644 /* It is not necessary to create ExecRuntime object. */
a70581ff 6645 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6646 *ret = NULL;
e8a565cb 6647 return 0;
74aaf59b 6648 }
e8a565cb 6649
efa2f3a1
TM
6650 if (c->private_tmp &&
6651 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6652 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6653 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6654 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6655 if (r < 0)
6656 return r;
6657 }
6658
a8d08f39 6659 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6660 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6661 return -errno;
6662 }
6663
a70581ff
XR
6664 if (c->private_ipc || c->ipc_namespace_path) {
6665 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6666 return -errno;
6667 }
6668
6669 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6670 if (r < 0)
6671 return r;
6672
613b411c
LP
6673 return 1;
6674}
6675
e8a565cb
YW
6676int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6677 ExecRuntime *rt;
6678 int r;
613b411c 6679
e8a565cb
YW
6680 assert(m);
6681 assert(id);
6682 assert(ret);
6683
6684 rt = hashmap_get(m->exec_runtime_by_id, id);
6685 if (rt)
387f6955 6686 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6687 goto ref;
6688
74aaf59b
LP
6689 if (!create) {
6690 *ret = NULL;
e8a565cb 6691 return 0;
74aaf59b 6692 }
e8a565cb
YW
6693
6694 /* If not found, then create a new object. */
6695 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6696 if (r < 0)
e8a565cb 6697 return r;
74aaf59b
LP
6698 if (r == 0) {
6699 /* When r == 0, it is not necessary to create ExecRuntime object. */
6700 *ret = NULL;
6701 return 0;
6702 }
613b411c 6703
e8a565cb
YW
6704ref:
6705 /* increment reference counter. */
6706 rt->n_ref++;
6707 *ret = rt;
6708 return 1;
6709}
613b411c 6710
e8a565cb
YW
6711ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6712 if (!rt)
613b411c
LP
6713 return NULL;
6714
e8a565cb 6715 assert(rt->n_ref > 0);
613b411c 6716
e8a565cb
YW
6717 rt->n_ref--;
6718 if (rt->n_ref > 0)
f2341e0a
LP
6719 return NULL;
6720
e8a565cb 6721 return exec_runtime_free(rt, destroy);
613b411c
LP
6722}
6723
e8a565cb
YW
6724int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6725 ExecRuntime *rt;
e8a565cb
YW
6726
6727 assert(m);
613b411c
LP
6728 assert(f);
6729 assert(fds);
6730
90e74a66 6731 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6732 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6733
e8a565cb
YW
6734 if (rt->tmp_dir)
6735 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6736
e8a565cb
YW
6737 if (rt->var_tmp_dir)
6738 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6739
e8a565cb
YW
6740 if (rt->netns_storage_socket[0] >= 0) {
6741 int copy;
613b411c 6742
e8a565cb
YW
6743 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6744 if (copy < 0)
6745 return copy;
613b411c 6746
e8a565cb
YW
6747 fprintf(f, " netns-socket-0=%i", copy);
6748 }
613b411c 6749
e8a565cb
YW
6750 if (rt->netns_storage_socket[1] >= 0) {
6751 int copy;
613b411c 6752
e8a565cb
YW
6753 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6754 if (copy < 0)
6755 return copy;
613b411c 6756
e8a565cb
YW
6757 fprintf(f, " netns-socket-1=%i", copy);
6758 }
6759
a70581ff
XR
6760 if (rt->ipcns_storage_socket[0] >= 0) {
6761 int copy;
6762
6763 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6764 if (copy < 0)
6765 return copy;
6766
6767 fprintf(f, " ipcns-socket-0=%i", copy);
6768 }
6769
6770 if (rt->ipcns_storage_socket[1] >= 0) {
6771 int copy;
6772
6773 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6774 if (copy < 0)
6775 return copy;
6776
6777 fprintf(f, " ipcns-socket-1=%i", copy);
6778 }
6779
e8a565cb 6780 fputc('\n', f);
613b411c
LP
6781 }
6782
6783 return 0;
6784}
6785
e8a565cb
YW
6786int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6787 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6788 ExecRuntime *rt;
613b411c
LP
6789 int r;
6790
e8a565cb
YW
6791 /* This is for the migration from old (v237 or earlier) deserialization text.
6792 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6793 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6794 * so or not from the serialized text, then we always creates a new object owned by this. */
6795
6796 assert(u);
613b411c
LP
6797 assert(key);
6798 assert(value);
6799
e8a565cb
YW
6800 /* Manager manages ExecRuntime objects by the unit id.
6801 * So, we omit the serialized text when the unit does not have id (yet?)... */
6802 if (isempty(u->id)) {
6803 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6804 return 0;
6805 }
613b411c 6806
cbc165d1
ZJS
6807 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6808 return log_oom();
e8a565cb
YW
6809
6810 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6811 if (!rt) {
cbc165d1 6812 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6813 return log_oom();
613b411c 6814
e8a565cb
YW
6815 rt = rt_create;
6816 }
6817
6818 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6819 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6820 return -ENOMEM;
613b411c
LP
6821
6822 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6823 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6824 return -ENOMEM;
613b411c
LP
6825
6826 } else if (streq(key, "netns-socket-0")) {
6827 int fd;
6828
e8a565cb 6829 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6830 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6831 return 0;
613b411c 6832 }
e8a565cb
YW
6833
6834 safe_close(rt->netns_storage_socket[0]);
6835 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6836
613b411c
LP
6837 } else if (streq(key, "netns-socket-1")) {
6838 int fd;
6839
e8a565cb 6840 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6841 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6842 return 0;
613b411c 6843 }
e8a565cb
YW
6844
6845 safe_close(rt->netns_storage_socket[1]);
6846 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6847
613b411c
LP
6848 } else
6849 return 0;
6850
e8a565cb
YW
6851 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6852 if (rt_create) {
6853 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6854 if (r < 0) {
3fe91079 6855 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6856 return 0;
6857 }
613b411c 6858
e8a565cb 6859 rt_create->manager = u->manager;
613b411c 6860
e8a565cb 6861 /* Avoid cleanup */
56a13a49 6862 TAKE_PTR(rt_create);
e8a565cb 6863 }
98b47d54 6864
e8a565cb
YW
6865 return 1;
6866}
613b411c 6867
56a13a49
ZJS
6868int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6869 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6870 char *id = NULL;
a70581ff 6871 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 6872 const char *p, *v = ASSERT_PTR(value);
e8a565cb 6873 size_t n;
613b411c 6874
e8a565cb 6875 assert(m);
e8a565cb 6876 assert(fds);
98b47d54 6877
e8a565cb 6878 n = strcspn(v, " ");
2f82562b 6879 id = strndupa_safe(v, n);
e8a565cb
YW
6880 if (v[n] != ' ')
6881 goto finalize;
6882 p = v + n + 1;
6883
6884 v = startswith(p, "tmp-dir=");
6885 if (v) {
6886 n = strcspn(v, " ");
56a13a49
ZJS
6887 tmp_dir = strndup(v, n);
6888 if (!tmp_dir)
6889 return log_oom();
e8a565cb
YW
6890 if (v[n] != ' ')
6891 goto finalize;
6892 p = v + n + 1;
6893 }
6894
6895 v = startswith(p, "var-tmp-dir=");
6896 if (v) {
6897 n = strcspn(v, " ");
56a13a49
ZJS
6898 var_tmp_dir = strndup(v, n);
6899 if (!var_tmp_dir)
6900 return log_oom();
e8a565cb
YW
6901 if (v[n] != ' ')
6902 goto finalize;
6903 p = v + n + 1;
6904 }
6905
6906 v = startswith(p, "netns-socket-0=");
6907 if (v) {
6908 char *buf;
6909
6910 n = strcspn(v, " ");
2f82562b 6911 buf = strndupa_safe(v, n);
c413bb28 6912
a70581ff 6913 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6914 if (r < 0)
6915 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6916 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6917 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6918 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6919 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6920 if (v[n] != ' ')
6921 goto finalize;
6922 p = v + n + 1;
613b411c
LP
6923 }
6924
e8a565cb
YW
6925 v = startswith(p, "netns-socket-1=");
6926 if (v) {
6927 char *buf;
98b47d54 6928
e8a565cb 6929 n = strcspn(v, " ");
2f82562b 6930 buf = strndupa_safe(v, n);
a70581ff
XR
6931
6932 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6933 if (r < 0)
6934 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6935 if (!fdset_contains(fds, netns_fdpair[1]))
6936 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6937 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6938 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6939 if (v[n] != ' ')
6940 goto finalize;
6941 p = v + n + 1;
6942 }
6943
6944 v = startswith(p, "ipcns-socket-0=");
6945 if (v) {
6946 char *buf;
6947
6948 n = strcspn(v, " ");
2f82562b 6949 buf = strndupa_safe(v, n);
a70581ff
XR
6950
6951 r = safe_atoi(buf, &ipcns_fdpair[0]);
6952 if (r < 0)
6953 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6954 if (!fdset_contains(fds, ipcns_fdpair[0]))
6955 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6956 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6957 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6958 if (v[n] != ' ')
6959 goto finalize;
6960 p = v + n + 1;
6961 }
6962
6963 v = startswith(p, "ipcns-socket-1=");
6964 if (v) {
6965 char *buf;
6966
6967 n = strcspn(v, " ");
2f82562b 6968 buf = strndupa_safe(v, n);
a70581ff
XR
6969
6970 r = safe_atoi(buf, &ipcns_fdpair[1]);
6971 if (r < 0)
6972 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6973 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6974 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6975 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6976 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6977 }
98b47d54 6978
e8a565cb 6979finalize:
a70581ff 6980 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6981 if (r < 0)
56a13a49
ZJS
6982 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6983 return 0;
e8a565cb 6984}
613b411c 6985
e8a565cb
YW
6986void exec_runtime_vacuum(Manager *m) {
6987 ExecRuntime *rt;
e8a565cb
YW
6988
6989 assert(m);
6990
6991 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6992
90e74a66 6993 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6994 if (rt->n_ref > 0)
6995 continue;
6996
6997 (void) exec_runtime_free(rt, false);
6998 }
613b411c
LP
6999}
7000
b9c04eaf
YW
7001void exec_params_clear(ExecParameters *p) {
7002 if (!p)
7003 return;
7004
c3f8a065
LP
7005 p->environment = strv_free(p->environment);
7006 p->fd_names = strv_free(p->fd_names);
7007 p->fds = mfree(p->fds);
7008 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7009}
7010
bb0c0d6f
LP
7011ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
7012 if (!sc)
7013 return NULL;
7014
7015 free(sc->id);
7016 free(sc->data);
7017 return mfree(sc);
7018}
7019
43144be4
LP
7020ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
7021 if (!lc)
7022 return NULL;
7023
7024 free(lc->id);
7025 free(lc->path);
7026 return mfree(lc);
7027}
7028
211a3d87
LB
7029void exec_directory_done(ExecDirectory *d) {
7030 if (!d)
7031 return;
7032
7033 for (size_t i = 0; i < d->n_items; i++) {
7034 free(d->items[i].path);
7035 strv_free(d->items[i].symlinks);
7036 }
7037
7038 d->items = mfree(d->items);
7039 d->n_items = 0;
7040 d->mode = 0755;
7041}
7042
7043int exec_directory_add(ExecDirectoryItem **d, size_t *n, const char *path, char **symlinks) {
7044 _cleanup_strv_free_ char **s = NULL;
7045 _cleanup_free_ char *p = NULL;
7046
7047 assert(d);
7048 assert(n);
7049 assert(path);
7050
7051 p = strdup(path);
7052 if (!p)
7053 return -ENOMEM;
7054
7055 if (symlinks) {
7056 s = strv_copy(symlinks);
7057 if (!s)
7058 return -ENOMEM;
7059 }
7060
7061 if (!GREEDY_REALLOC(*d, *n + 1))
7062 return -ENOMEM;
7063
7064 (*d)[(*n) ++] = (ExecDirectoryItem) {
7065 .path = TAKE_PTR(p),
7066 .symlinks = TAKE_PTR(s),
7067 };
7068
7069 return 0;
7070}
7071
bb0c0d6f 7072DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 7073DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 7074
80876c20
LP
7075static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7076 [EXEC_INPUT_NULL] = "null",
7077 [EXEC_INPUT_TTY] = "tty",
7078 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7079 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7080 [EXEC_INPUT_SOCKET] = "socket",
7081 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7082 [EXEC_INPUT_DATA] = "data",
2038c3f5 7083 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7084};
7085
8a0867d6
LP
7086DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7087
94f04347 7088static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7089 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7090 [EXEC_OUTPUT_NULL] = "null",
80876c20 7091 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7092 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7093 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7094 [EXEC_OUTPUT_JOURNAL] = "journal",
7095 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7096 [EXEC_OUTPUT_SOCKET] = "socket",
7097 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7098 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7099 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7100 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7101};
7102
7103DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7104
7105static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7106 [EXEC_UTMP_INIT] = "init",
7107 [EXEC_UTMP_LOGIN] = "login",
7108 [EXEC_UTMP_USER] = "user",
7109};
7110
7111DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7112
7113static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7114 [EXEC_PRESERVE_NO] = "no",
7115 [EXEC_PRESERVE_YES] = "yes",
7116 [EXEC_PRESERVE_RESTART] = "restart",
7117};
7118
7119DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7120
6b7b2ed9 7121/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7122static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7123 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7124 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7125 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7126 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7127 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7128};
7129
7130DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7131
211a3d87
LB
7132/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7133static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7134 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7135 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7136 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7137 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7138 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7139};
7140
7141DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7142
6b7b2ed9
LP
7143/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7144 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7145 * directories, specifically .timer units with their timestamp touch file. */
7146static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7147 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7148 [EXEC_DIRECTORY_STATE] = "state",
7149 [EXEC_DIRECTORY_CACHE] = "cache",
7150 [EXEC_DIRECTORY_LOGS] = "logs",
7151 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7152};
7153
7154DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7155
7156/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7157 * the service payload in. */
fb2042dd
YW
7158static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7159 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7160 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7161 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7162 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7163 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7164};
7165
7166DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7167
b1edf445
LP
7168static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7169 [EXEC_KEYRING_INHERIT] = "inherit",
7170 [EXEC_KEYRING_PRIVATE] = "private",
7171 [EXEC_KEYRING_SHARED] = "shared",
7172};
7173
7174DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);