]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
core: rename credential.[ch] -> exec-credential.[ch]
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
ac8db36c 7#include <sys/file.h>
f5947a5e 8#include <sys/ioctl.h>
f3e43635 9#include <sys/mman.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
9c0c6701
DDM
18#include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
349cc4a5 20#if HAVE_PAM
5b6319dc
LP
21#include <security/pam_appl.h>
22#endif
23
349cc4a5 24#if HAVE_SELINUX
7b52a628
MS
25#include <selinux/selinux.h>
26#endif
27
349cc4a5 28#if HAVE_APPARMOR
eef65bf3
MS
29#include <sys/apparmor.h>
30#endif
31
24882e06 32#include "sd-messages.h"
8dd4c05b
LP
33
34#include "af-list.h"
b5efdb8a 35#include "alloc-util.h"
349cc4a5 36#if HAVE_APPARMOR
3ffd4af2
LP
37#include "apparmor-util.h"
38#endif
ee617a4e 39#include "argv-util.h"
8dd4c05b
LP
40#include "async.h"
41#include "barrier.h"
b1994387 42#include "bpf-lsm.h"
9c0c6701 43#include "btrfs-util.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
9c0c6701 46#include "chattr-util.h"
fdb3deca 47#include "cgroup-setup.h"
f461a28d 48#include "chase.h"
bb0c0d6f 49#include "chown-recursive.h"
28db6fbf 50#include "constants.h"
da681e1b 51#include "cpu-set-util.h"
6a818c3c 52#include "data-fd-util.h"
686d13b9 53#include "env-file.h"
4d1a6904 54#include "env-util.h"
17df7223 55#include "errno-list.h"
8a62620e 56#include "escape.h"
43962c30 57#include "exec-credential.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
f97b34a6 61#include "format-util.h"
7d50b32a 62#include "glob-util.h"
0389f4fa 63#include "hexdecoct.h"
c004493c 64#include "io-util.h"
032b3afb 65#include "ioprio-util.h"
9c0c6701 66#include "lock-util.h"
8dd4c05b
LP
67#include "log.h"
68#include "macro.h"
e8a565cb 69#include "manager.h"
2a341bb9 70#include "manager-dump.h"
0a970718 71#include "memory-util.h"
f5947a5e 72#include "missing_fs.h"
5bead76e 73#include "missing_ioprio.h"
7a114ed4 74#include "missing_prctl.h"
35cd0ba5 75#include "mkdir-label.h"
8dd4c05b 76#include "namespace.h"
6bedfcbb 77#include "parse-util.h"
8dd4c05b 78#include "path-util.h"
4d62ee55 79#include "proc-cmdline.h"
0b452006 80#include "process-util.h"
6bb00842 81#include "psi-util.h"
78f22b97 82#include "rlimit-util.h"
8dd4c05b 83#include "rm-rf.h"
3ffd4af2 84#include "seccomp-util.h"
07d46372 85#include "securebits-util.h"
8dd4c05b 86#include "selinux-util.h"
24882e06 87#include "signal-util.h"
8dd4c05b 88#include "smack-util.h"
57b7a260 89#include "socket-util.h"
a2ab603c 90#include "sort-util.h"
fd63e712 91#include "special.h"
949befd3 92#include "stat-util.h"
8b43440b 93#include "string-table.h"
07630cea 94#include "string-util.h"
8dd4c05b 95#include "strv.h"
7ccbd1ae 96#include "syslog-util.h"
8dd4c05b 97#include "terminal-util.h"
bb0c0d6f 98#include "tmpfile-util.h"
566b7d23 99#include "umask-util.h"
2d3b784d 100#include "unit-serialize.h"
b1d4f8e1 101#include "user-util.h"
8dd4c05b 102#include "utmp-wtmp.h"
5cb5a6ff 103
/* Idle timeouts, expressed as multiples of USEC_PER_SEC (five and one, respectively). */
#define IDLE_TIMEOUT_USEC  (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
108
/* Moves the n_fds descriptors in fds[] so that entry i ends up at fd number i+3.
 *
 * The array is modified in place. Returns 0 on success, or a negative errno if duplicating a
 * descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, dup_fd;

                        /* Nothing to do if this one already sits in its slot. */
                        if (fds[idx] == target)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, target);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* F_DUPFD hands out the lowest free fd >= target. If the target itself was
                         * still occupied, remember the first such index and make another pass from
                         * there. */
                        if (retry_at < 0 && dup_fd != target)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
148
cd48e23f
RP
/* Adjusts the file flags of the descriptors that are about to be passed to a child.
 *
 * The first n_socket_fds entries get O_NONBLOCK set or cleared according to 'nonblock'; FD_CLOEXEC
 * is dropped from all n_fds entries. Returns 0 on success or the first negative error
 * encountered. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int k;

                /* O_NONBLOCK is only relevant for the socket activation fds. */
                if (idx < n_socket_fds) {
                        k = fd_nonblock(fds[idx], nonblock);
                        if (k < 0)
                                return k;
                }

                /* These fds are meant to be inherited by our children, hence FD_CLOEXEC always
                 * has to go. */
                k = fd_cloexec(fds[idx], false);
                if (k < 0)
                        return k;
        }

        return 0;
}
184
1e22b5cd 185static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
186 assert(context);
187
1e22b5cd
LP
188 if (context->stdio_as_fds)
189 return NULL;
190
80876c20
LP
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195}
196
4d62ee55 197static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
4d62ee55
DDM
198 unsigned rows, cols;
199 const char *tty;
4d62ee55
DDM
200
201 assert(context);
202 assert(ret_rows);
203 assert(ret_cols);
204
205 rows = context->tty_rows;
206 cols = context->tty_cols;
207
208 tty = exec_context_tty_path(context);
29f5a5ae
DDM
209 if (tty)
210 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
4d62ee55
DDM
211
212 *ret_rows = rows;
213 *ret_cols = cols;
214
215 return 0;
216}
217
1e22b5cd 218static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
a0043bfa
ZJS
219 _cleanup_close_ int fd = -EBADF;
220 const char *path = exec_context_tty_path(ASSERT_PTR(context));
1e22b5cd 221
a0043bfa
ZJS
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
225 */
6ea832a2 226
a0043bfa
ZJS
227 if (p && p->stdin_fd >= 0) {
228 fd = xopenat_lock(p->stdin_fd, NULL,
229 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
230 if (fd < 0)
231 return;
232 } else if (path) {
233 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
234 if (fd < 0)
235 return;
6ea832a2 236
a0043bfa
ZJS
237 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
238 return;
239 } else
240 return; /* nothing to do */
6ea832a2 241
a0043bfa
ZJS
242 if (context->tty_vhangup)
243 (void) terminal_vhangup_fd(fd);
244
245 if (context->tty_reset)
246 (void) reset_terminal_fd(fd, true);
1e22b5cd 247
4d62ee55
DDM
248 if (p && p->stdin_fd >= 0) {
249 unsigned rows = context->tty_rows, cols = context->tty_cols;
250
251 (void) exec_context_tty_size(context, &rows, &cols);
252 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
253 }
51462135 254
1e22b5cd
LP
255 if (context->tty_vt_disallocate && path)
256 (void) vt_disallocate(path);
6ea832a2
LP
257}
258
6af760f3
LP
259static bool is_terminal_input(ExecInput i) {
260 return IN_SET(i,
261 EXEC_INPUT_TTY,
262 EXEC_INPUT_TTY_FORCE,
263 EXEC_INPUT_TTY_FAIL);
264}
265
3a1286b6 266static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
267 return IN_SET(o,
268 EXEC_OUTPUT_TTY,
6af760f3
LP
269 EXEC_OUTPUT_KMSG_AND_CONSOLE,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
271}
272
aac8c0c3
LP
273static bool is_kmsg_output(ExecOutput o) {
274 return IN_SET(o,
275 EXEC_OUTPUT_KMSG,
276 EXEC_OUTPUT_KMSG_AND_CONSOLE);
277}
278
6af760f3
LP
279static bool exec_context_needs_term(const ExecContext *c) {
280 assert(c);
281
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
283
284 if (is_terminal_input(c->std_input))
285 return true;
286
287 if (is_terminal_output(c->std_output))
288 return true;
289
290 if (is_terminal_output(c->std_error))
291 return true;
292
293 return !!c->tty_path;
3a1286b6
MS
294}
295
/* Opens /dev/null with the given flags (plus O_NOCTTY) and moves the result to fd number 'nfd'.
 * Returns a negative errno if the open fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
307
91dd5f7c
LP
308static int connect_journal_socket(
309 int fd,
310 const char *log_namespace,
311 uid_t uid,
312 gid_t gid) {
313
524daa8c
ZJS
314 uid_t olduid = UID_INVALID;
315 gid_t oldgid = GID_INVALID;
91dd5f7c 316 const char *j;
524daa8c
ZJS
317 int r;
318
91dd5f7c
LP
319 j = log_namespace ?
320 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
321 "/run/systemd/journal/stdout";
91dd5f7c 322
cad93f29 323 if (gid_is_valid(gid)) {
524daa8c
ZJS
324 oldgid = getgid();
325
92a17af9 326 if (setegid(gid) < 0)
524daa8c
ZJS
327 return -errno;
328 }
329
cad93f29 330 if (uid_is_valid(uid)) {
524daa8c
ZJS
331 olduid = getuid();
332
92a17af9 333 if (seteuid(uid) < 0) {
524daa8c
ZJS
334 r = -errno;
335 goto restore_gid;
336 }
337 }
338
1861986a 339 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 340
1861986a
LP
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
524daa8c 343
cad93f29 344 if (uid_is_valid(uid))
524daa8c
ZJS
345 (void) seteuid(olduid);
346
347 restore_gid:
cad93f29 348 if (gid_is_valid(gid))
524daa8c
ZJS
349 (void) setegid(oldgid);
350
351 return r;
352}
353
fd1f9c89 354static int connect_logger_as(
34cf6c43 355 const Unit *unit,
fd1f9c89 356 const ExecContext *context,
af635cf3 357 const ExecParameters *params,
fd1f9c89
LP
358 ExecOutput output,
359 const char *ident,
fd1f9c89
LP
360 int nfd,
361 uid_t uid,
362 gid_t gid) {
363
254d1313 364 _cleanup_close_ int fd = -EBADF;
2ac1ff68 365 int r;
071830ff
LP
366
367 assert(context);
af635cf3 368 assert(params);
80876c20
LP
369 assert(output < _EXEC_OUTPUT_MAX);
370 assert(ident);
371 assert(nfd >= 0);
071830ff 372
54fe0cdb
LP
373 fd = socket(AF_UNIX, SOCK_STREAM, 0);
374 if (fd < 0)
80876c20 375 return -errno;
071830ff 376
91dd5f7c 377 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
378 if (r < 0)
379 return r;
071830ff 380
2ac1ff68 381 if (shutdown(fd, SHUT_RD) < 0)
80876c20 382 return -errno;
071830ff 383
fd1f9c89 384 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 385
2ac1ff68 386 if (dprintf(fd,
62bca2c6 387 "%s\n"
80876c20
LP
388 "%s\n"
389 "%i\n"
54fe0cdb
LP
390 "%i\n"
391 "%i\n"
392 "%i\n"
4f4a1dbf 393 "%i\n",
c867611e 394 context->syslog_identifier ?: ident,
af635cf3 395 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
396 context->syslog_priority,
397 !!context->syslog_level_prefix,
f3dc6af2 398 false,
aac8c0c3 399 is_kmsg_output(output),
2ac1ff68
EV
400 is_terminal_output(output)) < 0)
401 return -errno;
80876c20 402
2ac1ff68 403 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 404}
2ac1ff68 405
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to fd number
 * 'nfd'. Returns the error of open_terminal() on failure, otherwise whatever move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 418
2038c3f5 419static int acquire_path(const char *path, int flags, mode_t mode) {
254d1313 420 _cleanup_close_ int fd = -EBADF;
86fca584 421 int r;
071830ff 422
80876c20 423 assert(path);
071830ff 424
2038c3f5
LP
425 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
426 flags |= O_CREAT;
427
428 fd = open(path, flags|O_NOCTTY, mode);
429 if (fd >= 0)
15a3e96f 430 return TAKE_FD(fd);
071830ff 431
2038c3f5
LP
432 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
433 return -errno;
2038c3f5
LP
434
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
436
437 fd = socket(AF_UNIX, SOCK_STREAM, 0);
438 if (fd < 0)
439 return -errno;
440
1861986a
LP
441 r = connect_unix_path(fd, AT_FDCWD, path);
442 if (IN_SET(r, -ENOTSOCK, -EINVAL))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
445 return -ENXIO;
446 if (r < 0)
447 return r;
071830ff 448
2038c3f5
LP
449 if ((flags & O_ACCMODE) == O_RDONLY)
450 r = shutdown(fd, SHUT_WR);
451 else if ((flags & O_ACCMODE) == O_WRONLY)
452 r = shutdown(fd, SHUT_RD);
453 else
86fca584 454 r = 0;
15a3e96f 455 if (r < 0)
2038c3f5 456 return -errno;
2038c3f5 457
15a3e96f 458 return TAKE_FD(fd);
80876c20 459}
071830ff 460
08f3be7a
LP
461static int fixup_input(
462 const ExecContext *context,
463 int socket_fd,
464 bool apply_tty_stdin) {
465
466 ExecInput std_input;
467
468 assert(context);
469
470 std_input = context->std_input;
1e3ad081
LP
471
472 if (is_terminal_input(std_input) && !apply_tty_stdin)
473 return EXEC_INPUT_NULL;
071830ff 474
03fd9c49 475 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
476 return EXEC_INPUT_NULL;
477
08f3be7a
LP
478 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
479 return EXEC_INPUT_NULL;
480
03fd9c49 481 return std_input;
4f2d528d
LP
482}
483
7966a916 484static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 485
7966a916 486 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
487 return EXEC_OUTPUT_INHERIT;
488
7966a916 489 return output;
4f2d528d
LP
490}
491
a34ceba6
LP
492static int setup_input(
493 const ExecContext *context,
494 const ExecParameters *params,
52c239d7 495 int socket_fd,
2caa38e9 496 const int named_iofds[static 3]) {
a34ceba6 497
4f2d528d 498 ExecInput i;
51462135 499 int r;
4f2d528d
LP
500
501 assert(context);
a34ceba6 502 assert(params);
2caa38e9 503 assert(named_iofds);
a34ceba6
LP
504
505 if (params->stdin_fd >= 0) {
506 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
507 return -errno;
508
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e 510 if (isatty(STDIN_FILENO)) {
4d62ee55
DDM
511 unsigned rows = context->tty_rows, cols = context->tty_cols;
512
513 (void) exec_context_tty_size(context, &rows, &cols);
1fb0682e
LP
514 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
515 (void) reset_terminal_fd(STDIN_FILENO, true);
4d62ee55 516 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
1fb0682e 517 }
a34ceba6
LP
518
519 return STDIN_FILENO;
520 }
4f2d528d 521
08f3be7a 522 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
523
524 switch (i) {
071830ff 525
80876c20
LP
526 case EXEC_INPUT_NULL:
527 return open_null_as(O_RDONLY, STDIN_FILENO);
528
529 case EXEC_INPUT_TTY:
530 case EXEC_INPUT_TTY_FORCE:
531 case EXEC_INPUT_TTY_FAIL: {
4d62ee55 532 unsigned rows, cols;
046a82c1 533 int fd;
071830ff 534
1e22b5cd 535 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
536 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
537 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
538 ACQUIRE_TERMINAL_WAIT,
3a43da28 539 USEC_INFINITY);
970edce6 540 if (fd < 0)
80876c20
LP
541 return fd;
542
4d62ee55
DDM
543 r = exec_context_tty_size(context, &rows, &cols);
544 if (r < 0)
545 return r;
546
547 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
51462135
DDM
548 if (r < 0)
549 return r;
550
046a82c1 551 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
552 }
553
4f2d528d 554 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
555 assert(socket_fd >= 0);
556
7c248223 557 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 558
52c239d7 559 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
560 assert(named_iofds[STDIN_FILENO] >= 0);
561
52c239d7 562 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 563 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 564
08f3be7a
LP
565 case EXEC_INPUT_DATA: {
566 int fd;
567
568 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
569 if (fd < 0)
570 return fd;
571
572 return move_fd(fd, STDIN_FILENO, false);
573 }
574
2038c3f5
LP
575 case EXEC_INPUT_FILE: {
576 bool rw;
577 int fd;
578
579 assert(context->stdio_file[STDIN_FILENO]);
580
581 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
582 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
583
584 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
585 if (fd < 0)
586 return fd;
587
588 return move_fd(fd, STDIN_FILENO, false);
589 }
590
80876c20 591 default:
04499a70 592 assert_not_reached();
80876c20
LP
593 }
594}
595
41fc585a
LP
596static bool can_inherit_stderr_from_stdout(
597 const ExecContext *context,
598 ExecOutput o,
599 ExecOutput e) {
600
601 assert(context);
602
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
604 * stderr fd */
605
606 if (e == EXEC_OUTPUT_INHERIT)
607 return true;
608 if (e != o)
609 return false;
610
611 if (e == EXEC_OUTPUT_NAMED_FD)
612 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
613
8d7dab1f 614 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
615 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
616
617 return true;
618}
619
a34ceba6 620static int setup_output(
34cf6c43 621 const Unit *unit,
a34ceba6
LP
622 const ExecContext *context,
623 const ExecParameters *params,
624 int fileno,
625 int socket_fd,
2caa38e9 626 const int named_iofds[static 3],
a34ceba6 627 const char *ident,
7bce046b
LP
628 uid_t uid,
629 gid_t gid,
630 dev_t *journal_stream_dev,
631 ino_t *journal_stream_ino) {
a34ceba6 632
4f2d528d
LP
633 ExecOutput o;
634 ExecInput i;
47c1d80d 635 int r;
4f2d528d 636
f2341e0a 637 assert(unit);
80876c20 638 assert(context);
a34ceba6 639 assert(params);
80876c20 640 assert(ident);
7bce046b
LP
641 assert(journal_stream_dev);
642 assert(journal_stream_ino);
80876c20 643
a34ceba6
LP
644 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
645
646 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
647 return -errno;
648
649 return STDOUT_FILENO;
650 }
651
652 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
653 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
654 return -errno;
655
656 return STDERR_FILENO;
657 }
658
08f3be7a 659 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 660 o = fixup_output(context->std_output, socket_fd);
4f2d528d 661
eb17e935
MS
662 if (fileno == STDERR_FILENO) {
663 ExecOutput e;
664 e = fixup_output(context->std_error, socket_fd);
80876c20 665
eb17e935
MS
666 /* This expects the input and output are already set up */
667
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e == EXEC_OUTPUT_INHERIT &&
671 o == EXEC_OUTPUT_INHERIT &&
672 i == EXEC_INPUT_NULL &&
673 !is_terminal_input(context->std_input) &&
7966a916 674 getppid() != 1)
eb17e935
MS
675 return fileno;
676
677 /* Duplicate from stdout if possible */
41fc585a 678 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 679 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 680
eb17e935 681 o = e;
80876c20 682
eb17e935 683 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
684 /* If input got downgraded, inherit the original value */
685 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 686 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 687
08f3be7a
LP
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 690 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 691
acb591e4
LP
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
693 if (getppid() != 1)
eb17e935 694 return fileno;
94f04347 695
eb17e935
MS
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY, fileno);
071830ff 698 }
94f04347 699
eb17e935 700 switch (o) {
80876c20
LP
701
702 case EXEC_OUTPUT_NULL:
eb17e935 703 return open_null_as(O_WRONLY, fileno);
80876c20
LP
704
705 case EXEC_OUTPUT_TTY:
4f2d528d 706 if (is_terminal_input(i))
7c248223 707 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
708
709 /* We don't reset the terminal if this is just about output */
1e22b5cd 710 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 711
9a6bca7a 712 case EXEC_OUTPUT_KMSG:
28dbc1e8 713 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
714 case EXEC_OUTPUT_JOURNAL:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 716 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 717 if (r < 0) {
7966a916
ZJS
718 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 720 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
721 } else {
722 struct stat st;
723
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
726 * services to detect whether they are connected to the journal or not.
727 *
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
7bce046b 730
ab2116b1
LP
731 if (fstat(fileno, &st) >= 0 &&
732 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
733 *journal_stream_dev = st.st_dev;
734 *journal_stream_ino = st.st_ino;
735 }
47c1d80d
MS
736 }
737 return r;
4f2d528d
LP
738
739 case EXEC_OUTPUT_SOCKET:
740 assert(socket_fd >= 0);
e75a9ed1 741
7c248223 742 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 743
52c239d7 744 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
745 assert(named_iofds[fileno] >= 0);
746
52c239d7 747 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 748 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 749
566b7d23 750 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
751 case EXEC_OUTPUT_FILE_APPEND:
752 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 753 bool rw;
566b7d23 754 int fd, flags;
2038c3f5
LP
755
756 assert(context->stdio_file[fileno]);
757
758 rw = context->std_input == EXEC_INPUT_FILE &&
759 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
760
761 if (rw)
7c248223 762 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 763
566b7d23
ZD
764 flags = O_WRONLY;
765 if (o == EXEC_OUTPUT_FILE_APPEND)
766 flags |= O_APPEND;
8d7dab1f
LW
767 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
768 flags |= O_TRUNC;
566b7d23
ZD
769
770 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
771 if (fd < 0)
772 return fd;
773
566b7d23 774 return move_fd(fd, fileno, 0);
2038c3f5
LP
775 }
776
94f04347 777 default:
04499a70 778 assert_not_reached();
94f04347 779 }
071830ff
LP
780}
781
02a51aba 782static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 783 int r;
02a51aba
LP
784
785 assert(fd >= 0);
02a51aba 786
1ff74fb6 787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
788 if (isatty(fd) < 1) {
789 if (IN_SET(errno, EINVAL, ENOTTY))
790 return 0; /* not a tty */
1ff74fb6 791
02a51aba 792 return -errno;
4b3b5bc7 793 }
02a51aba 794
4b3b5bc7 795 /* This might fail. What matters are the results. */
f2df231f 796 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
797 if (r < 0)
798 return r;
02a51aba 799
4b3b5bc7 800 return 1;
02a51aba
LP
801}
802
aedec452 803static int setup_confirm_stdio(
51462135 804 const ExecContext *context,
aedec452
LP
805 const char *vc,
806 int *ret_saved_stdin,
807 int *ret_saved_stdout) {
808
254d1313 809 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
4d62ee55 810 unsigned rows, cols;
3d18b167 811 int r;
80876c20 812
aedec452
LP
813 assert(ret_saved_stdin);
814 assert(ret_saved_stdout);
80876c20 815
af6da548
LP
816 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
817 if (saved_stdin < 0)
818 return -errno;
80876c20 819
af6da548 820 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
821 if (saved_stdout < 0)
822 return -errno;
80876c20 823
8854d795 824 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
825 if (fd < 0)
826 return fd;
80876c20 827
af6da548
LP
828 r = chown_terminal(fd, getuid());
829 if (r < 0)
3d18b167 830 return r;
02a51aba 831
3d18b167
LP
832 r = reset_terminal_fd(fd, true);
833 if (r < 0)
834 return r;
80876c20 835
4d62ee55
DDM
836 r = exec_context_tty_size(context, &rows, &cols);
837 if (r < 0)
838 return r;
839
840 r = terminal_set_size_fd(fd, vc, rows, cols);
51462135
DDM
841 if (r < 0)
842 return r;
843
aedec452
LP
844 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
845 TAKE_FD(fd);
2b33ab09
LP
846 if (r < 0)
847 return r;
80876c20 848
aedec452
LP
849 *ret_saved_stdin = TAKE_FD(saved_stdin);
850 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 851 return 0;
80876c20
LP
852}
853
63d77c92 854static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
855 assert(err < 0);
856
857 if (err == -ETIMEDOUT)
63d77c92 858 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
859 else {
860 errno = -err;
63d77c92 861 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
862 }
863}
864
63d77c92 865static void write_confirm_error(int err, const char *vc, const Unit *u) {
254d1313 866 _cleanup_close_ int fd = -EBADF;
80876c20 867
3b20f877 868 assert(vc);
80876c20 869
7d5ceb64 870 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 871 if (fd < 0)
3b20f877 872 return;
80876c20 873
63d77c92 874 write_confirm_error_fd(err, fd, u);
af6da548 875}
80876c20 876
/* Releases the terminal and puts the saved stdin/stdout duplicates back in place, closing the
 * duplicates afterwards. Returns 0 on success, or the negative errno of the last dup2() that
 * failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
898
3b20f877
FB
/* Possible outcomes of the confirmation prompt, as returned by ask_for_confirmation(). */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command and pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command and pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
904
51462135 905static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 906 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 907 _cleanup_free_ char *e = NULL;
3b20f877 908 char c;
af6da548 909
3b20f877 910 /* For any internal errors, assume a positive response. */
51462135 911 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 912 if (r < 0) {
63d77c92 913 write_confirm_error(r, vc, u);
3b20f877
FB
914 return CONFIRM_EXECUTE;
915 }
af6da548 916
b0eb2944
FB
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u->manager)) {
919 r = 1;
920 goto restore_stdio;
921 }
af6da548 922
2bcd3c26
FB
923 e = ellipsize(cmdline, 60, 100);
924 if (!e) {
925 log_oom();
926 r = CONFIRM_EXECUTE;
927 goto restore_stdio;
928 }
af6da548 929
d172b175 930 for (;;) {
539622bd 931 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 932 if (r < 0) {
63d77c92 933 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
934 r = CONFIRM_EXECUTE;
935 goto restore_stdio;
936 }
af6da548 937
d172b175 938 switch (c) {
b0eb2944
FB
939 case 'c':
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
942 r = 1;
943 break;
dd6f9ac0
FB
944 case 'D':
945 unit_dump(u, stdout, " ");
946 continue; /* ask again */
d172b175
FB
947 case 'f':
948 printf("Failing execution.\n");
949 r = CONFIRM_PRETEND_FAILURE;
950 break;
951 case 'h':
b0eb2944
FB
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
dd6f9ac0 954 " f - fail, don't execute the command and pretend it failed\n"
d172b175 955 " h - help\n"
eedf223a 956 " i - info, show a short summary of the unit\n"
56fde33a 957 " j - jobs, show jobs that are in progress\n"
d172b175
FB
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
dd6f9ac0 960 continue; /* ask again */
eedf223a
FB
961 case 'i':
962 printf(" Description: %s\n"
963 " Unit: %s\n"
964 " Command: %s\n",
965 u->id, u->description, cmdline);
966 continue; /* ask again */
56fde33a 967 case 'j':
d1d8786c 968 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
56fde33a 969 continue; /* ask again */
539622bd
FB
970 case 'n':
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
d172b175
FB
974 case 's':
975 printf("Skipping execution.\n");
976 r = CONFIRM_PRETEND_SUCCESS;
977 break;
978 case 'y':
979 r = CONFIRM_EXECUTE;
980 break;
981 default:
04499a70 982 assert_not_reached();
d172b175 983 }
3b20f877 984 break;
3b20f877 985 }
af6da548 986
3b20f877 987restore_stdio:
af6da548 988 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 989 return r;
80876c20
LP
990}
991
4d885bd3
DH
992static int get_fixed_user(const ExecContext *c, const char **user,
993 uid_t *uid, gid_t *gid,
994 const char **home, const char **shell) {
81a2b7ce 995 int r;
4d885bd3 996 const char *name;
81a2b7ce 997
4d885bd3 998 assert(c);
81a2b7ce 999
23deef88
LP
1000 if (!c->user)
1001 return 0;
1002
4d885bd3
DH
1003 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1004 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 1005
23deef88 1006 name = c->user;
fafff8f1 1007 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
1008 if (r < 0)
1009 return r;
81a2b7ce 1010
4d885bd3
DH
1011 *user = name;
1012 return 0;
1013}
1014
1015static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1016 int r;
1017 const char *name;
1018
1019 assert(c);
1020
1021 if (!c->group)
1022 return 0;
1023
1024 name = c->group;
fafff8f1 1025 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
1026 if (r < 0)
1027 return r;
1028
1029 *group = name;
1030 return 0;
1031}
1032
cdc5d5c5
DH
1033static int get_supplementary_groups(const ExecContext *c, const char *user,
1034 const char *group, gid_t gid,
1035 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
1036 int r, k = 0;
1037 int ngroups_max;
1038 bool keep_groups = false;
1039 gid_t *groups = NULL;
1040 _cleanup_free_ gid_t *l_gids = NULL;
1041
1042 assert(c);
1043
bbeea271
DH
1044 /*
1045 * If user is given, then lookup GID and supplementary groups list.
1046 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1047 * here and as early as possible so we keep the list of supplementary
1048 * groups of the caller.
bbeea271
DH
1049 */
1050 if (user && gid_is_valid(gid) && gid != 0) {
1051 /* First step, initialize groups from /etc/groups */
1052 if (initgroups(user, gid) < 0)
1053 return -errno;
1054
1055 keep_groups = true;
1056 }
1057
ac6e8be6 1058 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1059 return 0;
1060
366ddd25
DH
1061 /*
1062 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1063 * be positive, otherwise fail.
1064 */
1065 errno = 0;
1066 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1067 if (ngroups_max <= 0)
1068 return errno_or_else(EOPNOTSUPP);
366ddd25 1069
4d885bd3
DH
1070 l_gids = new(gid_t, ngroups_max);
1071 if (!l_gids)
1072 return -ENOMEM;
81a2b7ce 1073
4d885bd3
DH
1074 if (keep_groups) {
1075 /*
1076 * Lookup the list of groups that the user belongs to, we
1077 * avoid NSS lookups here too for gid=0.
1078 */
1079 k = ngroups_max;
1080 if (getgrouplist(user, gid, l_gids, &k) < 0)
1081 return -EINVAL;
1082 } else
1083 k = 0;
81a2b7ce 1084
4d885bd3
DH
1085 STRV_FOREACH(i, c->supplementary_groups) {
1086 const char *g;
81a2b7ce 1087
4d885bd3
DH
1088 if (k >= ngroups_max)
1089 return -E2BIG;
81a2b7ce 1090
4d885bd3 1091 g = *i;
fafff8f1 1092 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1093 if (r < 0)
1094 return r;
81a2b7ce 1095
4d885bd3
DH
1096 k++;
1097 }
81a2b7ce 1098
4d885bd3
DH
1099 /*
1100 * Sets ngids to zero to drop all supplementary groups, happens
1101 * when we are under root and SupplementaryGroups= is empty.
1102 */
1103 if (k == 0) {
1104 *ngids = 0;
1105 return 0;
1106 }
81a2b7ce 1107
4d885bd3
DH
1108 /* Otherwise get the final list of supplementary groups */
1109 groups = memdup(l_gids, sizeof(gid_t) * k);
1110 if (!groups)
1111 return -ENOMEM;
1112
1113 *supplementary_gids = groups;
1114 *ngids = k;
1115
1116 groups = NULL;
1117
1118 return 0;
1119}
1120
/* Applies the group credentials: installs the supplementary group list (only if ngids > 0) via
 * maybe_setgroups() and then, if gid is valid, sets the real, effective and saved GID to it.
 *
 * Returns 0 on success, a negative errno-style value otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1139
a954b249
LP
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written in that case),
 * 1 if they were changed, and -errno if reading or writing them via prctl() failed. */
static int set_securebits(unsigned bits, unsigned mask) {
        int before;
        unsigned wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = bits | ((unsigned) before & ~mask);
        if (wanted == (unsigned) before)
                return 0;

        if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                return -errno;

        return 1;
}
1158
638fd8cc
LP
1159static int enforce_user(
1160 const ExecContext *context,
1161 uid_t uid,
1162 uint64_t capability_ambient_set) {
81a2b7ce 1163 assert(context);
dbdc4098 1164 int r;
81a2b7ce 1165
4d885bd3
DH
1166 if (!uid_is_valid(uid))
1167 return 0;
1168
a954b249
LP
1169 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1170 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1171 * case. */
81a2b7ce 1172
638fd8cc 1173 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
81a2b7ce 1174
a954b249
LP
1175 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1176 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1177 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1178 if (r < 0)
1179 return r;
81a2b7ce
LP
1180 }
1181
479050b3 1182 /* Second step: actually set the uids */
81a2b7ce
LP
1183 if (setresuid(uid, uid, uid) < 0)
1184 return -errno;
1185
a954b249
LP
1186 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1187 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1188 * outside of this call. */
81a2b7ce
LP
1189 return 0;
1190}
1191
349cc4a5 1192#if HAVE_PAM
5b6319dc
LP
1193
1194static int null_conv(
1195 int num_msg,
1196 const struct pam_message **msg,
1197 struct pam_response **resp,
1198 void *appdata_ptr) {
1199
1200 /* We don't support conversations */
1201
1202 return PAM_CONV_ERR;
1203}
1204
cefc33ae
LP
1205#endif
1206
5b6319dc
LP
/* Opens a PAM session for service 'name' and user 'user', and forks off a helper process
 * ("(sd-pam)") that tears the session down again once this process is gone.
 *
 * 'tty' may be NULL, in which case the TTY is derived from stdin if stdin is one. The entries of
 * *env are exported to PAM before the session is opened; on success *env is replaced by the
 * environment list PAM returns. 'fds' (n_fds entries) are closed in the helper process.
 *
 * Returns a negative errno-style value on failure (-EPERM for PAM-level failures). On success the
 * result of strv_free_and_replace() is returned. When built without PAM support this always
 * returns 0 and does nothing. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **pam_env = NULL;
        pam_handle_t *pamh = NULL;
        sigset_t saved_mask;
        int pam_code = PAM_SUCCESS, r;
        bool session_open = false;
        pid_t pam_pid = 0, parent_pid;
        int pam_flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child will then stay around until
         * killed via PR_SET_PDEATHSIG or systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual daemon. We do things this way
         * to ensure that the main PID of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we log at debug level. */
        if (log_get_max_level() < LOG_DEBUG)
                pam_flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &pamh);
        if (pam_code != PAM_SUCCESS) {
                pamh = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *stdin_tty = NULL;

                /* No TTY was explicitly passed, but an fd passed to us directly might be a TTY.
                 * Let's figure out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &stdin_tty) >= 0)
                        tty = strjoina("/dev/", stdin_tty);
        }

        if (tty) {
                pam_code = pam_set_item(pamh, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller's environment to the PAM stack. */
        STRV_FOREACH(assignment, *env) {
                pam_code = pam_putenv(pamh, *assignment);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is not fatal, it is only logged. */
        pam_code = pam_setcred(pamh, PAM_ESTABLISH_CRED | pam_flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(pamh, pam_code));

        pam_code = pam_open_session(pamh, pam_flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        session_open = true;

        pam_env = pam_getenvlist(pamh);
        if (!pam_env) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &saved_mask, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int received, exit_code = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that
                 * otherwise only those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect
                 * sd-pam threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid()
                 * succeeds, otherwise the kernel will not allow unprivileged parents kill their
                 * privileged children this way. We rely on the control groups kill logic to do the
                 * rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding
                 * dropping privileges. Otherwise, unit setup might race against our setresuid(2)
                 * call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure
                 * here. */
                (void) barrier_place(&barrier);

                /* Only wait for SIGTERM if our parent is still around. */
                if (getppid() == parent_pid) {
                        sigset_t term_set;

                        assert_se(sigemptyset(&term_set) >= 0);
                        assert_se(sigaddset(&term_set, SIGTERM) >= 0);

                        for (;;) {
                                if (sigwait(&term_set, &received) >= 0) {
                                        assert(received == SIGTERM);
                                        break;
                                }

                                if (errno != EINTR)
                                        goto child_finish;
                        }
                }

                pam_code = pam_setcred(pamh, PAM_DELETE_CRED | pam_flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(pamh, pam_flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                exit_code = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let
                 * the module know about this. See pam_end(3) */
                (void) pam_end(pamh, pam_code | pam_flags | PAM_DATA_SILENT);
                _exit(exit_code);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the
         * handle here. */
        pamh = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &saved_mask, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we
         * don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, pam_env);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(pamh, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (pamh) {
                if (session_open)
                        pam_code = pam_close_session(pamh, pam_flags);

                (void) pam_end(pamh, pam_code | pam_flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1423
5d6b1584 1424static void rename_process_from_path(const char *path) {
a99626c1 1425 _cleanup_free_ char *buf = NULL;
5d6b1584 1426 const char *p;
5d6b1584 1427
a99626c1
LP
1428 assert(path);
1429
1430 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1431 * /bin/ps */
5d6b1584 1432
a99626c1 1433 if (path_extract_filename(path, &buf) < 0) {
5d6b1584
LP
1434 rename_process("(...)");
1435 return;
1436 }
1437
a99626c1 1438 size_t l = strlen(buf);
5d6b1584 1439 if (l > 8) {
a99626c1 1440 /* The end of the process name is usually more interesting, since the first bit might just be
5d6b1584 1441 * "systemd-" */
a99626c1 1442 p = buf + l - 8;
5d6b1584 1443 l = 8;
a99626c1
LP
1444 } else
1445 p = buf;
5d6b1584 1446
a99626c1 1447 char process_name[11];
5d6b1584
LP
1448 process_name[0] = '(';
1449 memcpy(process_name+1, p, l);
1450 process_name[1+l] = ')';
1451 process_name[1+l+1] = 0;
1452
1453 rename_process(process_name);
1454}
1455
469830d1
LP
1456static bool context_has_address_families(const ExecContext *c) {
1457 assert(c);
1458
6b000af4 1459 return c->address_families_allow_list ||
469830d1
LP
1460 !set_isempty(c->address_families);
1461}
1462
1463static bool context_has_syscall_filters(const ExecContext *c) {
1464 assert(c);
1465
6b000af4 1466 return c->syscall_allow_list ||
8cfa775f 1467 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1468}
1469
9df2cdd8
TM
1470static bool context_has_syscall_logs(const ExecContext *c) {
1471 assert(c);
1472
1473 return c->syscall_log_allow_list ||
1474 !hashmap_isempty(c->syscall_log);
1475}
1476
469830d1
LP
1477static bool context_has_no_new_privileges(const ExecContext *c) {
1478 assert(c);
1479
1480 if (c->no_new_privileges)
1481 return true;
1482
26c45a6c 1483 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
469830d1
LP
1484 return false;
1485
1486 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1487 return c->lock_personality ||
469830d1 1488 c->memory_deny_write_execute ||
0538d2a8 1489 c->private_devices ||
fc64760d 1490 c->protect_clock ||
0538d2a8 1491 c->protect_hostname ||
469830d1
LP
1492 c->protect_kernel_tunables ||
1493 c->protect_kernel_modules ||
84703040 1494 c->protect_kernel_logs ||
0538d2a8
YW
1495 context_has_address_families(c) ||
1496 exec_context_restrict_namespaces_set(c) ||
1497 c->restrict_realtime ||
1498 c->restrict_suid_sgid ||
78e864e5 1499 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1500 context_has_syscall_filters(c) ||
1501 context_has_syscall_logs(c);
469830d1
LP
1502}
1503
349cc4a5 1504#if HAVE_SECCOMP
17df7223 1505
83f12b27 1506static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1507
1508 if (is_seccomp_available())
1509 return false;
1510
f673b62d 1511 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1512 return true;
83f12b27
FS
1513}
1514
165a31c0 1515static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1516 uint32_t negative_action, default_action, action;
165a31c0 1517 int r;
8351ceae 1518
469830d1 1519 assert(u);
c0467cf3 1520 assert(c);
8351ceae 1521
469830d1 1522 if (!context_has_syscall_filters(c))
83f12b27
FS
1523 return 0;
1524
469830d1
LP
1525 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1526 return 0;
e9642be2 1527
005bfaf1 1528 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1529
6b000af4 1530 if (c->syscall_allow_list) {
469830d1
LP
1531 default_action = negative_action;
1532 action = SCMP_ACT_ALLOW;
7c66bae2 1533 } else {
469830d1
LP
1534 default_action = SCMP_ACT_ALLOW;
1535 action = negative_action;
57183d11 1536 }
8351ceae 1537
165a31c0 1538 if (needs_ambient_hack) {
6b000af4 1539 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1540 if (r < 0)
1541 return r;
1542 }
1543
b54f36c6 1544 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1545}
1546
9df2cdd8
TM
1547static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1548#ifdef SCMP_ACT_LOG
1549 uint32_t default_action, action;
1550#endif
1551
1552 assert(u);
1553 assert(c);
1554
1555 if (!context_has_syscall_logs(c))
1556 return 0;
1557
1558#ifdef SCMP_ACT_LOG
1559 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1560 return 0;
1561
1562 if (c->syscall_log_allow_list) {
1563 /* Log nothing but the ones listed */
1564 default_action = SCMP_ACT_ALLOW;
1565 action = SCMP_ACT_LOG;
1566 } else {
1567 /* Log everything but the ones listed */
1568 default_action = SCMP_ACT_LOG;
1569 action = SCMP_ACT_ALLOW;
1570 }
1571
1572 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1573#else
1574 /* old libseccomp */
1575 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1576 return 0;
1577#endif
1578}
1579
469830d1
LP
1580static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1581 assert(u);
4298d0b5
LP
1582 assert(c);
1583
469830d1 1584 if (set_isempty(c->syscall_archs))
83f12b27
FS
1585 return 0;
1586
469830d1
LP
1587 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1588 return 0;
4298d0b5 1589
469830d1
LP
1590 return seccomp_restrict_archs(c->syscall_archs);
1591}
4298d0b5 1592
469830d1
LP
1593static int apply_address_families(const Unit* u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
4298d0b5 1596
469830d1
LP
1597 if (!context_has_address_families(c))
1598 return 0;
4298d0b5 1599
469830d1
LP
1600 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1601 return 0;
4298d0b5 1602
6b000af4 1603 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1604}
4298d0b5 1605
83f12b27 1606static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
7a114ed4
TM
1607 int r;
1608
469830d1 1609 assert(u);
f3e43635
TM
1610 assert(c);
1611
469830d1 1612 if (!c->memory_deny_write_execute)
83f12b27
FS
1613 return 0;
1614
7a114ed4
TM
1615 /* use prctl() if kernel supports it (6.3) */
1616 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1617 if (r == 0) {
1618 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1619 return 0;
1620 }
1621 if (r < 0 && errno != EINVAL)
1622 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1623 /* else use seccomp */
1624 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1625
469830d1
LP
1626 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1627 return 0;
f3e43635 1628
469830d1 1629 return seccomp_memory_deny_write_execute();
f3e43635
TM
1630}
1631
83f12b27 1632static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1633 assert(u);
f4170c67
LP
1634 assert(c);
1635
469830d1 1636 if (!c->restrict_realtime)
83f12b27
FS
1637 return 0;
1638
469830d1
LP
1639 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1640 return 0;
f4170c67 1641
469830d1 1642 return seccomp_restrict_realtime();
f4170c67
LP
1643}
1644
f69567cb
LP
1645static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1646 assert(u);
1647 assert(c);
1648
1649 if (!c->restrict_suid_sgid)
1650 return 0;
1651
1652 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1653 return 0;
1654
1655 return seccomp_restrict_suid_sgid();
1656}
1657
59e856c7 1658static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1659 assert(u);
59eeb84b
LP
1660 assert(c);
1661
1662 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1663 * let's protect even those systems where this is left on in the kernel. */
1664
469830d1 1665 if (!c->protect_kernel_tunables)
59eeb84b
LP
1666 return 0;
1667
469830d1
LP
1668 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1669 return 0;
59eeb84b 1670
469830d1 1671 return seccomp_protect_sysctl();
59eeb84b
LP
1672}
1673
59e856c7 1674static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1675 assert(u);
502d704e
DH
1676 assert(c);
1677
25a8d8a0 1678 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1679
469830d1
LP
1680 if (!c->protect_kernel_modules)
1681 return 0;
1682
502d704e
DH
1683 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1684 return 0;
1685
b54f36c6 1686 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1687}
1688
84703040
KK
1689static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1690 assert(u);
1691 assert(c);
1692
1693 if (!c->protect_kernel_logs)
1694 return 0;
1695
1696 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1697 return 0;
1698
1699 return seccomp_protect_syslog();
1700}
1701
daf8f72b 1702static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1703 assert(u);
1704 assert(c);
1705
1706 if (!c->protect_clock)
1707 return 0;
1708
1709 if (skip_seccomp_unavailable(u, "ProtectClock="))
1710 return 0;
1711
1712 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1713}
1714
59e856c7 1715static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1716 assert(u);
ba128bb8
LP
1717 assert(c);
1718
8f81a5f6 1719 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1720
469830d1
LP
1721 if (!c->private_devices)
1722 return 0;
1723
ba128bb8
LP
1724 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1725 return 0;
1726
b54f36c6 1727 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1728}
1729
34cf6c43 1730static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1731 assert(u);
add00535
LP
1732 assert(c);
1733
1734 if (!exec_context_restrict_namespaces_set(c))
1735 return 0;
1736
1737 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1738 return 0;
1739
1740 return seccomp_restrict_namespaces(c->restrict_namespaces);
1741}
1742
78e864e5 1743static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1744 unsigned long personality;
1745 int r;
78e864e5
TM
1746
1747 assert(u);
1748 assert(c);
1749
1750 if (!c->lock_personality)
1751 return 0;
1752
1753 if (skip_seccomp_unavailable(u, "LockPersonality="))
1754 return 0;
1755
e8132d63
LP
1756 personality = c->personality;
1757
1758 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1759 if (personality == PERSONALITY_INVALID) {
1760
1761 r = opinionated_personality(&personality);
1762 if (r < 0)
1763 return r;
1764 }
78e864e5
TM
1765
1766 return seccomp_lock_personality(personality);
1767}
1768
c0467cf3 1769#endif
8351ceae 1770
7a8288f6 1771#if HAVE_LIBBPF
7a8288f6
DM
1772static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1773 assert(u);
1774 assert(c);
1775
1776 if (!exec_context_restrict_filesystems_set(c))
1777 return 0;
1778
46004616
ZJS
1779 if (!u->manager->restrict_fs) {
1780 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1781 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1782 return 0;
46004616 1783 }
7a8288f6
DM
1784
1785 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1786}
1787#endif
1788
daf8f72b 1789static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1790 assert(u);
1791 assert(c);
1792
1793 if (!c->protect_hostname)
1794 return 0;
1795
1796 if (ns_type_supported(NAMESPACE_UTS)) {
1797 if (unshare(CLONE_NEWUTS) < 0) {
1798 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1799 *ret_exit_status = EXIT_NAMESPACE;
1800 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1801 }
1802
1803 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1804 }
1805 } else
1806 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1807
1808#if HAVE_SECCOMP
8f3e342f
ZJS
1809 int r;
1810
daf8f72b
LP
1811 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1812 return 0;
1813
1814 r = seccomp_protect_hostname();
1815 if (r < 0) {
1816 *ret_exit_status = EXIT_SECCOMP;
1817 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1818 }
1819#endif
1820
1821 return 0;
1822}
1823
3042bbeb 1824static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1825 assert(idle_pipe);
1826
54eb2300
LP
1827 idle_pipe[1] = safe_close(idle_pipe[1]);
1828 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1829
1830 if (idle_pipe[0] >= 0) {
1831 int r;
1832
1833 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1834
1835 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1836 ssize_t n;
1837
31a7eb86 1838 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1839 n = write(idle_pipe[3], "x", 1);
1840 if (n > 0)
cd972d69 1841 /* Wait for systemd to react to the signal above. */
54756dce 1842 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1843 }
1844
54eb2300 1845 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1846
1847 }
1848
54eb2300 1849 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1850}
1851
fb2042dd
YW
1852static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1853
7cae38c4 1854static int build_environment(
34cf6c43 1855 const Unit *u,
9fa95f85 1856 const ExecContext *c,
1e22b5cd 1857 const ExecParameters *p,
6bb00842 1858 const CGroupContext *cgroup_context,
da6053d0 1859 size_t n_fds,
cd48e23f 1860 char **fdnames,
7cae38c4
LP
1861 const char *home,
1862 const char *username,
1863 const char *shell,
7bce046b
LP
1864 dev_t journal_stream_dev,
1865 ino_t journal_stream_ino,
6bb00842 1866 const char *memory_pressure_path,
7cae38c4
LP
1867 char ***ret) {
1868
1869 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1870 size_t n_env = 0;
7cae38c4 1871 char *x;
4d62ee55 1872 int r;
7cae38c4 1873
4b58153d 1874 assert(u);
7cae38c4 1875 assert(c);
7c1cb6f1 1876 assert(p);
7cae38c4
LP
1877 assert(ret);
1878
6bb00842 1879#define N_ENV_VARS 19
8d5bb13d 1880 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1881 if (!our_env)
1882 return -ENOMEM;
1883
1884 if (n_fds > 0) {
8dd4c05b
LP
1885 _cleanup_free_ char *joined = NULL;
1886
df0ff127 1887 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1888 return -ENOMEM;
1889 our_env[n_env++] = x;
1890
da6053d0 1891 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1892 return -ENOMEM;
1893 our_env[n_env++] = x;
8dd4c05b 1894
cd48e23f 1895 joined = strv_join(fdnames, ":");
8dd4c05b
LP
1896 if (!joined)
1897 return -ENOMEM;
1898
605405c6 1899 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1900 if (!x)
1901 return -ENOMEM;
1902 our_env[n_env++] = x;
7cae38c4
LP
1903 }
1904
b08af3b1 1905 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1906 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1907 return -ENOMEM;
1908 our_env[n_env++] = x;
1909
1e22b5cd 1910 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1911 return -ENOMEM;
1912 our_env[n_env++] = x;
1913 }
1914
de90700f
LP
1915 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1916 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1917 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1918 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1919 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1920 if (!x)
1921 return -ENOMEM;
1922 our_env[n_env++] = x;
1923 }
1924
7cae38c4 1925 if (home) {
b910cc72 1926 x = strjoin("HOME=", home);
7cae38c4
LP
1927 if (!x)
1928 return -ENOMEM;
7bbead1d 1929
4ff361cc 1930 path_simplify(x + 5);
7cae38c4
LP
1931 our_env[n_env++] = x;
1932 }
1933
1934 if (username) {
b910cc72 1935 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1936 if (!x)
1937 return -ENOMEM;
1938 our_env[n_env++] = x;
1939
b910cc72 1940 x = strjoin("USER=", username);
7cae38c4
LP
1941 if (!x)
1942 return -ENOMEM;
1943 our_env[n_env++] = x;
1944 }
1945
1946 if (shell) {
b910cc72 1947 x = strjoin("SHELL=", shell);
7cae38c4
LP
1948 if (!x)
1949 return -ENOMEM;
7bbead1d 1950
4ff361cc 1951 path_simplify(x + 6);
7cae38c4
LP
1952 our_env[n_env++] = x;
1953 }
1954
4b58153d
LP
1955 if (!sd_id128_is_null(u->invocation_id)) {
1956 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1957 return -ENOMEM;
1958
1959 our_env[n_env++] = x;
1960 }
1961
6af760f3 1962 if (exec_context_needs_term(c)) {
4d62ee55 1963 _cleanup_free_ char *cmdline = NULL;
6af760f3
LP
1964 const char *tty_path, *term = NULL;
1965
1966 tty_path = exec_context_tty_path(c);
1967
e8cf09b2
LP
1968 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1969 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1970 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1971
e8cf09b2 1972 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1973 term = getenv("TERM");
4d62ee55
DDM
1974 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1975 _cleanup_free_ char *key = NULL;
1976
1977 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1978 if (!key)
1979 return -ENOMEM;
1980
1981 r = proc_cmdline_get_key(key, 0, &cmdline);
1982 if (r < 0)
1983 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1984 else if (r > 0)
1985 term = cmdline;
1986 }
e8cf09b2 1987
6af760f3
LP
1988 if (!term)
1989 term = default_term_for_tty(tty_path);
7cae38c4 1990
b910cc72 1991 x = strjoin("TERM=", term);
7cae38c4
LP
1992 if (!x)
1993 return -ENOMEM;
1994 our_env[n_env++] = x;
1995 }
1996
7bce046b
LP
1997 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1998 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1999 return -ENOMEM;
2000
2001 our_env[n_env++] = x;
2002 }
2003
91dd5f7c
LP
2004 if (c->log_namespace) {
2005 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2006 if (!x)
2007 return -ENOMEM;
2008
2009 our_env[n_env++] = x;
2010 }
2011
5b10116e 2012 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 2013 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
2014 const char *n;
2015
2016 if (!p->prefix[t])
2017 continue;
2018
211a3d87 2019 if (c->directories[t].n_items == 0)
fb2042dd
YW
2020 continue;
2021
2022 n = exec_directory_env_name_to_string(t);
2023 if (!n)
2024 continue;
2025
211a3d87
LB
2026 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2027 _cleanup_free_ char *prefixed = NULL;
fb2042dd 2028
211a3d87
LB
2029 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2030 if (!prefixed)
2031 return -ENOMEM;
2032
2033 if (!strextend_with_separator(&joined, ":", prefixed))
2034 return -ENOMEM;
2035 }
fb2042dd
YW
2036
2037 x = strjoin(n, "=", joined);
2038 if (!x)
2039 return -ENOMEM;
2040
2041 our_env[n_env++] = x;
2042 }
2043
73ff4d48
YW
2044 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
2045 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
bb0c0d6f
LP
2046 if (!x)
2047 return -ENOMEM;
2048
2049 our_env[n_env++] = x;
2050 }
2051
dc4e2940
YW
2052 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2053 return -ENOMEM;
2054
2055 our_env[n_env++] = x;
2056
6bb00842
LP
2057 if (memory_pressure_path) {
2058 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2059 if (!x)
2060 return -ENOMEM;
2061
2062 our_env[n_env++] = x;
2063
2064 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2065 _cleanup_free_ char *b = NULL, *e = NULL;
2066
2067 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2068 MEMORY_PRESSURE_DEFAULT_TYPE,
2069 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2070 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2071 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2072 return -ENOMEM;
2073
2074 if (base64mem(b, strlen(b) + 1, &e) < 0)
2075 return -ENOMEM;
2076
2077 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2078 if (!x)
2079 return -ENOMEM;
2080
2081 our_env[n_env++] = x;
2082 }
2083 }
2084
2085 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
8d5bb13d 2086#undef N_ENV_VARS
7cae38c4 2087
ae2a15bc 2088 *ret = TAKE_PTR(our_env);
7cae38c4
LP
2089
2090 return 0;
2091}
2092
b4c14404
FB
2093static int build_pass_environment(const ExecContext *c, char ***ret) {
2094 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2095 size_t n_env = 0;
b4c14404
FB
2096
2097 STRV_FOREACH(i, c->pass_environment) {
2098 _cleanup_free_ char *x = NULL;
2099 char *v;
2100
2101 v = getenv(*i);
2102 if (!v)
2103 continue;
605405c6 2104 x = strjoin(*i, "=", v);
b4c14404
FB
2105 if (!x)
2106 return -ENOMEM;
00819cc1 2107
319a4f4b 2108 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2109 return -ENOMEM;
00819cc1 2110
1cc6c93a 2111 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2112 pass_env[n_env] = NULL;
b4c14404
FB
2113 }
2114
ae2a15bc 2115 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2116
2117 return 0;
2118}
2119
fbbb9697
YW
2120bool exec_needs_network_namespace(const ExecContext *context) {
2121 assert(context);
2122
2123 return context->private_network || context->network_namespace_path;
2124}
2125
9c0c6701
DDM
2126static bool exec_needs_ephemeral(const ExecContext *context) {
2127 return (context->root_image || context->root_directory) && context->root_ephemeral;
2128}
2129
fde36d25
YW
2130static bool exec_needs_ipc_namespace(const ExecContext *context) {
2131 assert(context);
2132
2133 return context->private_ipc || context->ipc_namespace_path;
2134}
2135
5e8deb94 2136bool exec_needs_mount_namespace(
8b44a3d2
LP
2137 const ExecContext *context,
2138 const ExecParameters *params,
28135da3 2139 const ExecRuntime *runtime) {
8b44a3d2
LP
2140
2141 assert(context);
8b44a3d2 2142
915e6d16
LP
2143 if (context->root_image)
2144 return true;
2145
2a624c36
AP
2146 if (!strv_isempty(context->read_write_paths) ||
2147 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2148 !strv_isempty(context->inaccessible_paths) ||
2149 !strv_isempty(context->exec_paths) ||
2150 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2151 return true;
2152
42b1d8e0 2153 if (context->n_bind_mounts > 0)
d2d6c096
LP
2154 return true;
2155
2abd4e38
YW
2156 if (context->n_temporary_filesystems > 0)
2157 return true;
2158
b3d13314
LB
2159 if (context->n_mount_images > 0)
2160 return true;
2161
93f59701
LB
2162 if (context->n_extension_images > 0)
2163 return true;
2164
a07b9926
LB
2165 if (!strv_isempty(context->extension_directories))
2166 return true;
2167
874cdcbc 2168 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
8b44a3d2
LP
2169 return true;
2170
28135da3 2171 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
8b44a3d2
LP
2172 return true;
2173
8b44a3d2 2174 if (context->private_devices ||
24002121 2175 context->private_mounts > 0 ||
c2da3bf2 2176 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
8b44a3d2 2177 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2178 context->protect_home != PROTECT_HOME_NO ||
2179 context->protect_kernel_tunables ||
c575770b 2180 context->protect_kernel_modules ||
94a7b275 2181 context->protect_kernel_logs ||
4e399953
LP
2182 context->protect_control_groups ||
2183 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44 2184 context->proc_subset != PROC_SUBSET_ALL ||
fde36d25 2185 exec_needs_ipc_namespace(context))
8b44a3d2
LP
2186 return true;
2187
37c56f89 2188 if (context->root_directory) {
5e98086d 2189 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2190 return true;
2191
5b10116e 2192 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2193 if (params && !params->prefix[t])
37c56f89
YW
2194 continue;
2195
211a3d87 2196 if (context->directories[t].n_items > 0)
37c56f89
YW
2197 return true;
2198 }
2199 }
5d997827 2200
42b1d8e0 2201 if (context->dynamic_user &&
211a3d87
LB
2202 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2203 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2204 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2205 return true;
2206
91dd5f7c
LP
2207 if (context->log_namespace)
2208 return true;
2209
8b44a3d2
LP
2210 return false;
2211}
2212
5749f855 2213static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d 2214 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
19ee48a6 2215 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
254d1313 2216 _cleanup_close_ int unshare_ready_fd = -EBADF;
d251207d
LP
2217 _cleanup_(sigkill_waitp) pid_t pid = 0;
2218 uint64_t c = 1;
d251207d
LP
2219 ssize_t n;
2220 int r;
2221
5749f855
AZ
2222 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2223 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2224 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2225 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2226 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2227 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2228 * continues execution normally.
2229 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2230 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2231
5749f855 2232 /* Can only set up multiple mappings with CAP_SETUID. */
26c45a6c 2233 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
587ab01b 2234 r = asprintf(&uid_map,
5749f855 2235 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2236 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2237 ouid, ouid, uid, uid);
2238 else
2239 r = asprintf(&uid_map,
2240 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2241 ouid, ouid);
d251207d 2242
5749f855
AZ
2243 if (r < 0)
2244 return -ENOMEM;
2245
2246 /* Can only set up multiple mappings with CAP_SETGID. */
26c45a6c 2247 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
587ab01b 2248 r = asprintf(&gid_map,
5749f855 2249 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2250 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2251 ogid, ogid, gid, gid);
2252 else
2253 r = asprintf(&gid_map,
2254 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2255 ogid, ogid);
2256
2257 if (r < 0)
2258 return -ENOMEM;
d251207d
LP
2259
2260 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2261 * namespace. */
2262 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2263 if (unshare_ready_fd < 0)
2264 return -errno;
2265
2266 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2267 * failed. */
2268 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2269 return -errno;
2270
4c253ed1
LP
2271 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2272 if (r < 0)
2273 return r;
2274 if (r == 0) {
254d1313 2275 _cleanup_close_ int fd = -EBADF;
d251207d
LP
2276 const char *a;
2277 pid_t ppid;
2278
2279 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2280 * here, after the parent opened its own user namespace. */
2281
2282 ppid = getppid();
2283 errno_pipe[0] = safe_close(errno_pipe[0]);
2284
2285 /* Wait until the parent unshared the user namespace */
2286 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2287 r = -errno;
2288 goto child_fail;
2289 }
2290
2291 /* Disable the setgroups() system call in the child user namespace, for good. */
2292 a = procfs_file_alloca(ppid, "setgroups");
2293 fd = open(a, O_WRONLY|O_CLOEXEC);
2294 if (fd < 0) {
2295 if (errno != ENOENT) {
2296 r = -errno;
2297 goto child_fail;
2298 }
2299
2300 /* If the file is missing the kernel is too old, let's continue anyway. */
2301 } else {
2302 if (write(fd, "deny\n", 5) < 0) {
2303 r = -errno;
2304 goto child_fail;
2305 }
2306
2307 fd = safe_close(fd);
2308 }
2309
2310 /* First write the GID map */
2311 a = procfs_file_alloca(ppid, "gid_map");
2312 fd = open(a, O_WRONLY|O_CLOEXEC);
2313 if (fd < 0) {
2314 r = -errno;
2315 goto child_fail;
2316 }
2317 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2318 r = -errno;
2319 goto child_fail;
2320 }
2321 fd = safe_close(fd);
2322
2323 /* The write the UID map */
2324 a = procfs_file_alloca(ppid, "uid_map");
2325 fd = open(a, O_WRONLY|O_CLOEXEC);
2326 if (fd < 0) {
2327 r = -errno;
2328 goto child_fail;
2329 }
2330 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2331 r = -errno;
2332 goto child_fail;
2333 }
2334
2335 _exit(EXIT_SUCCESS);
2336
2337 child_fail:
2338 (void) write(errno_pipe[1], &r, sizeof(r));
2339 _exit(EXIT_FAILURE);
2340 }
2341
2342 errno_pipe[1] = safe_close(errno_pipe[1]);
2343
2344 if (unshare(CLONE_NEWUSER) < 0)
2345 return -errno;
2346
2347 /* Let the child know that the namespace is ready now */
2348 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2349 return -errno;
2350
2351 /* Try to read an error code from the child */
2352 n = read(errno_pipe[0], &r, sizeof(r));
2353 if (n < 0)
2354 return -errno;
2355 if (n == sizeof(r)) { /* an error code was sent to us */
2356 if (r < 0)
2357 return r;
2358 return -EIO;
2359 }
2360 if (n != 0) /* on success we should have read 0 bytes */
2361 return -EIO;
2362
8f03de53 2363 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2364 if (r < 0)
2365 return r;
2e87a1fd 2366 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2367 return -EIO;
2368
2369 return 0;
2370}
2371
494d0247 2372static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
e43911a7
LP
2373 assert(context);
2374
494d0247
YW
2375 if (!context->dynamic_user)
2376 return false;
2377
2378 if (type == EXEC_DIRECTORY_CONFIGURATION)
2379 return false;
2380
2381 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2382 return false;
2383
2384 return true;
2385}
2386
211a3d87
LB
2387static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2388 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2389 int r;
2390
2391 assert(source);
2392
2393 src_abs = path_join(root, source);
2394 if (!src_abs)
2395 return -ENOMEM;
2396
2397 STRV_FOREACH(dst, symlinks) {
2398 _cleanup_free_ char *dst_abs = NULL;
2399
2400 dst_abs = path_join(root, *dst);
2401 if (!dst_abs)
2402 return -ENOMEM;
2403
2404 r = mkdir_parents_label(dst_abs, 0755);
2405 if (r < 0)
2406 return r;
2407
2408 r = symlink_idempotent(src_abs, dst_abs, true);
2409 if (r < 0)
2410 return r;
2411 }
2412
2413 return 0;
2414}
2415
3536f49e 2416static int setup_exec_directory(
59dd2bbb 2417 Unit *u,
07689d5d
LP
2418 const ExecContext *context,
2419 const ExecParameters *params,
2420 uid_t uid,
3536f49e 2421 gid_t gid,
3536f49e 2422 ExecDirectoryType type,
211a3d87 2423 bool needs_mount_namespace,
3536f49e 2424 int *exit_status) {
07689d5d 2425
72fd1768 2426 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2427 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2428 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2429 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2430 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2431 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2432 };
07689d5d
LP
2433 int r;
2434
2435 assert(context);
2436 assert(params);
72fd1768 2437 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2438 assert(exit_status);
07689d5d 2439
3536f49e
YW
2440 if (!params->prefix[type])
2441 return 0;
2442
8679efde 2443 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2444 if (!uid_is_valid(uid))
2445 uid = 0;
2446 if (!gid_is_valid(gid))
2447 gid = 0;
2448 }
2449
211a3d87 2450 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2451 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2452
211a3d87 2453 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2454 if (!p) {
2455 r = -ENOMEM;
2456 goto fail;
2457 }
07689d5d 2458
23a7448e
YW
2459 r = mkdir_parents_label(p, 0755);
2460 if (r < 0)
3536f49e 2461 goto fail;
23a7448e 2462
f9c91932
LP
2463 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2464
2465 /* If we are in user mode, and a configuration directory exists but a state directory
2466 * doesn't exist, then we likely are upgrading from an older systemd version that
2467 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2468 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2469 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
627cdcc7 2470 * separated. If a service has both dirs configured but only the configuration dir
f9c91932
LP
2471 * exists and the state dir does not, we assume we are looking at an update
2472 * situation. Hence, create a compatibility symlink, so that all expectations are
2473 * met.
2474 *
2475 * (We also do something similar with the log directory, which still doesn't exist in
2476 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2477
2478 /* this assumes the state dir is always created before the configuration dir */
2479 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2480 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2481
2482 r = laccess(p, F_OK);
2483 if (r == -ENOENT) {
2484 _cleanup_free_ char *q = NULL;
2485
2486 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2487 * under the configuration hierarchy. */
2488
2489 if (type == EXEC_DIRECTORY_STATE)
2490 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2491 else if (type == EXEC_DIRECTORY_LOGS)
2492 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2493 else
2494 assert_not_reached();
2495 if (!q) {
2496 r = -ENOMEM;
2497 goto fail;
2498 }
2499
2500 r = laccess(q, F_OK);
2501 if (r >= 0) {
2502 /* It does exist! This hence looks like an update. Symlink the
2503 * configuration directory into the state directory. */
2504
2505 r = symlink_idempotent(q, p, /* make_relative= */ true);
2506 if (r < 0)
2507 goto fail;
2508
59dd2bbb 2509 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
f9c91932
LP
2510 continue;
2511 } else if (r != -ENOENT)
59dd2bbb 2512 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
f9c91932
LP
2513
2514 } else if (r < 0)
59dd2bbb 2515 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
f9c91932
LP
2516 }
2517
494d0247 2518 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2519 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2520 * case we want to avoid leaving a directory around fully accessible that is owned by
2521 * a dynamic user whose UID is later on reused. To lock this down we use the same
2522 * trick used by container managers to prohibit host users to get access to files of
2523 * the same UID in containers: we place everything inside a directory that has an
2524 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2525 * for unprivileged host code. We then use fs namespacing to make this directory
2526 * permeable for the service itself.
6c47cd7d 2527 *
3f5b1508
LP
2528 * Specifically: for a service which wants a special directory "foo/" we first create
2529 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2530 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2531 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2532 * unprivileged host users can't look into it. Inside of the namespace of the unit
2533 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2534 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2535 * for the service and making sure it only gets access to the dirs it needs but no
2536 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2537 *
3f5b1508
LP
2538 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2539 * to be owned by the service itself.
2540 *
2541 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2542 * for sharing files or sockets with other services. */
6c47cd7d 2543
4ede9802
LP
2544 pp = path_join(params->prefix[type], "private");
2545 if (!pp) {
6c47cd7d
LP
2546 r = -ENOMEM;
2547 goto fail;
2548 }
2549
2550 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2551 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2552 if (r < 0)
2553 goto fail;
2554
211a3d87 2555 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2556 r = -ENOMEM;
2557 goto fail;
2558 }
2559
2560 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2561 r = mkdir_parents_label(pp, 0755);
2562 if (r < 0)
2563 goto fail;
2564
949befd3 2565 if (is_dir(p, false) > 0 &&
b93d24e0 2566 (laccess(pp, F_OK) == -ENOENT)) {
949befd3
LP
2567
2568 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2569 * it over. Most likely the service has been upgraded from one that didn't use
2570 * DynamicUser=1, to one that does. */
2571
59dd2bbb
LP
2572 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2573 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2574 exec_directory_type_to_string(type), p, pp);
cf52c45d 2575
db58f5de
LP
2576 r = RET_NERRNO(rename(p, pp));
2577 if (r < 0)
949befd3 2578 goto fail;
949befd3
LP
2579 } else {
2580 /* Otherwise, create the actual directory for the service */
2581
2582 r = mkdir_label(pp, context->directories[type].mode);
2583 if (r < 0 && r != -EEXIST)
2584 goto fail;
2585 }
6c47cd7d 2586
a2ab603c
YW
2587 if (!context->directories[type].items[i].only_create) {
2588 /* And link it up from the original place.
2589 * Notes
2590 * 1) If a mount namespace is going to be used, then this symlink remains on
2591 * the host, and a new one for the child namespace will be created later.
2592 * 2) It is not necessary to create this symlink when one of its parent
2593 * directories is specified and already created. E.g.
2594 * StateDirectory=foo foo/bar
2595 * In that case, the inode points to pp and p for "foo/bar" are the same:
2596 * pp = "/var/lib/private/foo/bar"
2597 * p = "/var/lib/foo/bar"
2598 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2599 * we do not need to create the symlink, but we cannot create the symlink.
2600 * See issue #24783. */
2601 r = symlink_idempotent(pp, p, true);
2602 if (r < 0)
2603 goto fail;
2604 }
6c47cd7d 2605
6c47cd7d 2606 } else {
5c6d40d1
LP
2607 _cleanup_free_ char *target = NULL;
2608
2609 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2610 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2611 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2612
2613 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2614 * by DynamicUser=1 (see above)?
2615 *
2616 * We do this for all directory types except for ConfigurationDirectory=,
2617 * since they all support the private/ symlink logic at least in some
2618 * configurations, see above. */
5c6d40d1 2619
f461a28d 2620 r = chase(target, NULL, 0, &target_resolved, NULL);
578dc69f
YW
2621 if (r < 0)
2622 goto fail;
2623
211a3d87 2624 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2625 if (!q) {
2626 r = -ENOMEM;
2627 goto fail;
2628 }
2629
578dc69f 2630 /* /var/lib or friends may be symlinks. So, let's chase them also. */
f461a28d 2631 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
578dc69f
YW
2632 if (r < 0)
2633 goto fail;
2634
2635 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2636
2637 /* Hmm, apparently DynamicUser= was once turned on for this service,
2638 * but is no longer. Let's move the directory back up. */
2639
59dd2bbb
LP
2640 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2641 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2642 exec_directory_type_to_string(type), q, p);
cf52c45d 2643
db58f5de
LP
2644 r = RET_NERRNO(unlink(p));
2645 if (r < 0)
5c6d40d1 2646 goto fail;
5c6d40d1 2647
db58f5de
LP
2648 r = RET_NERRNO(rename(q, p));
2649 if (r < 0)
5c6d40d1 2650 goto fail;
5c6d40d1
LP
2651 }
2652 }
2653
6c47cd7d 2654 r = mkdir_label(p, context->directories[type].mode);
d484580c 2655 if (r < 0) {
d484580c
LP
2656 if (r != -EEXIST)
2657 goto fail;
2658
206e9864
LP
2659 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2660 struct stat st;
2661
2662 /* Don't change the owner/access mode of the configuration directory,
2663 * as in the common case it is not written to by a service, and shall
2664 * not be writable. */
2665
db58f5de
LP
2666 r = RET_NERRNO(stat(p, &st));
2667 if (r < 0)
206e9864 2668 goto fail;
206e9864
LP
2669
2670 /* Still complain if the access mode doesn't match */
2671 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
59dd2bbb
LP
2672 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2673 "(File system: %o %sMode: %o)",
2674 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2675 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
206e9864 2676
6cff72eb 2677 continue;
206e9864 2678 }
6cff72eb 2679 }
a1164ae3 2680 }
07689d5d 2681
206e9864 2682 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2683 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2684 * current UID/GID ownership.) */
2685 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2686 if (r < 0)
2687 goto fail;
c71b2eb7 2688
f5bb36dc
LP
2689 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2690 * available to user code anyway */
2691 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2692 continue;
2693
607b358e
LP
2694 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2695 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2696 * assignments to exist. */
d5602c16 2697 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
07689d5d 2698 if (r < 0)
3536f49e 2699 goto fail;
07689d5d
LP
2700 }
2701
211a3d87
LB
2702 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2703 * they are set up later, to allow configuring empty var/run/etc. */
2704 if (!needs_mount_namespace)
2705 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2706 r = create_many_symlinks(params->prefix[type],
2707 context->directories[type].items[i].path,
2708 context->directories[type].items[i].symlinks);
2709 if (r < 0)
2710 goto fail;
2711 }
2712
07689d5d 2713 return 0;
3536f49e
YW
2714
2715fail:
2716 *exit_status = exit_status_table[type];
3536f49e 2717 return r;
07689d5d
LP
2718}
2719
#if ENABLE_SMACK
/* Applies the SMACK process label to the calling process (PID 0 = ourselves).
 *
 * A label configured in the execution context takes precedence. Otherwise, if the manager carries
 * a default process label, the SMACK_ATTR_EXEC label read from the executable is used, with the
 * manager default as fallback when no such label was read. If neither is configured nothing is
 * done. Returns 0 on success, a negative errno-style error otherwise. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label)
                r = mac_smack_apply_pid(0, context->smack_process_label);
        else if (manager->default_smack_process_label) {
                _cleanup_free_ char *exec_label = NULL;

                /* An absent xattr is fine, then we simply fall back to the default label */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ?: manager->default_smack_process_label);
        } else
                return 0;

        return r < 0 ? r : 0;
}
#endif
cefc33ae 2749
6c47cd7d
LP
2750static int compile_bind_mounts(
2751 const ExecContext *context,
2752 const ExecParameters *params,
2753 BindMount **ret_bind_mounts,
da6053d0 2754 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2755 char ***ret_empty_directories) {
2756
2757 _cleanup_strv_free_ char **empty_directories = NULL;
ed8267c7 2758 BindMount *bind_mounts = NULL;
5b10116e 2759 size_t n, h = 0;
6c47cd7d
LP
2760 int r;
2761
2762 assert(context);
2763 assert(params);
2764 assert(ret_bind_mounts);
2765 assert(ret_n_bind_mounts);
2766 assert(ret_empty_directories);
2767
ed8267c7
DT
2768 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2769
6c47cd7d 2770 n = context->n_bind_mounts;
5b10116e 2771 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2772 if (!params->prefix[t])
2773 continue;
2774
a2ab603c
YW
2775 for (size_t i = 0; i < context->directories[t].n_items; i++)
2776 n += !context->directories[t].items[i].only_create;
6c47cd7d
LP
2777 }
2778
2779 if (n <= 0) {
2780 *ret_bind_mounts = NULL;
2781 *ret_n_bind_mounts = 0;
2782 *ret_empty_directories = NULL;
2783 return 0;
2784 }
2785
2786 bind_mounts = new(BindMount, n);
2787 if (!bind_mounts)
2788 return -ENOMEM;
2789
5b10116e 2790 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d 2791 BindMount *item = context->bind_mounts + i;
93404d34 2792 _cleanup_free_ char *s = NULL, *d = NULL;
6c47cd7d
LP
2793
2794 s = strdup(item->source);
ed8267c7
DT
2795 if (!s)
2796 return -ENOMEM;
6c47cd7d
LP
2797
2798 d = strdup(item->destination);
93404d34 2799 if (!d)
ed8267c7 2800 return -ENOMEM;
6c47cd7d
LP
2801
2802 bind_mounts[h++] = (BindMount) {
93404d34
DT
2803 .source = TAKE_PTR(s),
2804 .destination = TAKE_PTR(d),
6c47cd7d
LP
2805 .read_only = item->read_only,
2806 .recursive = item->recursive,
2807 .ignore_enoent = item->ignore_enoent,
2808 };
2809 }
2810
5b10116e 2811 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2812 if (!params->prefix[t])
2813 continue;
2814
211a3d87 2815 if (context->directories[t].n_items == 0)
6c47cd7d
LP
2816 continue;
2817
494d0247 2818 if (exec_directory_is_private(context, t) &&
74e12520 2819 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
2820 char *private_root;
2821
2822 /* So this is for a dynamic user, and we need to make sure the process can access its own
2823 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2824 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2825
657ee2d8 2826 private_root = path_join(params->prefix[t], "private");
ed8267c7
DT
2827 if (!private_root)
2828 return -ENOMEM;
6c47cd7d
LP
2829
2830 r = strv_consume(&empty_directories, private_root);
a635a7ae 2831 if (r < 0)
ed8267c7 2832 return r;
6c47cd7d
LP
2833 }
2834
211a3d87 2835 for (size_t i = 0; i < context->directories[t].n_items; i++) {
93404d34 2836 _cleanup_free_ char *s = NULL, *d = NULL;
6c47cd7d 2837
a2ab603c
YW
2838 /* When one of the parent directories is in the list, we cannot create the symlink
2839 * for the child directory. See also the comments in setup_exec_directory(). */
2840 if (context->directories[t].items[i].only_create)
2841 continue;
2842
494d0247 2843 if (exec_directory_is_private(context, t))
211a3d87 2844 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 2845 else
211a3d87 2846 s = path_join(params->prefix[t], context->directories[t].items[i].path);
ed8267c7
DT
2847 if (!s)
2848 return -ENOMEM;
6c47cd7d 2849
494d0247 2850 if (exec_directory_is_private(context, t) &&
74e12520 2851 exec_context_with_rootfs(context))
5609f688
YW
2852 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2853 * directory is not created on the root directory. So, let's bind-mount the directory
2854 * on the 'non-private' place. */
211a3d87 2855 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
2856 else
2857 d = strdup(s);
93404d34 2858 if (!d)
ed8267c7 2859 return -ENOMEM;
6c47cd7d
LP
2860
2861 bind_mounts[h++] = (BindMount) {
93404d34
DT
2862 .source = TAKE_PTR(s),
2863 .destination = TAKE_PTR(d),
6c47cd7d 2864 .read_only = false,
9ce4e4b0 2865 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
2866 .recursive = true,
2867 .ignore_enoent = false,
2868 };
2869 }
2870 }
2871
2872 assert(h == n);
2873
ed8267c7 2874 *ret_bind_mounts = TAKE_PTR(bind_mounts);
6c47cd7d 2875 *ret_n_bind_mounts = n;
ae2a15bc 2876 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
2877
2878 return (int) n;
6c47cd7d
LP
2879}
2880
df61e79a
LB
2881/* ret_symlinks will contain a list of pairs src:dest that describes
2882 * the symlinks to create later on. For example, the symlinks needed
2883 * to safely give private directories to DynamicUser=1 users. */
2884static int compile_symlinks(
2885 const ExecContext *context,
2886 const ExecParameters *params,
663e2756 2887 bool setup_os_release_symlink,
df61e79a
LB
2888 char ***ret_symlinks) {
2889
2890 _cleanup_strv_free_ char **symlinks = NULL;
2891 int r;
2892
2893 assert(context);
2894 assert(params);
2895 assert(ret_symlinks);
2896
2897 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
2898 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2899 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 2900
211a3d87
LB
2901 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2902 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 2903
211a3d87
LB
2904 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2905 dst_abs = path_join(params->prefix[dt], *symlink);
2906 if (!src_abs || !dst_abs)
2907 return -ENOMEM;
df61e79a 2908
211a3d87
LB
2909 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2910 if (r < 0)
2911 return r;
2912 }
2913
a2ab603c
YW
2914 if (!exec_directory_is_private(context, dt) ||
2915 exec_context_with_rootfs(context) ||
2916 context->directories[dt].items[i].only_create)
211a3d87
LB
2917 continue;
2918
2919 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
2920 if (!private_path)
2921 return -ENOMEM;
2922
211a3d87 2923 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
2924 if (!path)
2925 return -ENOMEM;
2926
2927 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2928 if (r < 0)
2929 return r;
2930 }
2931 }
2932
663e2756
LB
2933 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2934 * and readers will never get a half-written version. Note that, while the paths specified here are
2935 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2936 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2937 if (setup_os_release_symlink) {
2938 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2939 if (r < 0)
2940 return r;
2941
2942 r = strv_extend(&symlinks, "/run/host/os-release");
2943 if (r < 0)
2944 return r;
2945 }
2946
df61e79a
LB
2947 *ret_symlinks = TAKE_PTR(symlinks);
2948
2949 return 0;
2950}
2951
4e677599
LP
2952static bool insist_on_sandboxing(
2953 const ExecContext *context,
2954 const char *root_dir,
2955 const char *root_image,
2956 const BindMount *bind_mounts,
2957 size_t n_bind_mounts) {
2958
4e677599
LP
2959 assert(context);
2960 assert(n_bind_mounts == 0 || bind_mounts);
2961
2962 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 2963 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
2964 * rearrange stuff in a way we cannot ignore gracefully. */
2965
2966 if (context->n_temporary_filesystems > 0)
2967 return true;
2968
2969 if (root_dir || root_image)
2970 return true;
2971
b3d13314
LB
2972 if (context->n_mount_images > 0)
2973 return true;
2974
4e677599
LP
2975 if (context->dynamic_user)
2976 return true;
2977
4355c04f
LB
2978 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2979 return true;
2980
4e677599
LP
2981 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2982 * essential. */
5b10116e 2983 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
2984 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2985 return true;
2986
91dd5f7c
LP
2987 if (context->log_namespace)
2988 return true;
2989
4e677599
LP
2990 return false;
2991}
2992
9c0c6701
DDM
2993static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2994 _cleanup_close_ int fd = -EBADF;
2995 int r;
2996
2997 if (!runtime || !runtime->ephemeral_copy)
2998 return 0;
2999
3000 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3001 if (r < 0)
3002 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3003
3004 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3005
3006 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3007 if (fd >= 0)
3008 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3009 return 0;
3010
3011 if (fd != -EAGAIN)
3012 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3013
3014 log_debug("Making ephemeral snapshot of %s to %s",
3015 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3016
3017 if (context->root_image)
3018 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3019 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3020 else
3021 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3022 AT_FDCWD, runtime->ephemeral_copy,
3023 BTRFS_SNAPSHOT_FALLBACK_COPY |
3024 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3025 BTRFS_SNAPSHOT_RECURSIVE |
3026 BTRFS_SNAPSHOT_LOCK_BSD);
3027 if (fd < 0)
3028 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3029 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3030
3031 if (context->root_image) {
3032 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3033 * which tends to not perform well in combination with lots of random writes.
3034 *
3035 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3036 * copy, but we at least want to make the intention clear.
3037 */
3038 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3039 if (r < 0)
3040 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3041 }
3042
3043 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3044 if (r < 0)
3045 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3046
3047 return 1;
3048}
3049
66130f0a
DDM
3050static int verity_settings_prepare(
3051 VeritySettings *verity,
3052 const char *root_image,
3053 const void *root_hash,
3054 size_t root_hash_size,
3055 const char *root_hash_path,
3056 const void *root_hash_sig,
3057 size_t root_hash_sig_size,
3058 const char *root_hash_sig_path,
3059 const char *verity_data_path) {
3060
3061 int r;
3062
3063 assert(verity);
3064
3065 if (root_hash) {
3066 void *d;
3067
3068 d = memdup(root_hash, root_hash_size);
3069 if (!d)
3070 return -ENOMEM;
3071
3072 free_and_replace(verity->root_hash, d);
3073 verity->root_hash_size = root_hash_size;
3074 verity->designator = PARTITION_ROOT;
3075 }
3076
3077 if (root_hash_sig) {
3078 void *d;
3079
3080 d = memdup(root_hash_sig, root_hash_sig_size);
3081 if (!d)
3082 return -ENOMEM;
3083
3084 free_and_replace(verity->root_hash_sig, d);
3085 verity->root_hash_sig_size = root_hash_sig_size;
3086 verity->designator = PARTITION_ROOT;
3087 }
3088
3089 if (verity_data_path) {
3090 r = free_and_strdup(&verity->data_path, verity_data_path);
3091 if (r < 0)
3092 return r;
3093 }
3094
3095 r = verity_settings_load(
3096 verity,
3097 root_image,
3098 root_hash_path,
3099 root_hash_sig_path);
3100 if (r < 0)
3101 return log_debug_errno(r, "Failed to load root hash: %m");
3102
3103 return 0;
3104}
3105
6818c54c 3106static int apply_mount_namespace(
34cf6c43 3107 const Unit *u,
9f71ba8d 3108 ExecCommandFlags command_flags,
6818c54c
LP
3109 const ExecContext *context,
3110 const ExecParameters *params,
9c0c6701 3111 ExecRuntime *runtime,
d4b6ec98 3112 const char *memory_pressure_path,
7cc5ef5f 3113 char **error_path) {
6818c54c 3114
66130f0a 3115 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
d4b6ec98
LB
3116 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3117 **read_write_paths_cleanup = NULL;
73ff4d48 3118 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
663e2756 3119 *extension_dir = NULL, *host_os_release_stage = NULL;
66130f0a 3120 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
d4b6ec98 3121 char **read_write_paths;
228af36f 3122 NamespaceInfo ns_info;
663e2756 3123 bool needs_sandboxing, setup_os_release_symlink;
6c47cd7d 3124 BindMount *bind_mounts = NULL;
da6053d0 3125 size_t n_bind_mounts = 0;
6818c54c 3126 int r;
93c6bb51 3127
2b3c1b9e
DH
3128 assert(context);
3129
29933daf
DT
3130 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3131
915e6d16 3132 if (params->flags & EXEC_APPLY_CHROOT) {
9c0c6701
DDM
3133 r = setup_ephemeral(context, runtime);
3134 if (r < 0)
3135 return r;
915e6d16 3136
9c0c6701
DDM
3137 if (context->root_image)
3138 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3139 else
3140 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
915e6d16 3141 }
93c6bb51 3142
6c47cd7d
LP
3143 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3144 if (r < 0)
3145 return r;
3146
d4b6ec98
LB
3147 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3148 * service will need to write to it in order to start the notifications. */
3149 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3150 read_write_paths_cleanup = strv_copy(context->read_write_paths);
29933daf
DT
3151 if (!read_write_paths_cleanup)
3152 return -ENOMEM;
d4b6ec98
LB
3153
3154 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3155 if (r < 0)
29933daf 3156 return r;
d4b6ec98
LB
3157
3158 read_write_paths = read_write_paths_cleanup;
3159 } else
3160 read_write_paths = context->read_write_paths;
3161
9f71ba8d 3162 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3163 if (needs_sandboxing) {
3164 /* The runtime struct only contains the parent of the private /tmp,
3165 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3166 * that is sticky, and that's the one we want to use here.
3167 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91 3168
28135da3
DDM
3169 if (context->private_tmp && runtime && runtime->shared) {
3170 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3171 tmp_dir = runtime->shared->tmp_dir;
3172 else if (runtime->shared->tmp_dir)
3173 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
56a13a49 3174
28135da3
DDM
3175 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3176 var_tmp_dir = runtime->shared->var_tmp_dir;
3177 else if (runtime->shared->var_tmp_dir)
3178 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
ecf63c91
NJ
3179 }
3180
b5a33299
YW
3181 ns_info = (NamespaceInfo) {
3182 .ignore_protect_paths = false,
3183 .private_dev = context->private_devices,
3184 .protect_control_groups = context->protect_control_groups,
3185 .protect_kernel_tunables = context->protect_kernel_tunables,
3186 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3187 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3188 .protect_hostname = context->protect_hostname,
5e98086d 3189 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
52b3d652
LP
3190 .protect_home = context->protect_home,
3191 .protect_system = context->protect_system,
4e399953
LP
3192 .protect_proc = context->protect_proc,
3193 .proc_subset = context->proc_subset,
c2da3bf2 3194 .private_network = exec_needs_network_namespace(context),
fde36d25 3195 .private_ipc = exec_needs_ipc_namespace(context),
6720e356 3196 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3197 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3198 };
ecf63c91 3199 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3200 /*
3201 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3202 * sandbox info, otherwise enforce it, don't ignore protected paths and
3203 * fail if we are enable to apply the sandbox inside the mount namespace.
3204 */
3205 ns_info = (NamespaceInfo) {
3206 .ignore_protect_paths = true,
3207 };
3208 else
3209 ns_info = (NamespaceInfo) {};
b5a33299 3210
663e2756
LB
3211 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3212 setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
3213 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3214 if (r < 0)
3215 return r;
3216
874cdcbc 3217 if (context->mount_propagation_flag == MS_SHARED)
37ed15d7
FB
3218 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3219
73ff4d48
YW
3220 if (exec_context_has_credentials(context) &&
3221 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3222 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3223 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
3224 if (!creds_path)
3225 return -ENOMEM;
3226 }
3227
170d978b 3228 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
5e8deb94 3229 propagate_dir = path_join("/run/systemd/propagate/", u->id);
29933daf
DT
3230 if (!propagate_dir)
3231 return -ENOMEM;
f2550b98 3232
5e8deb94 3233 incoming_dir = strdup("/run/systemd/incoming");
29933daf
DT
3234 if (!incoming_dir)
3235 return -ENOMEM;
24759d8f
LB
3236
3237 extension_dir = strdup("/run/systemd/unit-extensions");
29933daf
DT
3238 if (!extension_dir)
3239 return -ENOMEM;
3f37a825
LB
3240
3241 /* If running under a different root filesystem, propagate the host's os-release. We make a
3242 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
663e2756
LB
3243 if (setup_os_release_symlink) {
3244 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3245 if (!host_os_release_stage)
3f37a825
LB
3246 return -ENOMEM;
3247 }
170d978b
LP
3248 } else {
3249 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3250
29933daf
DT
3251 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3252 return -ENOMEM;
3f37a825 3253
663e2756
LB
3254 if (setup_os_release_symlink) {
3255 if (asprintf(&host_os_release_stage,
3256 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3257 geteuid()) < 0)
3f37a825
LB
3258 return -ENOMEM;
3259 }
170d978b 3260 }
5e8deb94 3261
66130f0a
DDM
3262 if (root_image) {
3263 r = verity_settings_prepare(
3264 &verity,
3265 root_image,
3266 context->root_hash, context->root_hash_size, context->root_hash_path,
3267 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3268 context->root_verity);
3269 if (r < 0)
3270 return r;
3271 }
3272
84be0c71
LP
3273 r = setup_namespace(
3274 root_dir,
3275 root_image,
3276 context->root_image_options,
3277 context->root_image_policy ?: &image_policy_service,
3278 &ns_info,
3279 read_write_paths,
3280 needs_sandboxing ? context->read_only_paths : NULL,
3281 needs_sandboxing ? context->inaccessible_paths : NULL,
3282 needs_sandboxing ? context->exec_paths : NULL,
3283 needs_sandboxing ? context->no_exec_paths : NULL,
3284 empty_directories,
3285 symlinks,
3286 bind_mounts,
3287 n_bind_mounts,
3288 context->temporary_filesystems,
3289 context->n_temporary_filesystems,
3290 context->mount_images,
3291 context->n_mount_images,
3292 context->mount_image_policy ?: &image_policy_service,
3293 tmp_dir,
3294 var_tmp_dir,
3295 creds_path,
3296 context->log_namespace,
3297 context->mount_propagation_flag,
66130f0a 3298 &verity,
84be0c71
LP
3299 context->extension_images,
3300 context->n_extension_images,
3301 context->extension_image_policy ?: &image_policy_sysext,
3302 context->extension_directories,
3303 propagate_dir,
3304 incoming_dir,
3305 extension_dir,
3306 root_dir || root_image ? params->notify_socket : NULL,
663e2756 3307 host_os_release_stage,
84be0c71 3308 error_path);
93c6bb51 3309
1beab8b0 3310 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3311 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3312 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3313 * completely different execution environment. */
aca835ed 3314 if (r == -ENOANO) {
4e677599
LP
3315 if (insist_on_sandboxing(
3316 context,
3317 root_dir, root_image,
3318 bind_mounts,
29933daf
DT
3319 n_bind_mounts))
3320 return log_unit_debug_errno(u,
3321 SYNTHETIC_ERRNO(EOPNOTSUPP),
3322 "Failed to set up namespace, and refusing to continue since "
3323 "the selected namespacing options alter mount environment non-trivially.\n"
3324 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3325 n_bind_mounts,
3326 context->n_temporary_filesystems,
3327 yes_no(root_dir),
3328 yes_no(root_image),
3329 yes_no(context->dynamic_user));
3330
3331 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3332 return 0;
93c6bb51
DH
3333 }
3334
3335 return r;
3336}
3337
915e6d16
LP
3338static int apply_working_directory(
3339 const ExecContext *context,
3340 const ExecParameters *params,
9c0c6701 3341 ExecRuntime *runtime,
915e6d16 3342 const char *home,
376fecf6 3343 int *exit_status) {
915e6d16 3344
6732edab 3345 const char *d, *wd;
2b3c1b9e
DH
3346
3347 assert(context);
376fecf6 3348 assert(exit_status);
2b3c1b9e 3349
6732edab
LP
3350 if (context->working_directory_home) {
3351
376fecf6
LP
3352 if (!home) {
3353 *exit_status = EXIT_CHDIR;
6732edab 3354 return -ENXIO;
376fecf6 3355 }
6732edab 3356
2b3c1b9e 3357 wd = home;
6732edab 3358
14eb3285
LP
3359 } else
3360 wd = empty_to_root(context->working_directory);
e7f1e7c6 3361
fa97f630 3362 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3363 d = wd;
fa97f630 3364 else
9c0c6701 3365 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
e7f1e7c6 3366
376fecf6
LP
3367 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3368 *exit_status = EXIT_CHDIR;
2b3c1b9e 3369 return -errno;
376fecf6 3370 }
e7f1e7c6
DH
3371
3372 return 0;
3373}
3374
fa97f630
JB
3375static int apply_root_directory(
3376 const ExecContext *context,
3377 const ExecParameters *params,
9c0c6701 3378 ExecRuntime *runtime,
fa97f630
JB
3379 const bool needs_mount_ns,
3380 int *exit_status) {
3381
3382 assert(context);
3383 assert(exit_status);
3384
5b10116e 3385 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630 3386 if (!needs_mount_ns && context->root_directory)
9c0c6701 3387 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
fa97f630
JB
3388 *exit_status = EXIT_CHROOT;
3389 return -errno;
3390 }
fa97f630
JB
3391
3392 return 0;
3393}
3394
b1edf445 3395static int setup_keyring(
34cf6c43 3396 const Unit *u,
b1edf445
LP
3397 const ExecContext *context,
3398 const ExecParameters *p,
3399 uid_t uid, gid_t gid) {
3400
74dd6b51 3401 key_serial_t keyring;
e64c2d0b
DJL
3402 int r = 0;
3403 uid_t saved_uid;
3404 gid_t saved_gid;
74dd6b51
LP
3405
3406 assert(u);
b1edf445 3407 assert(context);
74dd6b51
LP
3408 assert(p);
3409
3410 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3411 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3412 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3413 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3414 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3415 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3416
b1edf445
LP
3417 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3418 return 0;
3419
e64c2d0b
DJL
3420 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3421 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3422 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3423 * & group is just as nasty as acquiring a reference to the user keyring. */
3424
3425 saved_uid = getuid();
3426 saved_gid = getgid();
3427
3428 if (gid_is_valid(gid) && gid != saved_gid) {
3429 if (setregid(gid, -1) < 0)
3430 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3431 }
3432
3433 if (uid_is_valid(uid) && uid != saved_uid) {
3434 if (setreuid(uid, -1) < 0) {
3435 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3436 goto out;
3437 }
3438 }
3439
74dd6b51
LP
3440 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3441 if (keyring == -1) {
3442 if (errno == ENOSYS)
8002fb97 3443 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3444 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3445 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3446 else if (errno == EDQUOT)
8002fb97 3447 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3448 else
e64c2d0b 3449 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3450
e64c2d0b 3451 goto out;
74dd6b51
LP
3452 }
3453
e64c2d0b
DJL
3454 /* When requested link the user keyring into the session keyring. */
3455 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3456
3457 if (keyctl(KEYCTL_LINK,
3458 KEY_SPEC_USER_KEYRING,
3459 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3460 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3461 goto out;
3462 }
3463 }
3464
3465 /* Restore uid/gid back */
3466 if (uid_is_valid(uid) && uid != saved_uid) {
3467 if (setreuid(saved_uid, -1) < 0) {
3468 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3469 goto out;
3470 }
3471 }
3472
3473 if (gid_is_valid(gid) && gid != saved_gid) {
3474 if (setregid(saved_gid, -1) < 0)
3475 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3476 }
3477
3478 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3479 if (!sd_id128_is_null(u->invocation_id)) {
3480 key_serial_t key;
3481
3482 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3483 if (key == -1)
8002fb97 3484 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3485 else {
3486 if (keyctl(KEYCTL_SETPERM, key,
3487 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3488 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3489 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3490 }
3491 }
3492
e64c2d0b 3493out:
37b22b3b 3494 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3495 /* no extra logging, as only the first already reported error matters */
3496 if (getuid() != saved_uid)
3497 (void) setreuid(saved_uid, -1);
b1edf445 3498
e64c2d0b
DJL
3499 if (getgid() != saved_gid)
3500 (void) setregid(saved_gid, -1);
b1edf445 3501
e64c2d0b 3502 return r;
74dd6b51
LP
3503}
3504
/* Appends the valid (non-negative) file descriptors of a socket pair to 'array', advancing the
 * element counter '*n' for each one added. The caller has to make sure the array is large enough. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3515
a34ceba6
LP
3516static int close_remaining_fds(
3517 const ExecParameters *params,
28135da3 3518 const ExecRuntime *runtime,
00d9ef85 3519 int user_lookup_fd,
a34ceba6 3520 int socket_fd,
5b8d1f6b 3521 const int *fds, size_t n_fds) {
a34ceba6 3522
da6053d0 3523 size_t n_dont_close = 0;
9c0c6701 3524 int dont_close[n_fds + 14];
a34ceba6
LP
3525
3526 assert(params);
3527
3528 if (params->stdin_fd >= 0)
3529 dont_close[n_dont_close++] = params->stdin_fd;
3530 if (params->stdout_fd >= 0)
3531 dont_close[n_dont_close++] = params->stdout_fd;
3532 if (params->stderr_fd >= 0)
3533 dont_close[n_dont_close++] = params->stderr_fd;
3534
3535 if (socket_fd >= 0)
3536 dont_close[n_dont_close++] = socket_fd;
3537 if (n_fds > 0) {
3538 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3539 n_dont_close += n_fds;
3540 }
3541
9c0c6701
DDM
3542 if (runtime)
3543 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3544
28135da3
DDM
3545 if (runtime && runtime->shared) {
3546 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3547 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
a70581ff 3548 }
29206d46 3549
15220772
DDM
3550 if (runtime && runtime->dynamic_creds) {
3551 if (runtime->dynamic_creds->user)
3552 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3553 if (runtime->dynamic_creds->group)
3554 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
a34ceba6
LP
3555 }
3556
00d9ef85
LP
3557 if (user_lookup_fd >= 0)
3558 dont_close[n_dont_close++] = user_lookup_fd;
3559
a34ceba6
LP
3560 return close_all_fds(dont_close, n_dont_close);
3561}
3562
00d9ef85
LP
3563static int send_user_lookup(
3564 Unit *unit,
3565 int user_lookup_fd,
3566 uid_t uid,
3567 gid_t gid) {
3568
3569 assert(unit);
3570
3571 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3572 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3573 * specified. */
3574
3575 if (user_lookup_fd < 0)
3576 return 0;
3577
3578 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3579 return 0;
3580
3581 if (writev(user_lookup_fd,
3582 (struct iovec[]) {
ce16d177
YW
3583 IOVEC_MAKE(&uid, sizeof(uid)),
3584 IOVEC_MAKE(&gid, sizeof(gid)),
3585 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3586 return -errno;
3587
3588 return 0;
3589}
3590
6732edab
LP
3591static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3592 int r;
3593
3594 assert(c);
3595 assert(home);
3596 assert(buf);
3597
3598 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3599
3600 if (*home)
3601 return 0;
3602
3603 if (!c->working_directory_home)
3604 return 0;
3605
6732edab
LP
3606 r = get_home_dir(buf);
3607 if (r < 0)
3608 return r;
3609
3610 *home = *buf;
3611 return 1;
3612}
3613
da50b85a
LP
3614static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3615 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3616 int r;
3617
3618 assert(c);
3619 assert(p);
3620 assert(ret);
3621
3622 assert(c->dynamic_user);
3623
3624 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3625 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3626 * directories. */
3627
5b10116e 3628 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3629 if (t == EXEC_DIRECTORY_CONFIGURATION)
3630 continue;
3631
3632 if (!p->prefix[t])
3633 continue;
3634
211a3d87 3635 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3636 char *e;
3637
494d0247 3638 if (exec_directory_is_private(c, t))
211a3d87 3639 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3640 else
211a3d87 3641 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3642 if (!e)
3643 return -ENOMEM;
3644
3645 r = strv_consume(&list, e);
3646 if (r < 0)
3647 return r;
3648 }
3649 }
3650
ae2a15bc 3651 *ret = TAKE_PTR(list);
da50b85a
LP
3652
3653 return 0;
3654}
3655
a8b993dc
LP
3656static int exec_parameters_get_cgroup_path(
3657 const ExecParameters *params,
3658 const CGroupContext *c,
3659 char **ret) {
3660
3661 const char *subgroup = NULL;
78f93209
LP
3662 char *p;
3663
3664 assert(params);
3665 assert(ret);
3666
3667 if (!params->cgroup_path)
3668 return -EINVAL;
3669
3670 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3671 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3672 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3673 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3674 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3675 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3676 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3677 * flag, which is only passed for the former statements, not for the latter. */
3678
a8b993dc
LP
3679 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3680 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3681 subgroup = ".control";
3682 else
3683 subgroup = c->delegate_subgroup;
3684 }
3685
3686 if (subgroup)
3687 p = path_join(params->cgroup_path, subgroup);
78f93209
LP
3688 else
3689 p = strdup(params->cgroup_path);
3690 if (!p)
3691 return -ENOMEM;
3692
3693 *ret = p;
a8b993dc 3694 return !!subgroup;
78f93209
LP
3695}
3696
e2b2fb7f
MS
3697static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3698 _cleanup_(cpu_set_reset) CPUSet s = {};
3699 int r;
3700
3701 assert(c);
3702 assert(ret);
3703
3704 if (!c->numa_policy.nodes.set) {
3705 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3706 return 0;
3707 }
3708
3709 r = numa_to_cpu_set(&c->numa_policy, &s);
3710 if (r < 0)
3711 return r;
3712
3713 cpu_set_reset(ret);
3714
3715 return cpu_set_add_all(ret, &s);
3716}
3717
3718bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3719 assert(c);
3720
3721 return c->cpu_affinity_from_numa;
3722}
3723
1da37e58
ZJS
3724static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3725 int r;
3726
3727 assert(fds);
3728 assert(n_fds);
3729 assert(*n_fds < fds_size);
3730 assert(ret_fd);
3731
3732 if (fd < 0) {
254d1313 3733 *ret_fd = -EBADF;
1da37e58
ZJS
3734 return 0;
3735 }
3736
3737 if (fd < 3 + (int) *n_fds) {
3738 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3739 * the fds we pass to the process (or which are closed only during execve). */
3740
3741 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3742 if (r < 0)
3743 return -errno;
3744
ee3455cf 3745 close_and_replace(fd, r);
1da37e58
ZJS
3746 }
3747
3748 *ret_fd = fds[*n_fds] = fd;
3749 (*n_fds) ++;
3750 return 1;
3751}
3752
cd48e23f
RP
3753static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3754 union sockaddr_union addr = {
3755 .un.sun_family = AF_UNIX,
3756 };
3757 socklen_t sa_len;
3758 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3759 int r;
3760
3761 assert(u);
3762 assert(of);
3763 assert(ofd >= 0);
3764
3765 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3766 if (r < 0)
3767 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3768
3769 sa_len = r;
3770
3771 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3772 _cleanup_close_ int fd = -EBADF;
3773
3774 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3775 if (fd < 0)
3776 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3777
3778 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3779 if (r == -EPROTOTYPE)
3780 continue;
3781 if (r < 0)
3782 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3783
3784 return TAKE_FD(fd);
3785 }
3786
3787 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3788}
3789
3790static int get_open_file_fd(Unit *u, const OpenFile *of) {
3791 struct stat st;
3792 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3793
3794 assert(u);
3795 assert(of);
3796
3797 ofd = open(of->path, O_PATH | O_CLOEXEC);
3798 if (ofd < 0)
dcebb015
DDM
3799 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3800
cd48e23f 3801 if (fstat(ofd, &st) < 0)
dcebb015 3802 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
cd48e23f
RP
3803
3804 if (S_ISSOCK(st.st_mode)) {
3805 fd = connect_unix_harder(u, of, ofd);
3806 if (fd < 0)
3807 return fd;
3808
3809 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
dcebb015
DDM
3810 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3811 of->path);
cd48e23f
RP
3812
3813 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3814 } else {
3815 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3816 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3817 flags |= O_APPEND;
3818 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3819 flags |= O_TRUNC;
3820
3821 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3822 if (fd < 0)
3823 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3824
3825 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3826 }
3827
3828 return TAKE_FD(fd);
3829}
3830
3831static int collect_open_file_fds(
3832 Unit *u,
3833 OpenFile* open_files,
3834 int **fds,
3835 char ***fdnames,
3836 size_t *n_fds) {
3837 int r;
3838
3839 assert(u);
3840 assert(fds);
3841 assert(fdnames);
3842 assert(n_fds);
3843
3844 LIST_FOREACH(open_files, of, open_files) {
3845 _cleanup_close_ int fd = -EBADF;
3846
3847 fd = get_open_file_fd(u, of);
3848 if (fd < 0) {
3849 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3850 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3851 continue;
3852 }
3853
3854 return fd;
3855 }
3856
3857 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3858 return -ENOMEM;
3859
3860 r = strv_extend(fdnames, of->fdname);
3861 if (r < 0)
3862 return r;
3863
3864 (*fds)[*n_fds] = TAKE_FD(fd);
3865
3866 (*n_fds)++;
3867 }
3868
3869 return 0;
3870}
3871
3ff67ec4
ZJS
3872static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3873 assert(unit);
3874 assert(msg);
3875 assert(executable);
3876
3877 if (!DEBUG_LOGGING)
3878 return;
3879
3880 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3881
3882 log_unit_struct(unit, LOG_DEBUG,
3883 "EXECUTABLE=%s", executable,
3884 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3885 LOG_UNIT_INVOCATION_ID(unit));
3886}
3887
170d978b
LP
3888static bool exec_context_need_unprivileged_private_users(
3889 const ExecContext *context,
3890 const ExecParameters *params) {
3891
6ef721cb 3892 assert(context);
170d978b 3893 assert(params);
6ef721cb
LB
3894
3895 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3896 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3897 * (system manager) then we have privileges and don't need this. */
170d978b 3898 if (params->runtime_scope != RUNTIME_SCOPE_USER)
6ef721cb
LB
3899 return false;
3900
3901 return context->private_users ||
3902 context->private_tmp ||
3903 context->private_devices ||
3904 context->private_network ||
3905 context->network_namespace_path ||
3906 context->private_ipc ||
3907 context->ipc_namespace_path ||
adeff822 3908 context->private_mounts > 0 ||
6ef721cb
LB
3909 context->mount_apivfs ||
3910 context->n_bind_mounts > 0 ||
3911 context->n_temporary_filesystems > 0 ||
3912 context->root_directory ||
3913 !strv_isempty(context->extension_directories) ||
3914 context->protect_system != PROTECT_SYSTEM_NO ||
3915 context->protect_home != PROTECT_HOME_NO ||
3916 context->protect_kernel_tunables ||
3917 context->protect_kernel_modules ||
3918 context->protect_kernel_logs ||
3919 context->protect_control_groups ||
3920 context->protect_clock ||
3921 context->protect_hostname ||
3922 !strv_isempty(context->read_write_paths) ||
3923 !strv_isempty(context->read_only_paths) ||
3924 !strv_isempty(context->inaccessible_paths) ||
3925 !strv_isempty(context->exec_paths) ||
3926 !strv_isempty(context->no_exec_paths);
3927}
3928
ff0af2a1 3929static int exec_child(
f2341e0a 3930 Unit *unit,
34cf6c43 3931 const ExecCommand *command,
ff0af2a1
LP
3932 const ExecContext *context,
3933 const ExecParameters *params,
28135da3 3934 ExecRuntime *runtime,
6bb00842 3935 const CGroupContext *cgroup_context,
ff0af2a1 3936 int socket_fd,
2caa38e9 3937 const int named_iofds[static 3],
cd48e23f 3938 int *params_fds,
da6053d0 3939 size_t n_socket_fds,
25b583d7 3940 size_t n_storage_fds,
ff0af2a1 3941 char **files_env,
00d9ef85 3942 int user_lookup_fd,
12145637 3943 int *exit_status) {
d35fbf6b 3944
8c35c10d 3945 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3946 int r, ngids = 0, exec_fd;
4d885bd3
DH
3947 _cleanup_free_ gid_t *supplementary_gids = NULL;
3948 const char *username = NULL, *groupname = NULL;
73ff4d48 3949 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
2b3c1b9e 3950 const char *home = NULL, *shell = NULL;
7ca69792 3951 char **final_argv = NULL;
7bce046b
LP
3952 dev_t journal_stream_dev = 0;
3953 ino_t journal_stream_ino = 0;
5749f855 3954 bool userns_set_up = false;
165a31c0
LP
3955 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3956 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3957 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3958 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3959#if HAVE_SELINUX
7f59dd35 3960 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3961 bool use_selinux = false;
ecfbc84f 3962#endif
f9fa32f0 3963#if ENABLE_SMACK
43b1f709 3964 bool use_smack = false;
ecfbc84f 3965#endif
349cc4a5 3966#if HAVE_APPARMOR
43b1f709 3967 bool use_apparmor = false;
ecfbc84f 3968#endif
5749f855
AZ
3969 uid_t saved_uid = getuid();
3970 gid_t saved_gid = getgid();
fed1e721
LP
3971 uid_t uid = UID_INVALID;
3972 gid_t gid = GID_INVALID;
1da37e58
ZJS
3973 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3974 n_keep_fds; /* total number of fds not to close */
165a31c0 3975 int secure_bits;
afb11bf1
DG
3976 _cleanup_free_ gid_t *gids_after_pam = NULL;
3977 int ngids_after_pam = 0;
cd48e23f
RP
3978 _cleanup_free_ int *fds = NULL;
3979 _cleanup_strv_free_ char **fdnames = NULL;
034c6ed7 3980
f2341e0a 3981 assert(unit);
5cb5a6ff
LP
3982 assert(command);
3983 assert(context);
d35fbf6b 3984 assert(params);
ff0af2a1 3985 assert(exit_status);
d35fbf6b 3986
69339ae9
LP
3987 /* Explicitly test for CVE-2021-4034 inspired invocations */
3988 assert(command->path);
3989 assert(!strv_isempty(command->argv));
3990
d35fbf6b
DM
3991 rename_process_from_path(command->path);
3992
9c274488
LP
3993 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3994 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3995 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3996 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3997 SIGNALS_IGNORE);
d35fbf6b
DM
3998
3999 if (context->ignore_sigpipe)
9c274488 4000 (void) ignore_signals(SIGPIPE);
d35fbf6b 4001
ff0af2a1
LP
4002 r = reset_signal_mask();
4003 if (r < 0) {
4004 *exit_status = EXIT_SIGNAL_MASK;
12145637 4005 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4006 }
034c6ed7 4007
d35fbf6b
DM
4008 if (params->idle_pipe)
4009 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4010
2c027c62
LP
4011 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4012 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4013 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4014 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4015
d35fbf6b 4016 log_forget_fds();
2c027c62 4017 log_set_open_when_needed(true);
a3b00f91 4018 log_settle_target();
4f2d528d 4019
40a80078
LP
4020 /* In case anything used libc syslog(), close this here, too */
4021 closelog();
4022
cd48e23f
RP
4023 fds = newdup(int, params_fds, n_fds);
4024 if (!fds) {
4025 *exit_status = EXIT_MEMORY;
4026 return log_oom();
4027 }
4028
4029 fdnames = strv_copy((char**) params->fd_names);
4030 if (!fdnames) {
4031 *exit_status = EXIT_MEMORY;
4032 return log_oom();
4033 }
4034
4035 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4036 if (r < 0) {
4037 *exit_status = EXIT_FDS;
4038 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4039 }
4040
b1994387 4041 int keep_fds[n_fds + 3];
1da37e58
ZJS
4042 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4043 n_keep_fds = n_fds;
4044
4045 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4046 if (r < 0) {
4047 *exit_status = EXIT_FDS;
4048 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4049 }
4050
b1994387 4051#if HAVE_LIBBPF
46004616
ZJS
4052 if (unit->manager->restrict_fs) {
4053 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4054 if (bpf_map_fd < 0) {
4055 *exit_status = EXIT_FDS;
46004616 4056 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4057 }
4058
4059 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4060 if (r < 0) {
4061 *exit_status = EXIT_FDS;
4062 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4063 }
4064 }
4065#endif
4066
15220772 4067 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4068 if (r < 0) {
4069 *exit_status = EXIT_FDS;
12145637 4070 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4071 }
4072
0af07108
ZJS
4073 if (!context->same_pgrp &&
4074 setsid() < 0) {
4075 *exit_status = EXIT_SETSID;
4076 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4077 }
9e2f7c11 4078
1e22b5cd 4079 exec_context_tty_reset(context, params);
d35fbf6b 4080
c891efaf 4081 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4082 _cleanup_free_ char *cmdline = NULL;
4083
4ef15008 4084 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4085 if (!cmdline) {
0460aa5c 4086 *exit_status = EXIT_MEMORY;
12145637 4087 return log_oom();
3b20f877 4088 }
d35fbf6b 4089
4ef15008 4090 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4091 if (r != CONFIRM_EXECUTE) {
4092 if (r == CONFIRM_PRETEND_SUCCESS) {
4093 *exit_status = EXIT_SUCCESS;
4094 return 0;
4095 }
5fa01ac0 4096
ff0af2a1 4097 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4098 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4099 "Execution cancelled by the user");
d35fbf6b
DM
4100 }
4101 }
1a63a750 4102
d521916d
LP
4103 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4104 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4105 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4106 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4107 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4108 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
170d978b 4109 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
d521916d
LP
4110 *exit_status = EXIT_MEMORY;
4111 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4112 }
4113
15220772 4114 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
da50b85a 4115 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4116
d521916d 4117 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4118 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4119 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4120 *exit_status = EXIT_USER;
12145637 4121 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4122 }
4123
da50b85a
LP
4124 r = compile_suggested_paths(context, params, &suggested_paths);
4125 if (r < 0) {
4126 *exit_status = EXIT_MEMORY;
4127 return log_oom();
4128 }
4129
15220772 4130 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4131 if (r < 0) {
4132 *exit_status = EXIT_USER;
d85ff944
YW
4133 if (r == -EILSEQ)
4134 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4135 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4136 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4137 }
524daa8c 4138
70dd455c 4139 if (!uid_is_valid(uid)) {
29206d46 4140 *exit_status = EXIT_USER;
d85ff944 4141 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4142 }
4143
4144 if (!gid_is_valid(gid)) {
4145 *exit_status = EXIT_USER;
d85ff944 4146 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4147 }
5bc7452b 4148
15220772
DDM
4149 if (runtime->dynamic_creds->user)
4150 username = runtime->dynamic_creds->user->name;
29206d46
LP
4151
4152 } else {
4d885bd3
DH
4153 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4154 if (r < 0) {
4155 *exit_status = EXIT_USER;
12145637 4156 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4157 }
5bc7452b 4158
4d885bd3
DH
4159 r = get_fixed_group(context, &groupname, &gid);
4160 if (r < 0) {
4161 *exit_status = EXIT_GROUP;
12145637 4162 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4163 }
cdc5d5c5 4164 }
29206d46 4165
cdc5d5c5
DH
4166 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4167 r = get_supplementary_groups(context, username, groupname, gid,
4168 &supplementary_gids, &ngids);
4169 if (r < 0) {
4170 *exit_status = EXIT_GROUP;
12145637 4171 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4172 }
5bc7452b 4173
00d9ef85
LP
4174 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4175 if (r < 0) {
4176 *exit_status = EXIT_USER;
12145637 4177 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4178 }
4179
4180 user_lookup_fd = safe_close(user_lookup_fd);
4181
6732edab
LP
4182 r = acquire_home(context, uid, &home, &home_buffer);
4183 if (r < 0) {
4184 *exit_status = EXIT_CHDIR;
12145637 4185 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4186 }
4187
4a055e5a 4188 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
d35fbf6b 4189 if (socket_fd >= 0)
a34ceba6 4190 (void) fd_nonblock(socket_fd, false);
acbb0225 4191
4c70a4a7
MS
4192 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4193 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4194 if (params->cgroup_path) {
4195 _cleanup_free_ char *p = NULL;
4196
a8b993dc 4197 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4c70a4a7
MS
4198 if (r < 0) {
4199 *exit_status = EXIT_CGROUP;
4200 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4201 }
4202
4203 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4204 if (r == -EUCLEAN) {
4205 *exit_status = EXIT_CGROUP;
4206 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4207 "because the cgroup or one of its parents or "
4208 "siblings is in the threaded mode: %m", p);
4209 }
4c70a4a7
MS
4210 if (r < 0) {
4211 *exit_status = EXIT_CGROUP;
4212 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4213 }
4214 }
4215
28135da3
DDM
4216 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4217 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4218 if (r < 0) {
4219 *exit_status = EXIT_NETWORK;
4220 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4221 }
4222 }
4223
28135da3
DDM
4224 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4225 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
a70581ff
XR
4226 if (r < 0) {
4227 *exit_status = EXIT_NAMESPACE;
4228 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4229 }
4230 }
4231
52c239d7 4232 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4233 if (r < 0) {
4234 *exit_status = EXIT_STDIN;
12145637 4235 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4236 }
034c6ed7 4237
52c239d7 4238 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4239 if (r < 0) {
4240 *exit_status = EXIT_STDOUT;
12145637 4241 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4242 }
4243
52c239d7 4244 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4245 if (r < 0) {
4246 *exit_status = EXIT_STDERR;
12145637 4247 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4248 }
4249
d35fbf6b 4250 if (context->oom_score_adjust_set) {
bb44fd07
ZJS
4251 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4252 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
9f8168eb 4253 r = set_oom_score_adjust(context->oom_score_adjust);
bb44fd07
ZJS
4254 if (ERRNO_IS_NEG_PRIVILEGE(r))
4255 log_unit_debug_errno(unit, r,
4256 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4257 else if (r < 0) {
4258 *exit_status = EXIT_OOM_ADJUST;
4259 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4260 }
d35fbf6b
DM
4261 }
4262
ad21e542
ZJS
4263 if (context->coredump_filter_set) {
4264 r = set_coredump_filter(context->coredump_filter);
bb44fd07
ZJS
4265 if (ERRNO_IS_NEG_PRIVILEGE(r))
4266 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5fa01ac0
ZJS
4267 else if (r < 0) {
4268 *exit_status = EXIT_LIMITS;
bb44fd07 4269 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5cf4c468 4270 }
ad21e542
ZJS
4271 }
4272
39090201
DJL
4273 if (context->nice_set) {
4274 r = setpriority_closest(context->nice);
5fa01ac0
ZJS
4275 if (r < 0) {
4276 *exit_status = EXIT_NICE;
39090201 4277 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5fa01ac0 4278 }
39090201 4279 }
613b411c 4280
d35fbf6b
DM
4281 if (context->cpu_sched_set) {
4282 struct sched_param param = {
4283 .sched_priority = context->cpu_sched_priority,
4284 };
4285
ff0af2a1
LP
4286 r = sched_setscheduler(0,
4287 context->cpu_sched_policy |
4288 (context->cpu_sched_reset_on_fork ?
4289 SCHED_RESET_ON_FORK : 0),
4290 &param);
4291 if (r < 0) {
4292 *exit_status = EXIT_SETSCHEDULER;
12145637 4293 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4294 }
d35fbf6b 4295 }
fc9b2a84 4296
e2b2fb7f
MS
4297 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4298 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4299 const CPUSet *cpu_set;
4300
4301 if (context->cpu_affinity_from_numa) {
4302 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4303 if (r < 0) {
4304 *exit_status = EXIT_CPUAFFINITY;
4305 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4306 }
4307
4308 cpu_set = &converted_cpu_set;
4309 } else
4310 cpu_set = &context->cpu_set;
4311
4312 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4313 *exit_status = EXIT_CPUAFFINITY;
12145637 4314 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4315 }
e2b2fb7f 4316 }
034c6ed7 4317
b070c7c0
MS
4318 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4319 r = apply_numa_policy(&context->numa_policy);
bb44fd07
ZJS
4320 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4321 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4322 else if (r < 0) {
4323 *exit_status = EXIT_NUMA_POLICY;
4324 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
b070c7c0
MS
4325 }
4326 }
4327
d35fbf6b
DM
4328 if (context->ioprio_set)
4329 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4330 *exit_status = EXIT_IOPRIO;
12145637 4331 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4332 }
da726a4d 4333
d35fbf6b
DM
4334 if (context->timer_slack_nsec != NSEC_INFINITY)
4335 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4336 *exit_status = EXIT_TIMERSLACK;
12145637 4337 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4338 }
9eba9da4 4339
21022b9d
LP
4340 if (context->personality != PERSONALITY_INVALID) {
4341 r = safe_personality(context->personality);
4342 if (r < 0) {
ff0af2a1 4343 *exit_status = EXIT_PERSONALITY;
12145637 4344 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4345 }
21022b9d 4346 }
94f04347 4347
33331d11
VB
4348 if (context->utmp_id) {
4349 const char *line = context->tty_path ?
4350 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4351 NULL;
df0ff127 4352 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4353 line,
023a4f67
LP
4354 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4355 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4356 USER_PROCESS,
6a93917d 4357 username);
33331d11 4358 }
d35fbf6b 4359
08f67696 4360 if (uid_is_valid(uid)) {
ff0af2a1
LP
4361 r = chown_terminal(STDIN_FILENO, uid);
4362 if (r < 0) {
4363 *exit_status = EXIT_STDIN;
12145637 4364 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4365 }
d35fbf6b 4366 }
8e274523 4367
6bb00842
LP
4368 if (params->cgroup_path) {
4369 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4370 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4371 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4372 * touch a single hierarchy too. */
4373
4374 if (params->flags & EXEC_CGROUP_DELEGATE) {
a8b993dc
LP
4375 _cleanup_free_ char *p = NULL;
4376
6bb00842
LP
4377 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4378 if (r < 0) {
4379 *exit_status = EXIT_CGROUP;
4380 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4381 }
a8b993dc
LP
4382
4383 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4384 if (r < 0) {
4385 *exit_status = EXIT_CGROUP;
4386 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4387 }
4388 if (r > 0) {
bcd9b981 4389 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
a8b993dc
LP
4390 if (r < 0) {
4391 *exit_status = EXIT_CGROUP;
4392 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4393 }
4394 }
6bb00842
LP
4395 }
4396
4397 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4398 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4399 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4400 if (r < 0) {
4401 *exit_status = EXIT_MEMORY;
4402 return log_oom();
4403 }
4404
4405 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4406 if (r < 0) {
4407 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4408 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4409 memory_pressure_path = mfree(memory_pressure_path);
4410 }
4411 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4412 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4413 if (!memory_pressure_path) {
4414 *exit_status = EXIT_MEMORY;
4415 return log_oom();
4416 }
4417 }
034c6ed7 4418 }
d35fbf6b 4419 }
034c6ed7 4420
211a3d87
LB
4421 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4422
5b10116e 4423 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
59dd2bbb 4424 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4425 if (r < 0)
4426 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4427 }
94f04347 4428
bb0c0d6f 4429 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
43962c30 4430 r = exec_setup_credentials(context, params, unit->id, uid, gid);
bb0c0d6f
LP
4431 if (r < 0) {
4432 *exit_status = EXIT_CREDENTIALS;
4433 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4434 }
4435 }
4436
7bce046b 4437 r = build_environment(
fd63e712 4438 unit,
7bce046b
LP
4439 context,
4440 params,
6bb00842 4441 cgroup_context,
7bce046b 4442 n_fds,
cd48e23f 4443 fdnames,
7bce046b
LP
4444 home,
4445 username,
4446 shell,
4447 journal_stream_dev,
4448 journal_stream_ino,
6bb00842 4449 memory_pressure_path,
7bce046b 4450 &our_env);
2065ca69
JW
4451 if (r < 0) {
4452 *exit_status = EXIT_MEMORY;
12145637 4453 return log_oom();
2065ca69
JW
4454 }
4455
4456 r = build_pass_environment(context, &pass_env);
4457 if (r < 0) {
4458 *exit_status = EXIT_MEMORY;
12145637 4459 return log_oom();
2065ca69
JW
4460 }
4461
adf769b0
ZJS
4462 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4463 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4464 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4465 if (!strv_isempty(context->exec_search_path)) {
4466 _cleanup_free_ char *joined = NULL;
4467
4468 joined = strv_join(context->exec_search_path, ":");
4469 if (!joined) {
4470 *exit_status = EXIT_MEMORY;
4471 return log_oom();
4472 }
4473
4474 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4475 if (r < 0) {
4476 *exit_status = EXIT_MEMORY;
4477 return log_oom();
4478 }
4479 }
4480
4ab3d29f 4481 accum_env = strv_env_merge(params->environment,
2065ca69 4482 our_env,
8c35c10d 4483 joined_exec_search_path,
2065ca69
JW
4484 pass_env,
4485 context->environment,
44e5d006 4486 files_env);
2065ca69
JW
4487 if (!accum_env) {
4488 *exit_status = EXIT_MEMORY;
12145637 4489 return log_oom();
2065ca69 4490 }
1280503b 4491 accum_env = strv_env_clean(accum_env);
2065ca69 4492
096424d1 4493 (void) umask(context->umask);
b213e1c1 4494
b1edf445 4495 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4496 if (r < 0) {
4497 *exit_status = EXIT_KEYRING;
12145637 4498 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4499 }
4500
adf769b0
ZJS
4501 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4502 * from it. */
1703fa41 4503 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4504
adf769b0
ZJS
4505 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4506 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4507 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4508
adf769b0
ZJS
4509 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4510 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4511 * desired. */
165a31c0
LP
4512 if (needs_ambient_hack)
4513 needs_setuid = false;
4514 else
4515 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4516
638fd8cc
LP
4517 uint64_t capability_ambient_set = context->capability_ambient_set;
4518
165a31c0 4519 if (needs_sandboxing) {
adf769b0
ZJS
4520 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4521 * /sys being present. The actual MAC context application will happen later, as late as
4522 * possible, to avoid impacting our own code paths. */
7f18ef0a 4523
349cc4a5 4524#if HAVE_SELINUX
43b1f709 4525 use_selinux = mac_selinux_use();
7f18ef0a 4526#endif
f9fa32f0 4527#if ENABLE_SMACK
43b1f709 4528 use_smack = mac_smack_use();
7f18ef0a 4529#endif
349cc4a5 4530#if HAVE_APPARMOR
43b1f709 4531 use_apparmor = mac_apparmor_use();
7f18ef0a 4532#endif
165a31c0 4533 }
7f18ef0a 4534
ce932d2d
LP
4535 if (needs_sandboxing) {
4536 int which_failed;
4537
4538 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4539 * is set here. (See below.) */
4540
4541 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4542 if (r < 0) {
4543 *exit_status = EXIT_LIMITS;
4544 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4545 }
4546 }
4547
0af07108 4548 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4549 /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits
4550 * wins here. (See above.) */
4551
1da37e58 4552 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4553 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4554 if (r < 0) {
4555 *exit_status = EXIT_PAM;
4556 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4557 }
ac45f971 4558
638fd8cc
LP
4559 if (ambient_capabilities_supported()) {
4560 uint64_t ambient_after_pam;
4561
4562 /* PAM modules might have set some ambient caps. Query them here and merge them into
4563 * the caps we want to set in the end, so that we don't end up unsetting them. */
4564 r = capability_get_ambient(&ambient_after_pam);
4565 if (r < 0) {
4566 *exit_status = EXIT_CAPABILITIES;
4567 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4568 }
4569
4570 capability_ambient_set |= ambient_after_pam;
4571 }
4572
0af07108
ZJS
4573 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4574 if (ngids_after_pam < 0) {
4575 *exit_status = EXIT_MEMORY;
4576 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4577 }
b213e1c1 4578 }
5749f855 4579
170d978b 4580 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5749f855
AZ
4581 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4582 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4583 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108 4584
0af07108 4585 r = setup_private_users(saved_uid, saved_gid, uid, gid);
6ef721cb
LB
4586 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4587 * the actual requested operations fail (or silently continue). */
4588 if (r < 0 && context->private_users) {
0af07108
ZJS
4589 *exit_status = EXIT_USER;
4590 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855 4591 }
6ef721cb
LB
4592 if (r < 0)
4593 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4594 else
4595 userns_set_up = true;
5749f855
AZ
4596 }
4597
28135da3 4598 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
a8d08f39 4599
5a3627e5
LP
4600 /* Try to enable network namespacing if network namespacing is available and we have
4601 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4602 * new network namespace. And if we don't have that, then we could only create a network
4603 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4604 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
28135da3 4605 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
bb44fd07
ZJS
4606 if (ERRNO_IS_NEG_PRIVILEGE(r))
4607 log_unit_notice_errno(unit, r,
4608 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4609 else if (r < 0) {
4610 *exit_status = EXIT_NETWORK;
4611 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
6e2d7c4f 4612 }
a8d08f39
LP
4613 } else if (context->network_namespace_path) {
4614 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4615 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4616 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f 4617 } else
5a3627e5 4618 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
d35fbf6b 4619 }
169c1bda 4620
28135da3 4621 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
a70581ff
XR
4622
4623 if (ns_type_supported(NAMESPACE_IPC)) {
28135da3 4624 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
a70581ff
XR
4625 if (r == -EPERM)
4626 log_unit_warning_errno(unit, r,
4627 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4628 else if (r < 0) {
4629 *exit_status = EXIT_NAMESPACE;
4630 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4631 }
4632 } else if (context->ipc_namespace_path) {
4633 *exit_status = EXIT_NAMESPACE;
4634 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4635 "IPCNamespacePath= is not supported, refusing.");
4636 } else
4637 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4638 }
4639
ee818b89 4640 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4641 _cleanup_free_ char *error_path = NULL;
4642
73ff4d48 4643 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
3fbe8dbe
LP
4644 if (r < 0) {
4645 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4646 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4647 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4648 }
d35fbf6b 4649 }
81a2b7ce 4650
daf8f72b
LP
4651 if (needs_sandboxing) {
4652 r = apply_protect_hostname(unit, context, exit_status);
4653 if (r < 0)
4654 return r;
aecd5ac6
TM
4655 }
4656
85614c6e
SR
4657 if (context->memory_ksm >= 0)
4658 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4659 if (ERRNO_IS_NOT_SUPPORTED(errno))
4660 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4661 else {
4662 *exit_status = EXIT_KSM;
4663 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4664 }
4665 }
4666
5749f855
AZ
4667 /* Drop groups as early as possible.
4668 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4669 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4670 if (needs_setuid) {
afb11bf1
DG
4671 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4672 int ngids_to_enforce = 0;
4673
4674 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4675 ngids,
4676 gids_after_pam,
4677 ngids_after_pam,
4678 &gids_to_enforce);
4679 if (ngids_to_enforce < 0) {
4680 *exit_status = EXIT_MEMORY;
4681 return log_unit_error_errno(unit,
4682 ngids_to_enforce,
4683 "Failed to merge group lists. Group membership might be incorrect: %m");
4684 }
4685
4686 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4687 if (r < 0) {
4688 *exit_status = EXIT_GROUP;
12145637 4689 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4690 }
165a31c0 4691 }
096424d1 4692
5749f855
AZ
4693 /* If the user namespace was not set up above, try to do it now.
4694 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
d09df6b9 4695 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5749f855
AZ
4696 * case of mount namespaces being less privileged when the mount point list is copied from a
4697 * different user namespace). */
9008e1ac 4698
5749f855
AZ
4699 if (needs_sandboxing && context->private_users && !userns_set_up) {
4700 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4701 if (r < 0) {
4702 *exit_status = EXIT_USER;
4703 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4704 }
4705 }
4706
9f71ba8d
ZJS
4707 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4708 * shall execute. */
4709
4710 _cleanup_free_ char *executable = NULL;
254d1313 4711 _cleanup_close_ int executable_fd = -EBADF;
8c35c10d 4712 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4713 if (r < 0) {
4714 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4715 log_unit_struct_errno(unit, LOG_INFO, r,
4716 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4717 LOG_UNIT_INVOCATION_ID(unit),
4718 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4719 command->path),
4720 "EXECUTABLE=%s", command->path);
5fa01ac0 4721 *exit_status = EXIT_SUCCESS;
9f71ba8d
ZJS
4722 return 0;
4723 }
4724
4725 *exit_status = EXIT_EXEC;
c2503e35
RH
4726 return log_unit_struct_errno(unit, LOG_INFO, r,
4727 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4728 LOG_UNIT_INVOCATION_ID(unit),
4729 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4730 command->path),
4731 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4732 }
4733
b83d5050
ZJS
4734 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4735 if (r < 0) {
4736 *exit_status = EXIT_FDS;
4737 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4738 }
4739
9f71ba8d 4740#if HAVE_SELINUX
49590d67 4741 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
254d1313 4742 int fd = -EBADF;
49590d67
MS
4743
4744 if (socket_fd >= 0)
4745 fd = socket_fd;
4746 else if (params->n_socket_fds == 1)
4747 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4748 * use context from that fd to compute the label. */
4749 fd = params->fds[0];
4750
4751 if (fd >= 0) {
4752 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4753 if (r < 0) {
4754 if (!context->selinux_context_ignore) {
4755 *exit_status = EXIT_SELINUX_CONTEXT;
4756 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4757 }
4758 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4759 }
9f71ba8d
ZJS
4760 }
4761 }
4762#endif
4763
4a055e5a
ZJS
4764 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4765 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4766 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4767 * execve(). */
5686391b 4768
1da37e58 4769 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4770 if (r >= 0)
4771 r = shift_fds(fds, n_fds);
4772 if (r >= 0)
cd48e23f 4773 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
ff0af2a1
LP
4774 if (r < 0) {
4775 *exit_status = EXIT_FDS;
12145637 4776 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4777 }
e66cf1a3 4778
5686391b
LP
4779 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4780 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4781 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4782 * came this far. */
4783
165a31c0 4784 secure_bits = context->secure_bits;
e66cf1a3 4785
165a31c0
LP
4786 if (needs_sandboxing) {
4787 uint64_t bset;
e66cf1a3 4788
4a055e5a
ZJS
4789 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4790 * (Note this is placed after the general resource limit initialization, see above, in order
4791 * to take precedence.) */
f4170c67
LP
4792 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4793 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4794 *exit_status = EXIT_LIMITS;
12145637 4795 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4796 }
4797 }
4798
37ac2744
JB
4799#if ENABLE_SMACK
4800 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4801 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4802 if (use_smack) {
aa5ae971 4803 r = setup_smack(unit->manager, context, executable_fd);
29ff6247 4804 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4805 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4806 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4807 }
4808 }
4809#endif
4810
165a31c0
LP
4811 bset = context->capability_bounding_set;
4812 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4813 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4814 * instead of us doing that */
4815 if (needs_ambient_hack)
4816 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4817 (UINT64_C(1) << CAP_SETUID) |
4818 (UINT64_C(1) << CAP_SETGID);
4819
4820 if (!cap_test_all(bset)) {
638fd8cc 4821 r = capability_bounding_set_drop(bset, /* right_now= */ false);
ff0af2a1
LP
4822 if (r < 0) {
4823 *exit_status = EXIT_CAPABILITIES;
12145637 4824 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4825 }
4c2630eb 4826 }
3b8bddde 4827
16fcb191
TK
4828 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4829 * keep-caps set.
a954b249
LP
4830 *
4831 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4832 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4833 * the ambient capabilities can be raised as they are present in the permitted and
4834 * inheritable set. However it is possible that someone wants to set ambient capabilities
4835 * without changing the user, so we also set the ambient capabilities here.
4836 *
4837 * The requested ambient capabilities are raised in the inheritable set if the second
4838 * argument is true. */
943800f4 4839 if (!needs_ambient_hack) {
638fd8cc 4840 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
755d4b67
IP
4841 if (r < 0) {
4842 *exit_status = EXIT_CAPABILITIES;
12145637 4843 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4844 }
755d4b67 4845 }
165a31c0 4846 }
755d4b67 4847
fa97f630 4848 /* chroot to root directory first, before we lose the ability to chroot */
9c0c6701 4849 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
fa97f630
JB
4850 if (r < 0)
4851 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4852
165a31c0 4853 if (needs_setuid) {
08f67696 4854 if (uid_is_valid(uid)) {
638fd8cc 4855 r = enforce_user(context, uid, capability_ambient_set);
ff0af2a1
LP
4856 if (r < 0) {
4857 *exit_status = EXIT_USER;
12145637 4858 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4859 }
165a31c0 4860
638fd8cc 4861 if (!needs_ambient_hack && capability_ambient_set != 0) {
755d4b67 4862
16fcb191 4863 /* Raise the ambient capabilities after user change. */
638fd8cc 4864 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
755d4b67
IP
4865 if (r < 0) {
4866 *exit_status = EXIT_CAPABILITIES;
12145637 4867 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4868 }
755d4b67 4869 }
5b6319dc 4870 }
165a31c0 4871 }
d35fbf6b 4872
56ef8db9
JB
4873 /* Apply working directory here, because the working directory might be on NFS and only the user running
4874 * this service might have the correct privilege to change to the working directory */
9c0c6701 4875 r = apply_working_directory(context, params, runtime, home, exit_status);
56ef8db9
JB
4876 if (r < 0)
4877 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4878
165a31c0 4879 if (needs_sandboxing) {
37ac2744 4880 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4881 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4882 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4883 * are restricted. */
4884
349cc4a5 4885#if HAVE_SELINUX
43b1f709 4886 if (use_selinux) {
5cd9cd35
LP
4887 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4888
4889 if (exec_context) {
4890 r = setexeccon(exec_context);
006d1864
TM
4891 if (r < 0) {
4892 if (!context->selinux_context_ignore) {
4893 *exit_status = EXIT_SELINUX_CONTEXT;
4894 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4895 }
4896 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4897 }
4898 }
4899 }
4900#endif
4901
349cc4a5 4902#if HAVE_APPARMOR
43b1f709 4903 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4904 r = aa_change_onexec(context->apparmor_profile);
4905 if (r < 0 && !context->apparmor_profile_ignore) {
4906 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4907 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4908 }
4909 }
4910#endif
4911
a954b249
LP
4912 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4913 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4914 * requires CAP_SETPCAP. */
dbdc4098 4915 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4916 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098 4917 * effective set here.
a954b249
LP
4918 *
4919 * The effective set is overwritten during execve() with the following values:
4920 *
dbdc4098 4921 * - ambient set (for non-root processes)
a954b249 4922 *
dbdc4098
TK
4923 * - (inheritable | bounding) set (for root processes)
4924 *
4925 * Hence there is no security impact to raise it in the effective set before execve
4926 */
a954b249 4927 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
dbdc4098
TK
4928 if (r < 0) {
4929 *exit_status = EXIT_CAPABILITIES;
4930 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4931 }
755d4b67 4932 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4933 *exit_status = EXIT_SECUREBITS;
12145637 4934 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4935 }
dbdc4098 4936 }
5b6319dc 4937
59eeb84b 4938 if (context_has_no_new_privileges(context))
d35fbf6b 4939 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4940 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4941 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4942 }
4943
349cc4a5 4944#if HAVE_SECCOMP
469830d1
LP
4945 r = apply_address_families(unit, context);
4946 if (r < 0) {
4947 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4948 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4949 }
04aa0cb9 4950
469830d1
LP
4951 r = apply_memory_deny_write_execute(unit, context);
4952 if (r < 0) {
4953 *exit_status = EXIT_SECCOMP;
12145637 4954 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4955 }
f4170c67 4956
469830d1
LP
4957 r = apply_restrict_realtime(unit, context);
4958 if (r < 0) {
4959 *exit_status = EXIT_SECCOMP;
12145637 4960 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4961 }
4962
f69567cb
LP
4963 r = apply_restrict_suid_sgid(unit, context);
4964 if (r < 0) {
4965 *exit_status = EXIT_SECCOMP;
4966 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4967 }
4968
add00535
LP
4969 r = apply_restrict_namespaces(unit, context);
4970 if (r < 0) {
4971 *exit_status = EXIT_SECCOMP;
12145637 4972 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4973 }
4974
469830d1
LP
4975 r = apply_protect_sysctl(unit, context);
4976 if (r < 0) {
4977 *exit_status = EXIT_SECCOMP;
12145637 4978 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4979 }
4980
469830d1
LP
4981 r = apply_protect_kernel_modules(unit, context);
4982 if (r < 0) {
4983 *exit_status = EXIT_SECCOMP;
12145637 4984 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4985 }
4986
84703040
KK
4987 r = apply_protect_kernel_logs(unit, context);
4988 if (r < 0) {
4989 *exit_status = EXIT_SECCOMP;
4990 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4991 }
4992
fc64760d
KK
4993 r = apply_protect_clock(unit, context);
4994 if (r < 0) {
4995 *exit_status = EXIT_SECCOMP;
4996 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4997 }
4998
469830d1
LP
4999 r = apply_private_devices(unit, context);
5000 if (r < 0) {
5001 *exit_status = EXIT_SECCOMP;
12145637 5002 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5003 }
5004
5005 r = apply_syscall_archs(unit, context);
5006 if (r < 0) {
5007 *exit_status = EXIT_SECCOMP;
12145637 5008 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5009 }
5010
78e864e5
TM
5011 r = apply_lock_personality(unit, context);
5012 if (r < 0) {
5013 *exit_status = EXIT_SECCOMP;
12145637 5014 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5015 }
5016
9df2cdd8
TM
5017 r = apply_syscall_log(unit, context);
5018 if (r < 0) {
5019 *exit_status = EXIT_SECCOMP;
5020 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5021 }
5022
5cd9cd35
LP
5023 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5024 * by the filter as little as possible. */
165a31c0 5025 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5026 if (r < 0) {
5027 *exit_status = EXIT_SECCOMP;
12145637 5028 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5029 }
5030#endif
b1994387
ILG
5031
5032#if HAVE_LIBBPF
5033 r = apply_restrict_filesystems(unit, context);
5034 if (r < 0) {
5035 *exit_status = EXIT_BPF;
5036 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5037 }
5038#endif
5039
d35fbf6b 5040 }
034c6ed7 5041
00819cc1
LP
5042 if (!strv_isempty(context->unset_environment)) {
5043 char **ee = NULL;
5044
5045 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5046 if (!ee) {
5047 *exit_status = EXIT_MEMORY;
12145637 5048 return log_oom();
00819cc1
LP
5049 }
5050
130d3d22 5051 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5052 }
5053
7ca69792 5054 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
f331434d
LP
5055 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5056
5057 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5058 if (r < 0) {
7ca69792 5059 *exit_status = EXIT_MEMORY;
f331434d 5060 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
7ca69792
AZ
5061 }
5062 final_argv = replaced_argv;
f331434d
LP
5063
5064 if (!strv_isempty(unset_variables)) {
5065 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5066 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5067 }
5068
5069 if (!strv_isempty(bad_variables)) {
5070 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5071 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5072 }
7ca69792
AZ
5073 } else
5074 final_argv = command->argv;
034c6ed7 5075
3ff67ec4 5076 log_command_line(unit, "Executing", executable, final_argv);
dd305ec9 5077
5686391b
LP
5078 if (exec_fd >= 0) {
5079 uint8_t hot = 1;
5080
5081 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5082 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5083
5084 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5085 *exit_status = EXIT_EXEC;
5086 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5087 }
5088 }
5089
a6d9111c 5090 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5091
5092 if (exec_fd >= 0) {
5093 uint8_t hot = 0;
5094
5095 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5096 * that POLLHUP on it no longer means execve() succeeded. */
5097
5098 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5099 *exit_status = EXIT_EXEC;
5100 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5101 }
5102 }
12145637 5103
ff0af2a1 5104 *exit_status = EXIT_EXEC;
9f71ba8d 5105 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5106}
81a2b7ce 5107
/* Forward declarations: exec_spawn() calls both of these helpers before forking, to load the
 * environment files and to fill in the three named stdio fds. */
34cf6c43 5108static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5109static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5110
/* exec_spawn(): forks a child process for 'command' and runs exec_child() in it.
 *
 * Parent side, in order: validate the socket fd configuration, resolve the named stdio fds, load the
 * environment files, create the subcgroup if exec_parameters_get_cgroup_path() reports one (r > 0),
 * fork(), attach the child to the subcgroup, record the start in command->exec_status and store the
 * child's PID in *ret.
 *
 * Child side: exec_child() is invoked and the process always terminates via _exit(); it never returns
 * from this function.
 *
 * Returns 0 in the parent on success. Every failure path returns the result of log_unit_error_errno()
 * (presumably the negative errno that was logged -- confirm against the logging macros); all of them
 * are taken before a child exists or because fork() itself failed. */
f2341e0a
LP
5111int exec_spawn(Unit *unit,
5112 ExecCommand *command,
d35fbf6b
DM
5113 const ExecContext *context,
5114 const ExecParameters *params,
28135da3 5115 ExecRuntime *runtime,
6bb00842 5116 const CGroupContext *cgroup_context,
d35fbf6b 5117 pid_t *ret) {
8351ceae 5118
ee39ca20 5119 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5120 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5121 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5122 size_t n_storage_fds = 0, n_socket_fds = 0;
d35fbf6b 5123 pid_t pid;
8351ceae 5124
f2341e0a 5125 assert(unit);
d35fbf6b
DM
5126 assert(command);
5127 assert(context);
5128 assert(ret);
5129 assert(params);
/* params->fds may only be NULL when no socket or storage fds are passed at all. */
25b583d7 5130 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5131
4b2af439
DDM
5132 LOG_CONTEXT_PUSH_UNIT(unit);
5133
/* If any of stdin/stdout/stderr is connected to a socket, exactly one socket fd must have been passed.
 * It becomes socket_fd, and fds/n_socket_fds/n_storage_fds keep their NULL/0 initializers, so no fd
 * array is handed to the child. Otherwise there is no socket_fd and all passed fds are forwarded. */
d35fbf6b
DM
5134 if (context->std_input == EXEC_INPUT_SOCKET ||
5135 context->std_output == EXEC_OUTPUT_SOCKET ||
5136 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5137
d85ff944
YW
5138 if (params->n_socket_fds > 1)
5139 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5140
d85ff944
YW
5141 if (params->n_socket_fds == 0)
5142 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5143
d35fbf6b
DM
5144 socket_fd = params->fds[0];
5145 } else {
254d1313 5146 socket_fd = -EBADF;
d35fbf6b 5147 fds = params->fds;
9b141911 5148 n_socket_fds = params->n_socket_fds;
25b583d7 5149 n_storage_fds = params->n_storage_fds;
d35fbf6b 5150 }
94f04347 5151
34cf6c43 5152 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5153 if (r < 0)
5154 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5155
f2341e0a 5156 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5157 if (r < 0)
f2341e0a 5158 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5159
9f71ba8d
ZJS
5160 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5161 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5162 mac_selinux_maybe_reload();
5163
3ff67ec4
ZJS
5164 /* We won't know the real executable path until we create the mount namespace in the child, but we
5165 want to log from the parent, so we use the possibly inaccurate path here. */
5166 log_command_line(unit, "About to execute", command->path, command->argv);
12145637 5167
78f93209 5168 if (params->cgroup_path) {
a8b993dc 5169 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
78f93209
LP
5170 if (r < 0)
5171 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
18c1e481
LP
5172 if (r > 0) {
5173 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5174 * realized by the unit logic) */
5175
78f93209
LP
5176 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5177 if (r < 0)
a8b993dc 5178 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
78f93209
LP
5179 }
5180 }
5181
d35fbf6b
DM
5182 pid = fork();
5183 if (pid < 0)
74129a12 5184 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
/* Child process. If exec_child() returns, the command was not executed: r < 0 is logged as a spawn
 * failure together with the step named by exit_status, while r >= 0 must come with EXIT_SUCCESS.
 * Either way the child terminates here with exit_status. */
d35fbf6b
DM
5185
5186 if (pid == 0) {
5fa01ac0 5187 int exit_status;
ff0af2a1 5188
f2341e0a
LP
5189 r = exec_child(unit,
5190 command,
ff0af2a1
LP
5191 context,
5192 params,
5193 runtime,
6bb00842 5194 cgroup_context,
ff0af2a1 5195 socket_fd,
52c239d7 5196 named_iofds,
4c47affc 5197 fds,
9b141911 5198 n_socket_fds,
25b583d7 5199 n_storage_fds,
ff0af2a1 5200 files_env,
00d9ef85 5201 unit->manager->user_lookup_fds[1],
12145637
LP
5202 &exit_status);
5203
e1714f02 5204 if (r < 0) {
5fa01ac0
ZJS
5205 const char *status = ASSERT_PTR(
5206 exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
e1714f02 5207
c2503e35
RH
5208 log_unit_struct_errno(unit, LOG_ERR, r,
5209 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5210 LOG_UNIT_INVOCATION_ID(unit),
5211 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5212 status, command->path),
5213 "EXECUTABLE=%s", command->path);
5fa01ac0
ZJS
5214 } else
5215 assert(exit_status == EXIT_SUCCESS);
4c2630eb 5216
ff0af2a1 5217 _exit(exit_status);
034c6ed7
LP
5218 }
5219
/* Parent process from here on. */
f2341e0a 5220 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5221
78f93209
LP
5222 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5223 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5224 * process will be killed too). */
5225 if (subcgroup_path)
5226 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5227
b58b4116 5228 exec_status_start(&command->exec_status, pid);
9fb86720 5229
034c6ed7 5230 *ret = pid;
5cb5a6ff
LP
5231 return 0;
5232}
5233
/* exec_context_init(): assigns the non-zero defaults of an ExecContext.
 *
 * Only the fields listed below are written; everything else in *c is left untouched.
 * NOTE(review): presumably the caller hands in a zero-initialized structure -- confirm at the call sites. */
034c6ed7
LP
5234void exec_context_init(ExecContext *c) {
5235 assert(c);
5236
4c12626c 5237 c->umask = 0022;
0692548c 5238 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5239 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5240 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5241 c->syslog_level_prefix = true;
353e12c2 5242 c->ignore_sigpipe = true;
3a43da28 5243 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5244 c->personality = PERSONALITY_INVALID;
/* Every directory type starts out with mode 0755. */
5b10116e
ZJS
5245 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5246 c->directories[t].mode = 0755;
12213aed 5247 c->timeout_clean_usec = USEC_INFINITY;
3fd5190b 5248 c->capability_bounding_set = CAP_MASK_UNSET;
/* Compile-time check that the initial namespace flags differ from NAMESPACE_FLAGS_ALL. */
aa9d574d
YW
5249 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5250 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5251 c->log_level_max = -1;
005bfaf1
TM
5252#if HAVE_SECCOMP
5253 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5254#endif
51462135
DDM
5255 c->tty_rows = UINT_MAX;
5256 c->tty_cols = UINT_MAX;
b070c7c0 5257 numa_policy_reset(&c->numa_policy);
24002121 5258 c->private_mounts = -1;
/* Negative: exec_child() only calls prctl(PR_SET_MEMORY_MERGE) when memory_ksm >= 0. */
85614c6e 5259 c->memory_ksm = -1;
034c6ed7
LP
5260}
5261
613b411c 5262void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5263 assert(c);
5264
6796073e
LP
5265 c->environment = strv_free(c->environment);
5266 c->environment_files = strv_free(c->environment_files);
b4c14404 5267 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5268 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5269
31ce987c 5270 rlimit_free_all(c->rlimit);
034c6ed7 5271
5b10116e 5272 for (size_t l = 0; l < 3; l++) {
52c239d7 5273 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5274 c->stdio_file[l] = mfree(c->stdio_file[l]);
5275 }
52c239d7 5276
a1e58e8e
LP
5277 c->working_directory = mfree(c->working_directory);
5278 c->root_directory = mfree(c->root_directory);
915e6d16 5279 c->root_image = mfree(c->root_image);
18d73705 5280 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5281 c->root_hash = mfree(c->root_hash);
5282 c->root_hash_size = 0;
5283 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5284 c->root_hash_sig = mfree(c->root_hash_sig);
5285 c->root_hash_sig_size = 0;
5286 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5287 c->root_verity = mfree(c->root_verity);
93f59701 5288 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5289 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5290 c->tty_path = mfree(c->tty_path);
5291 c->syslog_identifier = mfree(c->syslog_identifier);
5292 c->user = mfree(c->user);
5293 c->group = mfree(c->group);
034c6ed7 5294
6796073e 5295 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5296
a1e58e8e 5297 c->pam_name = mfree(c->pam_name);
5b6319dc 5298
2a624c36
AP
5299 c->read_only_paths = strv_free(c->read_only_paths);
5300 c->read_write_paths = strv_free(c->read_write_paths);
5301 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5302 c->exec_paths = strv_free(c->exec_paths);
5303 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5304 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5305
d2d6c096 5306 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5307 c->bind_mounts = NULL;
5308 c->n_bind_mounts = 0;
2abd4e38
YW
5309 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5310 c->temporary_filesystems = NULL;
5311 c->n_temporary_filesystems = 0;
b3d13314 5312 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5313
0985c7c4 5314 cpu_set_reset(&c->cpu_set);
b070c7c0 5315 numa_policy_reset(&c->numa_policy);
86a3475b 5316
a1e58e8e
LP
5317 c->utmp_id = mfree(c->utmp_id);
5318 c->selinux_context = mfree(c->selinux_context);
5319 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5320 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5321
9b412709 5322 c->restrict_filesystems = set_free_free(c->restrict_filesystems);
b1994387 5323
8cfa775f 5324 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5325 c->syscall_archs = set_free(c->syscall_archs);
5326 c->address_families = set_free(c->address_families);
e66cf1a3 5327
5b10116e 5328 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5329 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5330
5331 c->log_level_max = -1;
5332
5333 exec_context_free_log_extra_fields(c);
9b412709
FS
5334 c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5335 c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
08f3be7a 5336
5ac1530e
ZJS
5337 c->log_ratelimit_interval_usec = 0;
5338 c->log_ratelimit_burst = 0;
90fc172e 5339
08f3be7a
LP
5340 c->stdin_data = mfree(c->stdin_data);
5341 c->stdin_data_size = 0;
a8d08f39
LP
5342
5343 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5344 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5345
5346 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5347
43144be4 5348 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5349 c->set_credentials = hashmap_free(c->set_credentials);
9b412709 5350 c->import_credentials = set_free_free(c->import_credentials);
84be0c71
LP
5351
5352 c->root_image_policy = image_policy_free(c->root_image_policy);
5353 c->mount_image_policy = image_policy_free(c->mount_image_policy);
5354 c->extension_image_policy = image_policy_free(c->extension_image_policy);
e66cf1a3
LP
5355}
5356
34cf6c43 5357int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5358 assert(c);
5359
5360 if (!runtime_prefix)
5361 return 0;
5362
211a3d87 5363 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5364 _cleanup_free_ char *p = NULL;
e66cf1a3 5365
494d0247 5366 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5367 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5368 else
211a3d87 5369 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5370 if (!p)
5371 return -ENOMEM;
5372
7bc4bf4a
LP
5373 /* We execute this synchronously, since we need to be sure this is gone when we start the
5374 * service next. */
c6878637 5375 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5376
211a3d87
LB
5377 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5378 _cleanup_free_ char *symlink_abs = NULL;
5379
5380 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5381 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5382 else
5383 symlink_abs = path_join(runtime_prefix, *symlink);
5384 if (!symlink_abs)
5385 return -ENOMEM;
5386
5387 (void) unlink(symlink_abs);
5388 }
e66cf1a3
LP
5389 }
5390
5391 return 0;
5cb5a6ff
LP
5392}
5393
b9f976fb
MK
5394int exec_context_destroy_mount_ns_dir(Unit *u) {
5395 _cleanup_free_ char *p = NULL;
5396
5397 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5398 return 0;
5399
5400 p = path_join("/run/systemd/propagate/", u->id);
5401 if (!p)
5402 return -ENOMEM;
5403
5404 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5405 if (rmdir(p) < 0 && errno != ENOENT)
5406 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5407
5408 return 0;
5409}
5410
34cf6c43 5411static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5412 assert(c);
5413
a1e58e8e 5414 c->path = mfree(c->path);
6796073e 5415 c->argv = strv_free(c->argv);
43d0fcbd
LP
5416}
5417
da6053d0 5418void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5419 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5420 exec_command_done(c+i);
5421}
5422
f1acf85a 5423ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5424 ExecCommand *i;
5425
52e3671b 5426 while ((i = LIST_POP(command, c))) {
43d0fcbd 5427 exec_command_done(i);
5cb5a6ff
LP
5428 free(i);
5429 }
f1acf85a
ZJS
5430
5431 return NULL;
5cb5a6ff
LP
5432}
5433
da6053d0 5434void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5435 for (size_t i = 0; i < n; i++)
f1acf85a 5436 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5437}
5438
6a1d4d9f 5439void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5440 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5441 exec_status_reset(&c[i].exec_status);
5442}
5443
5444void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5445 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5446 LIST_FOREACH(command, z, c[i])
5447 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5448}
5449
039f0e70 5450typedef struct InvalidEnvInfo {
34cf6c43 5451 const Unit *unit;
039f0e70
LP
5452 const char *path;
5453} InvalidEnvInfo;
5454
5455static void invalid_env(const char *p, void *userdata) {
5456 InvalidEnvInfo *info = userdata;
5457
f2341e0a 5458 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5459}
5460
52c239d7
LB
5461const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5462 assert(c);
5463
5464 switch (fd_index) {
5073ff6b 5465
52c239d7
LB
5466 case STDIN_FILENO:
5467 if (c->std_input != EXEC_INPUT_NAMED_FD)
5468 return NULL;
5073ff6b 5469
52c239d7 5470 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5471
52c239d7
LB
5472 case STDOUT_FILENO:
5473 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5474 return NULL;
5073ff6b 5475
52c239d7 5476 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5477
52c239d7
LB
5478 case STDERR_FILENO:
5479 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5480 return NULL;
5073ff6b 5481
52c239d7 5482 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5483
52c239d7
LB
5484 default:
5485 return NULL;
5486 }
5487}
5488
2caa38e9
LP
5489static int exec_context_named_iofds(
5490 const ExecContext *c,
5491 const ExecParameters *p,
5492 int named_iofds[static 3]) {
5493
5b10116e 5494 size_t targets;
56fbd561 5495 const char* stdio_fdname[3];
da6053d0 5496 size_t n_fds;
52c239d7
LB
5497
5498 assert(c);
5499 assert(p);
2caa38e9 5500 assert(named_iofds);
52c239d7
LB
5501
5502 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5503 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5504 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5505
5b10116e 5506 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5507 stdio_fdname[i] = exec_context_fdname(c, i);
5508
4c47affc
FB
5509 n_fds = p->n_storage_fds + p->n_socket_fds;
5510
5b10116e 5511 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5512 if (named_iofds[STDIN_FILENO] < 0 &&
5513 c->std_input == EXEC_INPUT_NAMED_FD &&
5514 stdio_fdname[STDIN_FILENO] &&
5515 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5516
52c239d7
LB
5517 named_iofds[STDIN_FILENO] = p->fds[i];
5518 targets--;
56fbd561
ZJS
5519
5520 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5521 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5522 stdio_fdname[STDOUT_FILENO] &&
5523 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5524
52c239d7
LB
5525 named_iofds[STDOUT_FILENO] = p->fds[i];
5526 targets--;
56fbd561
ZJS
5527
5528 } else if (named_iofds[STDERR_FILENO] < 0 &&
5529 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5530 stdio_fdname[STDERR_FILENO] &&
5531 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5532
52c239d7
LB
5533 named_iofds[STDERR_FILENO] = p->fds[i];
5534 targets--;
5535 }
5536
56fbd561 5537 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5538}
5539
398a5009
ZJS
5540static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5541 _cleanup_strv_free_ char **v = NULL;
398a5009 5542 int r;
8c7be95e
LP
5543
5544 assert(c);
398a5009 5545 assert(ret);
8c7be95e
LP
5546
5547 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5548 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5549 bool ignore = false;
5550 char *fn = *i;
8c7be95e
LP
5551
5552 if (fn[0] == '-') {
5553 ignore = true;
313cefa1 5554 fn++;
8c7be95e
LP
5555 }
5556
5557 if (!path_is_absolute(fn)) {
8c7be95e
LP
5558 if (ignore)
5559 continue;
8c7be95e
LP
5560 return -EINVAL;
5561 }
5562
2bef10ab 5563 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5564 r = safe_glob(fn, 0, &pglob);
5565 if (r < 0) {
2bef10ab
PL
5566 if (ignore)
5567 continue;
398a5009 5568 return r;
2bef10ab 5569 }
8c7be95e 5570
d8c92e8b
ZJS
5571 /* When we don't match anything, -ENOENT should be returned */
5572 assert(pglob.gl_pathc > 0);
5573
fcc06682 5574 for (size_t n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5575 _cleanup_strv_free_ char **p = NULL;
5576
5577 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5578 if (r < 0) {
2bef10ab
PL
5579 if (ignore)
5580 continue;
398a5009 5581 return r;
e9c1ea9d 5582 }
398a5009 5583
ebc05a09 5584 /* Log invalid environment variables with filename */
039f0e70
LP
5585 if (p) {
5586 InvalidEnvInfo info = {
f2341e0a 5587 .unit = unit,
039f0e70
LP
5588 .path = pglob.gl_pathv[n]
5589 };
5590
5591 p = strv_env_clean_with_callback(p, invalid_env, &info);
5592 }
8c7be95e 5593
398a5009
ZJS
5594 if (!v)
5595 v = TAKE_PTR(p);
2bef10ab 5596 else {
398a5009 5597 char **m = strv_env_merge(v, p);
c84a9488 5598 if (!m)
2bef10ab 5599 return -ENOMEM;
2bef10ab 5600
398a5009 5601 strv_free_and_replace(v, m);
2bef10ab 5602 }
8c7be95e
LP
5603 }
5604 }
5605
398a5009 5606 *ret = TAKE_PTR(v);
8c7be95e
LP
5607
5608 return 0;
5609}
5610
6ac8fdc9 5611static bool tty_may_match_dev_console(const char *tty) {
7b912648 5612 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5613
1e22b5cd
LP
5614 if (!tty)
5615 return true;
5616
a119ec7c 5617 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5618
5619 /* trivial identity? */
5620 if (streq(tty, "console"))
5621 return true;
5622
7b912648
LP
5623 if (resolve_dev_console(&resolved) < 0)
5624 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5625
5626 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5627 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5628}
5629
6c0ae739
LP
5630static bool exec_context_may_touch_tty(const ExecContext *ec) {
5631 assert(ec);
1e22b5cd 5632
6c0ae739 5633 return ec->tty_reset ||
1e22b5cd
LP
5634 ec->tty_vhangup ||
5635 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5636 is_terminal_input(ec->std_input) ||
5637 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5638 is_terminal_output(ec->std_error);
5639}
5640
5641bool exec_context_may_touch_console(const ExecContext *ec) {
5642
5643 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5644 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5645}
5646
/* Writes each string of the NULL-terminated vector l to f, each preceded by a single space. A NULL
 * vector writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **item = l; item && *item; item++)
                fprintf(f, " %s", *item);
}
5653
ddc155b2
TM
/* Writes one "<prefix><name>: item item ..." line for the string vector to f. Nothing at all is written
 * if the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5665
34cf6c43 5666void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5667 int r;
9eba9da4 5668
5cb5a6ff
LP
5669 assert(c);
5670 assert(f);
5671
4ad49000 5672 prefix = strempty(prefix);
5cb5a6ff
LP
5673
5674 fprintf(f,
94f04347
LP
5675 "%sUMask: %04o\n"
5676 "%sWorkingDirectory: %s\n"
451a074f 5677 "%sRootDirectory: %s\n"
9c0c6701 5678 "%sRootEphemeral: %s\n"
15ae422b 5679 "%sNonBlocking: %s\n"
64747e2d 5680 "%sPrivateTmp: %s\n"
7f112f50 5681 "%sPrivateDevices: %s\n"
59eeb84b 5682 "%sProtectKernelTunables: %s\n"
e66a2f65 5683 "%sProtectKernelModules: %s\n"
84703040 5684 "%sProtectKernelLogs: %s\n"
fc64760d 5685 "%sProtectClock: %s\n"
59eeb84b 5686 "%sProtectControlGroups: %s\n"
d251207d
LP
5687 "%sPrivateNetwork: %s\n"
5688 "%sPrivateUsers: %s\n"
1b8689f9
LP
5689 "%sProtectHome: %s\n"
5690 "%sProtectSystem: %s\n"
5d997827 5691 "%sMountAPIVFS: %s\n"
f3e43635 5692 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5693 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5694 "%sRestrictRealtime: %s\n"
f69567cb 5695 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5696 "%sKeyringMode: %s\n"
4e399953
LP
5697 "%sProtectHostname: %s\n"
5698 "%sProtectProc: %s\n"
5699 "%sProcSubset: %s\n",
5cb5a6ff 5700 prefix, c->umask,
14eb3285
LP
5701 prefix, empty_to_root(c->working_directory),
5702 prefix, empty_to_root(c->root_directory),
9c0c6701 5703 prefix, yes_no(c->root_ephemeral),
15ae422b 5704 prefix, yes_no(c->non_blocking),
64747e2d 5705 prefix, yes_no(c->private_tmp),
7f112f50 5706 prefix, yes_no(c->private_devices),
59eeb84b 5707 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5708 prefix, yes_no(c->protect_kernel_modules),
84703040 5709 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5710 prefix, yes_no(c->protect_clock),
59eeb84b 5711 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5712 prefix, yes_no(c->private_network),
5713 prefix, yes_no(c->private_users),
1b8689f9
LP
5714 prefix, protect_home_to_string(c->protect_home),
5715 prefix, protect_system_to_string(c->protect_system),
5e98086d 5716 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5717 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5718 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5719 prefix, yes_no(c->restrict_realtime),
f69567cb 5720 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5721 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5722 prefix, yes_no(c->protect_hostname),
5723 prefix, protect_proc_to_string(c->protect_proc),
5724 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5725
915e6d16
LP
5726 if (c->root_image)
5727 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5728
18d73705 5729 if (c->root_image_options) {
18d73705
LB
5730 fprintf(f, "%sRootImageOptions:", prefix);
5731 LIST_FOREACH(mount_options, o, c->root_image_options)
5732 if (!isempty(o->options))
9ece6444
LB
5733 fprintf(f, " %s:%s",
5734 partition_designator_to_string(o->partition_designator),
5735 o->options);
18d73705
LB
5736 fprintf(f, "\n");
5737 }
5738
0389f4fa
LB
5739 if (c->root_hash) {
5740 _cleanup_free_ char *encoded = NULL;
5741 encoded = hexmem(c->root_hash, c->root_hash_size);
5742 if (encoded)
5743 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5744 }
5745
5746 if (c->root_hash_path)
5747 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5748
d4d55b0d
LB
5749 if (c->root_hash_sig) {
5750 _cleanup_free_ char *encoded = NULL;
5751 ssize_t len;
5752 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5753 if (len)
5754 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5755 }
5756
5757 if (c->root_hash_sig_path)
5758 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5759
0389f4fa
LB
5760 if (c->root_verity)
5761 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5762
8c7be95e
LP
5763 STRV_FOREACH(e, c->environment)
5764 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5765
5766 STRV_FOREACH(e, c->environment_files)
5767 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5768
b4c14404
FB
5769 STRV_FOREACH(e, c->pass_environment)
5770 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5771
00819cc1
LP
5772 STRV_FOREACH(e, c->unset_environment)
5773 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5774
53f47dfc
YW
5775 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5776
5b10116e 5777 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5778 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5779
211a3d87
LB
5780 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5781 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5782
5783 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5784 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5785 }
3536f49e 5786 }
c2bbd90b 5787
5291f26d 5788 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5789
fb33a393 5790 if (c->nice_set)
5291f26d 5791 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5792
dd6c17b1 5793 if (c->oom_score_adjust_set)
5291f26d 5794 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5795
ad21e542 5796 if (c->coredump_filter_set)
5291f26d 5797 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5798
5b10116e 5799 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5800 if (c->rlimit[i]) {
4c3a2b84 5801 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5802 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5803 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5804 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5805 }
94f04347 5806
f8b69d1d 5807 if (c->ioprio_set) {
1756a011 5808 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5809
5bead76e 5810 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5811 if (r >= 0)
5812 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5813
5bead76e 5814 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5815 }
94f04347 5816
f8b69d1d 5817 if (c->cpu_sched_set) {
1756a011 5818 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5819
837df140
YW
5820 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5821 if (r >= 0)
5822 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5823
94f04347 5824 fprintf(f,
38b48754
LP
5825 "%sCPUSchedulingPriority: %i\n"
5826 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5827 prefix, c->cpu_sched_priority,
5828 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5829 }
94f04347 5830
0985c7c4 5831 if (c->cpu_set.set) {
e7fca352
MS
5832 _cleanup_free_ char *affinity = NULL;
5833
5834 affinity = cpu_set_to_range_string(&c->cpu_set);
5835 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5836 }
5837
b070c7c0
MS
5838 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5839 _cleanup_free_ char *nodes = NULL;
5840
5841 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5842 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5843 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5844 }
5845
3a43da28 5846 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5847 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5848
5849 fprintf(f,
80876c20
LP
5850 "%sStandardInput: %s\n"
5851 "%sStandardOutput: %s\n"
5852 "%sStandardError: %s\n",
5853 prefix, exec_input_to_string(c->std_input),
5854 prefix, exec_output_to_string(c->std_output),
5855 prefix, exec_output_to_string(c->std_error));
5856
befc4a80
LP
5857 if (c->std_input == EXEC_INPUT_NAMED_FD)
5858 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5859 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5860 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5861 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5862 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5863
5864 if (c->std_input == EXEC_INPUT_FILE)
5865 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5866 if (c->std_output == EXEC_OUTPUT_FILE)
5867 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5868 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5869 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5870 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5871 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5872 if (c->std_error == EXEC_OUTPUT_FILE)
5873 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5874 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5875 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5876 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5877 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5878
80876c20
LP
5879 if (c->tty_path)
5880 fprintf(f,
6ea832a2
LP
5881 "%sTTYPath: %s\n"
5882 "%sTTYReset: %s\n"
5883 "%sTTYVHangup: %s\n"
51462135
DDM
5884 "%sTTYVTDisallocate: %s\n"
5885 "%sTTYRows: %u\n"
5886 "%sTTYColumns: %u\n",
6ea832a2
LP
5887 prefix, c->tty_path,
5888 prefix, yes_no(c->tty_reset),
5889 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5890 prefix, yes_no(c->tty_vt_disallocate),
5891 prefix, c->tty_rows,
5892 prefix, c->tty_cols);
94f04347 5893
9f6444eb 5894 if (IN_SET(c->std_output,
9f6444eb
LP
5895 EXEC_OUTPUT_KMSG,
5896 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5897 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5898 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5899 IN_SET(c->std_error,
9f6444eb
LP
5900 EXEC_OUTPUT_KMSG,
5901 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5902 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5903 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5904
5ce70e5b 5905 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5906
837df140
YW
5907 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5908 if (r >= 0)
5909 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5910
837df140
YW
5911 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5912 if (r >= 0)
5913 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5914 }
94f04347 5915
d3070fbd
LP
5916 if (c->log_level_max >= 0) {
5917 _cleanup_free_ char *t = NULL;
5918
5919 (void) log_level_to_string_alloc(c->log_level_max, &t);
5920
5921 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5922 }
5923
5291f26d 5924 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5925 fprintf(f,
5926 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5927 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5928
5ac1530e
ZJS
5929 if (c->log_ratelimit_burst > 0)
5930 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5931
523ea123
QD
5932 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5933 fprintf(f, "%sLogFilterPatterns:", prefix);
5934
5935 char *pattern;
5936 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5937 fprintf(f, " %s", pattern);
5938 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5939 fprintf(f, " ~%s", pattern);
5940 fputc('\n', f);
5941 }
5942
5b10116e
ZJS
5943 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5944 fprintf(f, "%sLogExtraFields: ", prefix);
5945 fwrite(c->log_extra_fields[j].iov_base,
5946 1, c->log_extra_fields[j].iov_len,
5947 f);
5948 fputc('\n', f);
d3070fbd
LP
5949 }
5950
91dd5f7c
LP
5951 if (c->log_namespace)
5952 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5953
07d46372
YW
5954 if (c->secure_bits) {
5955 _cleanup_free_ char *str = NULL;
5956
5957 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5958 if (r >= 0)
5959 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5960 }
94f04347 5961
3fd5190b 5962 if (c->capability_bounding_set != CAP_MASK_UNSET) {
dd1f5bd0 5963 _cleanup_free_ char *str = NULL;
94f04347 5964
8142d735 5965 r = capability_set_to_string(c->capability_bounding_set, &str);
dd1f5bd0
YW
5966 if (r >= 0)
5967 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5968 }
5969
5970 if (c->capability_ambient_set != 0) {
dd1f5bd0 5971 _cleanup_free_ char *str = NULL;
755d4b67 5972
8142d735 5973 r = capability_set_to_string(c->capability_ambient_set, &str);
dd1f5bd0
YW
5974 if (r >= 0)
5975 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5976 }
5977
5978 if (c->user)
f2d3769a 5979 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5980 if (c->group)
f2d3769a 5981 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5982
29206d46
LP
5983 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5984
ddc155b2 5985 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5986
5b6319dc 5987 if (c->pam_name)
f2d3769a 5988 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5989
ddc155b2
TM
5990 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5991 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5992 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5993 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5994 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 5995 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 5996
5b10116e
ZJS
5997 for (size_t i = 0; i < c->n_bind_mounts; i++)
5998 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5999 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6000 c->bind_mounts[i].ignore_enoent ? "-": "",
6001 c->bind_mounts[i].source,
6002 c->bind_mounts[i].destination,
6003 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6004
5b10116e
ZJS
6005 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6006 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6007
5b10116e
ZJS
6008 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6009 t->path,
6010 isempty(t->options) ? "" : ":",
6011 strempty(t->options));
6012 }
2abd4e38 6013
169c1bda
LP
6014 if (c->utmp_id)
6015 fprintf(f,
6016 "%sUtmpIdentifier: %s\n",
6017 prefix, c->utmp_id);
7b52a628
MS
6018
6019 if (c->selinux_context)
6020 fprintf(f,
5f8640fb
LP
6021 "%sSELinuxContext: %s%s\n",
6022 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6023
80c21aea
WC
6024 if (c->apparmor_profile)
6025 fprintf(f,
6026 "%sAppArmorProfile: %s%s\n",
6027 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6028
6029 if (c->smack_process_label)
6030 fprintf(f,
6031 "%sSmackProcessLabel: %s%s\n",
6032 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6033
050f7277 6034 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6035 fprintf(f,
6036 "%sPersonality: %s\n",
6037 prefix, strna(personality_to_string(c->personality)));
6038
78e864e5
TM
6039 fprintf(f,
6040 "%sLockPersonality: %s\n",
6041 prefix, yes_no(c->lock_personality));
6042
17df7223 6043 if (c->syscall_filter) {
17df7223 6044 fprintf(f,
57183d11 6045 "%sSystemCallFilter: ",
17df7223
LP
6046 prefix);
6047
6b000af4 6048 if (!c->syscall_allow_list)
17df7223
LP
6049 fputc('~', f);
6050
349cc4a5 6051#if HAVE_SECCOMP
d5a99b7c
JJ
6052 void *id, *val;
6053 bool first = true;
90e74a66 6054 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6055 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6056 const char *errno_name = NULL;
6057 int num = PTR_TO_INT(val);
17df7223
LP
6058
6059 if (first)
6060 first = false;
6061 else
6062 fputc(' ', f);
6063
57183d11 6064 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6065 fputs(strna(name), f);
8cfa775f
YW
6066
6067 if (num >= 0) {
005bfaf1 6068 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6069 if (errno_name)
6070 fprintf(f, ":%s", errno_name);
6071 else
6072 fprintf(f, ":%d", num);
6073 }
17df7223 6074 }
351a19b1 6075#endif
17df7223
LP
6076
6077 fputc('\n', f);
6078 }
6079
57183d11 6080 if (c->syscall_archs) {
57183d11
LP
6081 fprintf(f,
6082 "%sSystemCallArchitectures:",
6083 prefix);
6084
349cc4a5 6085#if HAVE_SECCOMP
d5a99b7c 6086 void *id;
90e74a66 6087 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6088 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6089#endif
6090 fputc('\n', f);
6091 }
6092
add00535
LP
6093 if (exec_context_restrict_namespaces_set(c)) {
6094 _cleanup_free_ char *s = NULL;
6095
86c2a9f1 6096 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6097 if (r >= 0)
6098 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6099 prefix, strna(s));
add00535
LP
6100 }
6101
b1994387 6102#if HAVE_LIBBPF
8fe84dc8
YW
6103 if (exec_context_restrict_filesystems_set(c)) {
6104 char *fs;
6105 SET_FOREACH(fs, c->restrict_filesystems)
6106 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6107 }
b1994387
ILG
6108#endif
6109
a8d08f39
LP
6110 if (c->network_namespace_path)
6111 fprintf(f,
6112 "%sNetworkNamespacePath: %s\n",
6113 prefix, c->network_namespace_path);
6114
3df90f24 6115 if (c->syscall_errno > 0) {
3df90f24
YW
6116 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6117
005bfaf1 6118#if HAVE_SECCOMP
d5a99b7c 6119 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6120 if (errno_name)
005bfaf1 6121 fputs(errno_name, f);
3df90f24 6122 else
005bfaf1
TM
6123 fprintf(f, "%d", c->syscall_errno);
6124#endif
6125 fputc('\n', f);
3df90f24 6126 }
b3d13314 6127
5b10116e 6128 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6129 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6130 c->mount_images[i].ignore_enoent ? "-": "",
6131 c->mount_images[i].source,
79e20ceb 6132 c->mount_images[i].destination);
427353f6 6133 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6134 fprintf(f, ":%s:%s",
427353f6 6135 partition_designator_to_string(o->partition_designator),
79e20ceb 6136 strempty(o->options));
427353f6
LB
6137 fprintf(f, "\n");
6138 }
93f59701
LB
6139
6140 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6141 fprintf(f, "%sExtensionImages: %s%s", prefix,
6142 c->extension_images[i].ignore_enoent ? "-": "",
6143 c->extension_images[i].source);
6144 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6145 fprintf(f, ":%s:%s",
6146 partition_designator_to_string(o->partition_designator),
6147 strempty(o->options));
6148 fprintf(f, "\n");
6149 }
a07b9926
LB
6150
6151 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6152}
6153
34cf6c43 6154bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6155 assert(c);
6156
61233823 6157 /* Returns true if the process forked off would run under
a931ad47
LP
6158 * an unchanged UID or as root. */
6159
6160 if (!c->user)
6161 return true;
6162
6163 if (streq(c->user, "root") || streq(c->user, "0"))
6164 return true;
6165
6166 return false;
6167}
6168
34cf6c43 6169int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6170 int p;
6171
6172 assert(c);
6173
6174 if (c->ioprio_set)
6175 return c->ioprio;
6176
6177 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6178 if (p < 0)
0692548c 6179 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6180
8b330d7d 6181 return ioprio_normalize(p);
7f452159
LP
6182}
6183
5e98086d
ZJS
6184bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6185 assert(c);
6186
61198784 6187 /* Explicit setting wins */
5e98086d
ZJS
6188 if (c->mount_apivfs_set)
6189 return c->mount_apivfs;
6190
61198784 6191 /* Default to "yes" if root directory or image are specified */
74e12520 6192 if (exec_context_with_rootfs(c))
61198784
ZJS
6193 return true;
6194
5e98086d
ZJS
6195 return false;
6196}
6197
d3070fbd 6198void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6199 assert(c);
6200
5b10116e 6201 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6202 free(c->log_extra_fields[l].iov_base);
6203 c->log_extra_fields = mfree(c->log_extra_fields);
6204 c->n_log_extra_fields = 0;
6205}
6206
6f765baf 6207void exec_context_revert_tty(ExecContext *c) {
254d1313 6208 _cleanup_close_ int fd = -EBADF;
0ba976e8
LP
6209 const char *path;
6210 struct stat st;
6f765baf
LP
6211 int r;
6212
6213 assert(c);
6214
6215 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6216 exec_context_tty_reset(c, NULL);
6217
6218 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6219 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6220 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6221 if (!exec_context_may_touch_tty(c))
6222 return;
6f765baf 6223
0ba976e8
LP
6224 path = exec_context_tty_path(c);
6225 if (!path)
6226 return;
6f765baf 6227
0ba976e8
LP
6228 fd = open(path, O_PATH|O_CLOEXEC);
6229 if (fd < 0)
6230 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6231 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6232 path);
6233
6234 if (fstat(fd, &st) < 0)
6235 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6236
6237 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6238 * if things are a character device, since a proper check either means we'd have to open the TTY and
6239 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6240 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6241 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6242 if (!S_ISCHR(st.st_mode))
6243 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6244
6245 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6246 if (r < 0)
6247 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6248}
6249
4c2f5842
LP
6250int exec_context_get_clean_directories(
6251 ExecContext *c,
6252 char **prefix,
6253 ExecCleanMask mask,
6254 char ***ret) {
6255
6256 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6257 int r;
6258
6259 assert(c);
6260 assert(prefix);
6261 assert(ret);
6262
5b10116e 6263 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6264 if (!FLAGS_SET(mask, 1U << t))
6265 continue;
6266
6267 if (!prefix[t])
6268 continue;
6269
211a3d87 6270 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6271 char *j;
6272
211a3d87 6273 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6274 if (!j)
6275 return -ENOMEM;
6276
6277 r = strv_consume(&l, j);
6278 if (r < 0)
6279 return r;
7f622a19
YW
6280
6281 /* Also remove private directories unconditionally. */
6282 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6283 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6284 if (!j)
6285 return -ENOMEM;
6286
6287 r = strv_consume(&l, j);
6288 if (r < 0)
6289 return r;
6290 }
6291
211a3d87
LB
6292 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6293 j = path_join(prefix[t], *symlink);
7f622a19
YW
6294 if (!j)
6295 return -ENOMEM;
6296
6297 r = strv_consume(&l, j);
6298 if (r < 0)
6299 return r;
6300 }
4c2f5842
LP
6301 }
6302 }
6303
6304 *ret = TAKE_PTR(l);
6305 return 0;
6306}
6307
6308int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6309 ExecCleanMask mask = 0;
6310
6311 assert(c);
6312 assert(ret);
6313
6314 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6315 if (c->directories[t].n_items > 0)
4c2f5842
LP
6316 mask |= 1U << t;
6317
6318 *ret = mask;
6319 return 0;
6320}
6321
b58b4116 6322void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6323 assert(s);
5cb5a6ff 6324
2ed26ed0
LP
6325 *s = (ExecStatus) {
6326 .pid = pid,
6327 };
6328
b58b4116
LP
6329 dual_timestamp_get(&s->start_timestamp);
6330}
6331
34cf6c43 6332void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6333 assert(s);
6334
d46b79bb 6335 if (s->pid != pid)
2ed26ed0
LP
6336 *s = (ExecStatus) {
6337 .pid = pid,
6338 };
b58b4116 6339
63983207 6340 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6341
034c6ed7
LP
6342 s->code = code;
6343 s->status = status;
169c1bda 6344
6f765baf
LP
6345 if (context && context->utmp_id)
6346 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6347}
6348
6a1d4d9f
LP
6349void exec_status_reset(ExecStatus *s) {
6350 assert(s);
6351
6352 *s = (ExecStatus) {};
6353}
6354
34cf6c43 6355void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6356 assert(s);
6357 assert(f);
6358
9fb86720
LP
6359 if (s->pid <= 0)
6360 return;
6361
4c940960
LP
6362 prefix = strempty(prefix);
6363
9fb86720 6364 fprintf(f,
ccd06097
ZJS
6365 "%sPID: "PID_FMT"\n",
6366 prefix, s->pid);
9fb86720 6367
af9d16e1 6368 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6369 fprintf(f,
6370 "%sStart Timestamp: %s\n",
04f5c018 6371 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6372
af9d16e1 6373 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6374 fprintf(f,
6375 "%sExit Timestamp: %s\n"
6376 "%sExit Code: %s\n"
6377 "%sExit Status: %i\n",
04f5c018 6378 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6379 prefix, sigchld_code_to_string(s->code),
6380 prefix, s->status);
5cb5a6ff 6381}
44d8db9e 6382
34cf6c43 6383static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6384 _cleanup_free_ char *cmd = NULL;
4c940960 6385 const char *prefix2;
44d8db9e
LP
6386
6387 assert(c);
6388 assert(f);
6389
4c940960 6390 prefix = strempty(prefix);
63c372cb 6391 prefix2 = strjoina(prefix, "\t");
44d8db9e 6392
4ef15008 6393 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
38553034 6394
44d8db9e
LP
6395 fprintf(f,
6396 "%sCommand Line: %s\n",
38553034 6397 prefix, strnull(cmd));
44d8db9e 6398
9fb86720 6399 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6400}
6401
6402void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6403 assert(f);
6404
4c940960 6405 prefix = strempty(prefix);
44d8db9e 6406
03677889
YW
6407 LIST_FOREACH(command, i, c)
6408 exec_command_dump(i, f, prefix);
44d8db9e 6409}
94f04347 6410
a6a80b4f
LP
6411void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6412 ExecCommand *end;
6413
6414 assert(l);
6415 assert(e);
6416
6417 if (*l) {
35b8ca3a 6418 /* It's kind of important, that we keep the order here */
cc232fa0 6419 end = LIST_FIND_TAIL(command, *l);
71fda00f 6420 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f 6421 } else
3ff67ec4 6422 *l = e;
a6a80b4f
LP
6423}
6424
26fd040d
LP
6425int exec_command_set(ExecCommand *c, const char *path, ...) {
6426 va_list ap;
6427 char **l, *p;
6428
6429 assert(c);
6430 assert(path);
6431
6432 va_start(ap, path);
6433 l = strv_new_ap(path, ap);
6434 va_end(ap);
6435
6436 if (!l)
6437 return -ENOMEM;
6438
250a918d
LP
6439 p = strdup(path);
6440 if (!p) {
26fd040d
LP
6441 strv_free(l);
6442 return -ENOMEM;
6443 }
6444
6897dfe8 6445 free_and_replace(c->path, p);
26fd040d 6446
130d3d22 6447 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6448}
6449
86b23b07 6450int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6451 _cleanup_strv_free_ char **l = NULL;
86b23b07 6452 va_list ap;
86b23b07
JS
6453 int r;
6454
6455 assert(c);
6456 assert(path);
6457
6458 va_start(ap, path);
6459 l = strv_new_ap(path, ap);
6460 va_end(ap);
6461
6462 if (!l)
6463 return -ENOMEM;
6464
e287086b 6465 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6466 if (r < 0)
86b23b07 6467 return r;
86b23b07
JS
6468
6469 return 0;
6470}
6471
437f3e35
LP
6472static char *destroy_tree(char *path) {
6473 if (!path)
6474 return NULL;
9c0c6701 6475
437f3e35
LP
6476 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6477 log_debug("Spawning process to nuke '%s'", path);
9c0c6701 6478
437f3e35
LP
6479 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6480 }
9c0c6701 6481
437f3e35 6482 return mfree(path);
9c0c6701
DDM
6483}
6484
e52a696a 6485static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
e8a565cb
YW
6486 if (!rt)
6487 return NULL;
6488
6489 if (rt->manager)
e76506b7 6490 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
e8a565cb 6491
e52a696a
DDM
6492 rt->id = mfree(rt->id);
6493 rt->tmp_dir = mfree(rt->tmp_dir);
6494 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6495 safe_close_pair(rt->netns_storage_socket);
6496 safe_close_pair(rt->ipcns_storage_socket);
6497 return mfree(rt);
6498}
6499
6500DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6501DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6502
6503ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
e52a696a
DDM
6504 if (!rt)
6505 return NULL;
6506
6507 assert(rt->n_ref > 0);
6508 rt->n_ref--;
6509
6510 if (rt->n_ref > 0)
6511 return NULL;
56a13a49 6512
437f3e35
LP
6513 rt->tmp_dir = destroy_tree(rt->tmp_dir);
6514 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
e8a565cb 6515
e52a696a 6516 return exec_shared_runtime_free(rt);
e8a565cb
YW
6517}
6518
e76506b7 6519static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
56a13a49 6520 _cleanup_free_ char *id_copy = NULL;
e76506b7 6521 ExecSharedRuntime *n;
613b411c 6522
8e8009dc 6523 assert(ret);
613b411c 6524
56a13a49
ZJS
6525 id_copy = strdup(id);
6526 if (!id_copy)
6527 return -ENOMEM;
6528
e76506b7 6529 n = new(ExecSharedRuntime, 1);
8e8009dc 6530 if (!n)
613b411c
LP
6531 return -ENOMEM;
6532
e76506b7 6533 *n = (ExecSharedRuntime) {
56a13a49 6534 .id = TAKE_PTR(id_copy),
19ee48a6
YW
6535 .netns_storage_socket = PIPE_EBADF,
6536 .ipcns_storage_socket = PIPE_EBADF,
8e8009dc
LP
6537 };
6538
6539 *ret = n;
613b411c
LP
6540 return 0;
6541}
6542
e76506b7 6543static int exec_shared_runtime_add(
e8a565cb
YW
6544 Manager *m,
6545 const char *id,
56a13a49
ZJS
6546 char **tmp_dir,
6547 char **var_tmp_dir,
6548 int netns_storage_socket[2],
a70581ff 6549 int ipcns_storage_socket[2],
e76506b7 6550 ExecSharedRuntime **ret) {
e8a565cb 6551
e76506b7 6552 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
613b411c
LP
6553 int r;
6554
e8a565cb 6555 assert(m);
613b411c
LP
6556 assert(id);
6557
a70581ff 6558 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6559
e76506b7 6560 r = exec_shared_runtime_allocate(&rt, id);
613b411c
LP
6561 if (r < 0)
6562 return r;
6563
e76506b7 6564 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6565 if (r < 0)
6566 return r;
e8a565cb 6567
56a13a49
ZJS
6568 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6569 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6570 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6571
6572 if (netns_storage_socket) {
56a13a49
ZJS
6573 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6574 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6575 }
6576
a70581ff
XR
6577 if (ipcns_storage_socket) {
6578 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6579 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6580 }
6581
e8a565cb
YW
6582 rt->manager = m;
6583
6584 if (ret)
6585 *ret = rt;
e76506b7 6586 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
56a13a49 6587 TAKE_PTR(rt);
e8a565cb
YW
6588 return 0;
6589}
6590
e76506b7 6591static int exec_shared_runtime_make(
74aaf59b
LP
6592 Manager *m,
6593 const ExecContext *c,
6594 const char *id,
e76506b7 6595 ExecSharedRuntime **ret) {
74aaf59b 6596
56a13a49 6597 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
19ee48a6 6598 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
e8a565cb
YW
6599 int r;
6600
6601 assert(m);
6602 assert(c);
6603 assert(id);
6604
e76506b7 6605 /* It is not necessary to create ExecSharedRuntime object. */
fde36d25 6606 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
74aaf59b 6607 *ret = NULL;
e8a565cb 6608 return 0;
74aaf59b 6609 }
e8a565cb 6610
efa2f3a1
TM
6611 if (c->private_tmp &&
6612 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6613 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6614 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6615 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6616 if (r < 0)
6617 return r;
6618 }
6619
fbbb9697 6620 if (exec_needs_network_namespace(c)) {
e8a565cb
YW
6621 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6622 return -errno;
6623 }
6624
fde36d25 6625 if (exec_needs_ipc_namespace(c)) {
a70581ff
XR
6626 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6627 return -errno;
6628 }
6629
e76506b7 6630 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6631 if (r < 0)
6632 return r;
6633
613b411c
LP
6634 return 1;
6635}
6636
e76506b7
DDM
6637int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6638 ExecSharedRuntime *rt;
e8a565cb 6639 int r;
613b411c 6640
e8a565cb
YW
6641 assert(m);
6642 assert(id);
6643 assert(ret);
6644
e76506b7 6645 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
e8a565cb 6646 if (rt)
e76506b7 6647 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6648 goto ref;
6649
74aaf59b
LP
6650 if (!create) {
6651 *ret = NULL;
e8a565cb 6652 return 0;
74aaf59b 6653 }
e8a565cb
YW
6654
6655 /* If not found, then create a new object. */
e76506b7 6656 r = exec_shared_runtime_make(m, c, id, &rt);
74aaf59b 6657 if (r < 0)
e8a565cb 6658 return r;
74aaf59b 6659 if (r == 0) {
e76506b7 6660 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
74aaf59b
LP
6661 *ret = NULL;
6662 return 0;
6663 }
613b411c 6664
e8a565cb
YW
6665ref:
6666 /* increment reference counter. */
6667 rt->n_ref++;
6668 *ret = rt;
6669 return 1;
6670}
613b411c 6671
e76506b7
DDM
6672int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6673 ExecSharedRuntime *rt;
e8a565cb
YW
6674
6675 assert(m);
613b411c
LP
6676 assert(f);
6677 assert(fds);
6678
e76506b7 6679 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
e8a565cb 6680 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6681
e8a565cb
YW
6682 if (rt->tmp_dir)
6683 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6684
e8a565cb
YW
6685 if (rt->var_tmp_dir)
6686 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6687
e8a565cb
YW
6688 if (rt->netns_storage_socket[0] >= 0) {
6689 int copy;
613b411c 6690
e8a565cb
YW
6691 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6692 if (copy < 0)
6693 return copy;
613b411c 6694
e8a565cb
YW
6695 fprintf(f, " netns-socket-0=%i", copy);
6696 }
613b411c 6697
e8a565cb
YW
6698 if (rt->netns_storage_socket[1] >= 0) {
6699 int copy;
613b411c 6700
e8a565cb
YW
6701 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6702 if (copy < 0)
6703 return copy;
613b411c 6704
e8a565cb
YW
6705 fprintf(f, " netns-socket-1=%i", copy);
6706 }
6707
a70581ff
XR
6708 if (rt->ipcns_storage_socket[0] >= 0) {
6709 int copy;
6710
6711 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6712 if (copy < 0)
6713 return copy;
6714
6715 fprintf(f, " ipcns-socket-0=%i", copy);
6716 }
6717
6718 if (rt->ipcns_storage_socket[1] >= 0) {
6719 int copy;
6720
6721 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6722 if (copy < 0)
6723 return copy;
6724
6725 fprintf(f, " ipcns-socket-1=%i", copy);
6726 }
6727
e8a565cb 6728 fputc('\n', f);
613b411c
LP
6729 }
6730
6731 return 0;
6732}
6733
e76506b7
DDM
6734int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6735 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6736 ExecSharedRuntime *rt;
613b411c
LP
6737 int r;
6738
e8a565cb
YW
6739 /* This is for the migration from old (v237 or earlier) deserialization text.
6740 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
e76506b7 6741 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
e8a565cb
YW
6742 * so or not from the serialized text, then we always creates a new object owned by this. */
6743
6744 assert(u);
613b411c
LP
6745 assert(key);
6746 assert(value);
6747
e76506b7 6748 /* Manager manages ExecSharedRuntime objects by the unit id.
e8a565cb
YW
6749 * So, we omit the serialized text when the unit does not have id (yet?)... */
6750 if (isempty(u->id)) {
6751 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6752 return 0;
6753 }
613b411c 6754
e76506b7 6755 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
cbc165d1 6756 return log_oom();
e8a565cb 6757
e76506b7 6758 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
e8a565cb 6759 if (!rt) {
e76506b7 6760 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6761 return log_oom();
613b411c 6762
e8a565cb
YW
6763 rt = rt_create;
6764 }
6765
6766 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6767 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6768 return -ENOMEM;
613b411c
LP
6769
6770 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6771 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6772 return -ENOMEM;
613b411c
LP
6773
6774 } else if (streq(key, "netns-socket-0")) {
6775 int fd;
6776
e652663a 6777 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6778 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6779 return 0;
613b411c 6780 }
e8a565cb
YW
6781
6782 safe_close(rt->netns_storage_socket[0]);
6783 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6784
613b411c
LP
6785 } else if (streq(key, "netns-socket-1")) {
6786 int fd;
6787
e652663a 6788 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6789 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6790 return 0;
613b411c 6791 }
e8a565cb
YW
6792
6793 safe_close(rt->netns_storage_socket[1]);
6794 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6795
613b411c
LP
6796 } else
6797 return 0;
6798
e76506b7 6799 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
e8a565cb 6800 if (rt_create) {
e76506b7 6801 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
e8a565cb 6802 if (r < 0) {
3fe91079 6803 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6804 return 0;
6805 }
613b411c 6806
e8a565cb 6807 rt_create->manager = u->manager;
613b411c 6808
e8a565cb 6809 /* Avoid cleanup */
56a13a49 6810 TAKE_PTR(rt_create);
e8a565cb 6811 }
98b47d54 6812
e8a565cb
YW
6813 return 1;
6814}
613b411c 6815
e76506b7 6816int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
56a13a49
ZJS
6817 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6818 char *id = NULL;
a70581ff 6819 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 6820 const char *p, *v = ASSERT_PTR(value);
e8a565cb 6821 size_t n;
613b411c 6822
e8a565cb 6823 assert(m);
e8a565cb 6824 assert(fds);
98b47d54 6825
e8a565cb 6826 n = strcspn(v, " ");
2f82562b 6827 id = strndupa_safe(v, n);
e8a565cb
YW
6828 if (v[n] != ' ')
6829 goto finalize;
6830 p = v + n + 1;
6831
6832 v = startswith(p, "tmp-dir=");
6833 if (v) {
6834 n = strcspn(v, " ");
56a13a49
ZJS
6835 tmp_dir = strndup(v, n);
6836 if (!tmp_dir)
6837 return log_oom();
e8a565cb
YW
6838 if (v[n] != ' ')
6839 goto finalize;
6840 p = v + n + 1;
6841 }
6842
6843 v = startswith(p, "var-tmp-dir=");
6844 if (v) {
6845 n = strcspn(v, " ");
56a13a49
ZJS
6846 var_tmp_dir = strndup(v, n);
6847 if (!var_tmp_dir)
6848 return log_oom();
e8a565cb
YW
6849 if (v[n] != ' ')
6850 goto finalize;
6851 p = v + n + 1;
6852 }
6853
6854 v = startswith(p, "netns-socket-0=");
6855 if (v) {
6856 char *buf;
6857
6858 n = strcspn(v, " ");
2f82562b 6859 buf = strndupa_safe(v, n);
c413bb28 6860
e652663a
DT
6861 netns_fdpair[0] = parse_fd(buf);
6862 if (netns_fdpair[0] < 0)
6863 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6864 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6865 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6866 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6867 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6868 if (v[n] != ' ')
6869 goto finalize;
6870 p = v + n + 1;
613b411c
LP
6871 }
6872
e8a565cb
YW
6873 v = startswith(p, "netns-socket-1=");
6874 if (v) {
6875 char *buf;
98b47d54 6876
e8a565cb 6877 n = strcspn(v, " ");
2f82562b 6878 buf = strndupa_safe(v, n);
a70581ff 6879
e652663a
DT
6880 netns_fdpair[1] = parse_fd(buf);
6881 if (netns_fdpair[1] < 0)
6882 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6883 if (!fdset_contains(fds, netns_fdpair[1]))
6884 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6885 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6886 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6887 if (v[n] != ' ')
6888 goto finalize;
6889 p = v + n + 1;
6890 }
6891
6892 v = startswith(p, "ipcns-socket-0=");
6893 if (v) {
6894 char *buf;
6895
6896 n = strcspn(v, " ");
2f82562b 6897 buf = strndupa_safe(v, n);
a70581ff 6898
e652663a
DT
6899 ipcns_fdpair[0] = parse_fd(buf);
6900 if (ipcns_fdpair[0] < 0)
6901 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
a70581ff
XR
6902 if (!fdset_contains(fds, ipcns_fdpair[0]))
6903 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6904 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6905 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6906 if (v[n] != ' ')
6907 goto finalize;
6908 p = v + n + 1;
6909 }
6910
6911 v = startswith(p, "ipcns-socket-1=");
6912 if (v) {
6913 char *buf;
6914
6915 n = strcspn(v, " ");
2f82562b 6916 buf = strndupa_safe(v, n);
a70581ff 6917
e652663a
DT
6918 ipcns_fdpair[1] = parse_fd(buf);
6919 if (ipcns_fdpair[1] < 0)
6920 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
a70581ff 6921 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6922 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6923 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6924 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6925 }
98b47d54 6926
e8a565cb 6927finalize:
e76506b7 6928 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6929 if (r < 0)
56a13a49
ZJS
6930 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6931 return 0;
e8a565cb 6932}
613b411c 6933
e76506b7
DDM
6934void exec_shared_runtime_vacuum(Manager *m) {
6935 ExecSharedRuntime *rt;
e8a565cb
YW
6936
6937 assert(m);
6938
e76506b7 6939 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
e8a565cb 6940
e76506b7 6941 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
e8a565cb
YW
6942 if (rt->n_ref > 0)
6943 continue;
6944
e52a696a 6945 (void) exec_shared_runtime_free(rt);
e8a565cb 6946 }
613b411c
LP
6947}
6948
9c0c6701
DDM
6949int exec_runtime_make(
6950 const Unit *unit,
6951 const ExecContext *context,
6952 ExecSharedRuntime *shared,
6953 DynamicCreds *creds,
6954 ExecRuntime **ret) {
6955 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6956 _cleanup_free_ char *ephemeral = NULL;
28135da3 6957 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
9c0c6701 6958 int r;
28135da3 6959
9c0c6701
DDM
6960 assert(unit);
6961 assert(context);
28135da3
DDM
6962 assert(ret);
6963
9c0c6701 6964 if (!shared && !creds && !exec_needs_ephemeral(context)) {
28135da3
DDM
6965 *ret = NULL;
6966 return 0;
6967 }
6968
9c0c6701
DDM
6969 if (exec_needs_ephemeral(context)) {
6970 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
6971 if (r < 0)
6972 return r;
6973
6974 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
6975 if (r < 0)
6976 return r;
6977
6978 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
6979 return -errno;
6980 }
6981
28135da3
DDM
6982 rt = new(ExecRuntime, 1);
6983 if (!rt)
6984 return -ENOMEM;
6985
6986 *rt = (ExecRuntime) {
6987 .shared = shared,
15220772 6988 .dynamic_creds = creds,
9c0c6701
DDM
6989 .ephemeral_copy = TAKE_PTR(ephemeral),
6990 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
6991 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
28135da3
DDM
6992 };
6993
6994 *ret = TAKE_PTR(rt);
6995 return 1;
6996}
6997
6998ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
6999 if (!rt)
7000 return NULL;
7001
7002 exec_shared_runtime_unref(rt->shared);
15220772 7003 dynamic_creds_unref(rt->dynamic_creds);
9c0c6701 7004
437f3e35 7005 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
9c0c6701 7006
9c0c6701 7007 safe_close_pair(rt->ephemeral_storage_socket);
28135da3
DDM
7008 return mfree(rt);
7009}
7010
7011ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7012 if (!rt)
7013 return NULL;
7014
7015 rt->shared = exec_shared_runtime_destroy(rt->shared);
15220772 7016 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
28135da3
DDM
7017 return exec_runtime_free(rt);
7018}
7019
b9c04eaf
YW
7020void exec_params_clear(ExecParameters *p) {
7021 if (!p)
7022 return;
7023
c3f8a065
LP
7024 p->environment = strv_free(p->environment);
7025 p->fd_names = strv_free(p->fd_names);
7026 p->fds = mfree(p->fds);
7027 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7028}
7029
211a3d87
LB
7030void exec_directory_done(ExecDirectory *d) {
7031 if (!d)
7032 return;
7033
7034 for (size_t i = 0; i < d->n_items; i++) {
7035 free(d->items[i].path);
7036 strv_free(d->items[i].symlinks);
7037 }
7038
7039 d->items = mfree(d->items);
7040 d->n_items = 0;
7041 d->mode = 0755;
7042}
7043
564e5c98
YW
7044static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7045 assert(d);
7046 assert(path);
7047
7048 for (size_t i = 0; i < d->n_items; i++)
7049 if (path_equal(d->items[i].path, path))
7050 return &d->items[i];
7051
7052 return NULL;
7053}
7054
7055int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
211a3d87
LB
7056 _cleanup_strv_free_ char **s = NULL;
7057 _cleanup_free_ char *p = NULL;
564e5c98
YW
7058 ExecDirectoryItem *existing;
7059 int r;
211a3d87
LB
7060
7061 assert(d);
211a3d87
LB
7062 assert(path);
7063
564e5c98
YW
7064 existing = exec_directory_find(d, path);
7065 if (existing) {
7066 r = strv_extend(&existing->symlinks, symlink);
7067 if (r < 0)
7068 return r;
7069
7070 return 0; /* existing item is updated */
7071 }
7072
211a3d87
LB
7073 p = strdup(path);
7074 if (!p)
7075 return -ENOMEM;
7076
564e5c98
YW
7077 if (symlink) {
7078 s = strv_new(symlink);
211a3d87
LB
7079 if (!s)
7080 return -ENOMEM;
7081 }
7082
564e5c98 7083 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
211a3d87
LB
7084 return -ENOMEM;
7085
564e5c98 7086 d->items[d->n_items++] = (ExecDirectoryItem) {
211a3d87
LB
7087 .path = TAKE_PTR(p),
7088 .symlinks = TAKE_PTR(s),
7089 };
7090
564e5c98 7091 return 1; /* new item is added */
211a3d87
LB
7092}
7093
a2ab603c
YW
7094static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7095 assert(a);
7096 assert(b);
7097
7098 return path_compare(a->path, b->path);
7099}
7100
7101void exec_directory_sort(ExecDirectory *d) {
7102 assert(d);
7103
7104 /* Sort the exec directories to make always parent directories processed at first in
7105 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7106 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7107 * list. See also comments in setup_exec_directory() and issue #24783. */
7108
7109 if (d->n_items <= 1)
7110 return;
7111
7112 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7113
7114 for (size_t i = 1; i < d->n_items; i++)
7115 for (size_t j = 0; j < i; j++)
7116 if (path_startswith(d->items[i].path, d->items[j].path)) {
7117 d->items[i].only_create = true;
7118 break;
7119 }
211a3d87
LB
7120}
7121
4fb8f1e8
LP
7122ExecCleanMask exec_clean_mask_from_string(const char *s) {
7123 ExecDirectoryType t;
7124
7125 assert(s);
7126
7127 if (streq(s, "all"))
7128 return EXEC_CLEAN_ALL;
7129 if (streq(s, "fdstore"))
7130 return EXEC_CLEAN_FDSTORE;
7131
7132 t = exec_resource_type_from_string(s);
7133 if (t < 0)
7134 return (ExecCleanMask) t;
7135
7136 return 1U << t;
7137}
7138
80876c20
LP
7139static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7140 [EXEC_INPUT_NULL] = "null",
7141 [EXEC_INPUT_TTY] = "tty",
7142 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7143 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7144 [EXEC_INPUT_SOCKET] = "socket",
7145 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7146 [EXEC_INPUT_DATA] = "data",
2038c3f5 7147 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7148};
7149
8a0867d6
LP
7150DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7151
94f04347 7152static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7153 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7154 [EXEC_OUTPUT_NULL] = "null",
80876c20 7155 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7156 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7157 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7158 [EXEC_OUTPUT_JOURNAL] = "journal",
7159 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7160 [EXEC_OUTPUT_SOCKET] = "socket",
7161 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7162 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7163 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7164 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7165};
7166
7167DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7168
7169static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7170 [EXEC_UTMP_INIT] = "init",
7171 [EXEC_UTMP_LOGIN] = "login",
7172 [EXEC_UTMP_USER] = "user",
7173};
7174
7175DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7176
7177static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7178 [EXEC_PRESERVE_NO] = "no",
7179 [EXEC_PRESERVE_YES] = "yes",
7180 [EXEC_PRESERVE_RESTART] = "restart",
7181};
7182
7183DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7184
6b7b2ed9 7185/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7186static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7187 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7188 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7189 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7190 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7191 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7192};
7193
7194DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7195
211a3d87
LB
7196/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7197static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7198 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7199 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7200 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7201 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7202 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7203};
7204
7205DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7206
6b7b2ed9
LP
7207/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7208 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7209 * directories, specifically .timer units with their timestamp touch file. */
7210static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7211 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7212 [EXEC_DIRECTORY_STATE] = "state",
7213 [EXEC_DIRECTORY_CACHE] = "cache",
7214 [EXEC_DIRECTORY_LOGS] = "logs",
7215 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7216};
7217
7218DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7219
7220/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7221 * the service payload in. */
fb2042dd
YW
7222static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7223 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7224 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7225 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7226 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7227 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7228};
7229
7230DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7231
b1edf445
LP
7232static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7233 [EXEC_KEYRING_INHERIT] = "inherit",
7234 [EXEC_KEYRING_PRIVATE] = "private",
7235 [EXEC_KEYRING_SHARED] = "shared",
7236};
7237
7238DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);