]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
Merge pull request #29130 from poettering/unit-defaults
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
ac8db36c 7#include <sys/file.h>
f5947a5e 8#include <sys/ioctl.h>
f3e43635 9#include <sys/mman.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
9c0c6701
DDM
18#include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
349cc4a5 20#if HAVE_PAM
5b6319dc
LP
21#include <security/pam_appl.h>
22#endif
23
349cc4a5 24#if HAVE_SELINUX
7b52a628
MS
25#include <selinux/selinux.h>
26#endif
27
349cc4a5 28#if HAVE_APPARMOR
eef65bf3
MS
29#include <sys/apparmor.h>
30#endif
31
24882e06 32#include "sd-messages.h"
8dd4c05b
LP
33
34#include "af-list.h"
b5efdb8a 35#include "alloc-util.h"
349cc4a5 36#if HAVE_APPARMOR
3ffd4af2
LP
37#include "apparmor-util.h"
38#endif
ee617a4e 39#include "argv-util.h"
8dd4c05b
LP
40#include "async.h"
41#include "barrier.h"
b1994387 42#include "bpf-lsm.h"
9c0c6701 43#include "btrfs-util.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
9c0c6701 46#include "chattr-util.h"
fdb3deca 47#include "cgroup-setup.h"
f461a28d 48#include "chase.h"
bb0c0d6f 49#include "chown-recursive.h"
28db6fbf 50#include "constants.h"
da681e1b 51#include "cpu-set-util.h"
6a818c3c 52#include "data-fd-util.h"
686d13b9 53#include "env-file.h"
4d1a6904 54#include "env-util.h"
17df7223 55#include "errno-list.h"
8a62620e 56#include "escape.h"
43962c30 57#include "exec-credential.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
f97b34a6 61#include "format-util.h"
7d50b32a 62#include "glob-util.h"
0389f4fa 63#include "hexdecoct.h"
c004493c 64#include "io-util.h"
032b3afb 65#include "ioprio-util.h"
9c0c6701 66#include "lock-util.h"
8dd4c05b
LP
67#include "log.h"
68#include "macro.h"
e8a565cb 69#include "manager.h"
2a341bb9 70#include "manager-dump.h"
0a970718 71#include "memory-util.h"
f5947a5e 72#include "missing_fs.h"
5bead76e 73#include "missing_ioprio.h"
7a114ed4 74#include "missing_prctl.h"
35cd0ba5 75#include "mkdir-label.h"
8dd4c05b 76#include "namespace.h"
6bedfcbb 77#include "parse-util.h"
8dd4c05b 78#include "path-util.h"
4d62ee55 79#include "proc-cmdline.h"
0b452006 80#include "process-util.h"
6bb00842 81#include "psi-util.h"
78f22b97 82#include "rlimit-util.h"
8dd4c05b 83#include "rm-rf.h"
3ffd4af2 84#include "seccomp-util.h"
07d46372 85#include "securebits-util.h"
8dd4c05b 86#include "selinux-util.h"
24882e06 87#include "signal-util.h"
8dd4c05b 88#include "smack-util.h"
57b7a260 89#include "socket-util.h"
a2ab603c 90#include "sort-util.h"
fd63e712 91#include "special.h"
949befd3 92#include "stat-util.h"
8b43440b 93#include "string-table.h"
07630cea 94#include "string-util.h"
8dd4c05b 95#include "strv.h"
7ccbd1ae 96#include "syslog-util.h"
8dd4c05b 97#include "terminal-util.h"
bb0c0d6f 98#include "tmpfile-util.h"
566b7d23 99#include "umask-util.h"
2d3b784d 100#include "unit-serialize.h"
b1d4f8e1 101#include "user-util.h"
8dd4c05b 102#include "utmp-wtmp.h"
5cb5a6ff 103
/* Two timeout values, expressed in microseconds: five seconds and one second. */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size in bytes (8 MiB); passed to fd_inc_sndbuf() for the journal stream socket. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
108
/* Relocates the descriptors in fds[] so that entry i ends up at descriptor number i+3. Each misplaced
 * descriptor is duplicated with F_DUPFD (lowest free number >= i+3) and the old one is closed. If the
 * wanted number was still occupied, the pass is repeated starting from the first such entry.
 *
 * The array is updated in place. Returns 0 on success, or a negative errno if F_DUPFD fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the descriptor already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot was taken by something else: note the earliest such index so that
                         * the next pass starts there. */
                        if (retry_at < 0 && dup_fd != wanted)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
148
cd48e23f
RP
/* Adjusts the file flags of the first n_fds descriptors in fds[]:
 *   - O_NONBLOCK is set or cleared (per 'nonblock') on the first n_socket_fds entries only,
 *   - FD_CLOEXEC is cleared on every entry, since these descriptors are meant to be passed to children.
 *
 * Returns 0 on success or the negative error returned by fd_nonblock()/fd_cloexec(). */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int k;

                /* O_NONBLOCK handling only applies to the socket activation fds */
                if (idx < n_socket_fds) {
                        k = fd_nonblock(fds[idx], nonblock);
                        if (k < 0)
                                return k;
                }

                /* FD_CLOEXEC is always dropped */
                k = fd_cloexec(fds[idx], false);
                if (k < 0)
                        return k;
        }

        return 0;
}
184
1e22b5cd 185static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
186 assert(context);
187
1e22b5cd
LP
188 if (context->stdio_as_fds)
189 return NULL;
190
80876c20
LP
191 if (context->tty_path)
192 return context->tty_path;
193
194 return "/dev/console";
195}
196
4d62ee55 197static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
4d62ee55
DDM
198 unsigned rows, cols;
199 const char *tty;
4d62ee55
DDM
200
201 assert(context);
202 assert(ret_rows);
203 assert(ret_cols);
204
205 rows = context->tty_rows;
206 cols = context->tty_cols;
207
208 tty = exec_context_tty_path(context);
29f5a5ae
DDM
209 if (tty)
210 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
4d62ee55
DDM
211
212 *ret_rows = rows;
213 *ret_cols = cols;
214
215 return 0;
216}
217
1e22b5cd 218static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
a0043bfa
ZJS
219 _cleanup_close_ int fd = -EBADF;
220 const char *path = exec_context_tty_path(ASSERT_PTR(context));
1e22b5cd 221
a0043bfa
ZJS
222 /* Take a lock around the device for the duration of the setup that we do here.
223 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
224 * We open a new fd that will be closed automatically, and operate on it for convenience.
225 */
6ea832a2 226
a0043bfa
ZJS
227 if (p && p->stdin_fd >= 0) {
228 fd = xopenat_lock(p->stdin_fd, NULL,
229 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
230 if (fd < 0)
231 return;
232 } else if (path) {
233 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
234 if (fd < 0)
235 return;
6ea832a2 236
a0043bfa
ZJS
237 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
238 return;
239 } else
240 return; /* nothing to do */
6ea832a2 241
a0043bfa
ZJS
242 if (context->tty_vhangup)
243 (void) terminal_vhangup_fd(fd);
244
245 if (context->tty_reset)
246 (void) reset_terminal_fd(fd, true);
1e22b5cd 247
4d62ee55
DDM
248 if (p && p->stdin_fd >= 0) {
249 unsigned rows = context->tty_rows, cols = context->tty_cols;
250
251 (void) exec_context_tty_size(context, &rows, &cols);
252 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
253 }
51462135 254
1e22b5cd
LP
255 if (context->tty_vt_disallocate && path)
256 (void) vt_disallocate(path);
6ea832a2
LP
257}
258
6af760f3
LP
259static bool is_terminal_input(ExecInput i) {
260 return IN_SET(i,
261 EXEC_INPUT_TTY,
262 EXEC_INPUT_TTY_FORCE,
263 EXEC_INPUT_TTY_FAIL);
264}
265
3a1286b6 266static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
267 return IN_SET(o,
268 EXEC_OUTPUT_TTY,
6af760f3
LP
269 EXEC_OUTPUT_KMSG_AND_CONSOLE,
270 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
271}
272
aac8c0c3
LP
273static bool is_kmsg_output(ExecOutput o) {
274 return IN_SET(o,
275 EXEC_OUTPUT_KMSG,
276 EXEC_OUTPUT_KMSG_AND_CONSOLE);
277}
278
6af760f3
LP
279static bool exec_context_needs_term(const ExecContext *c) {
280 assert(c);
281
282 /* Return true if the execution context suggests we should set $TERM to something useful. */
283
284 if (is_terminal_input(c->std_input))
285 return true;
286
287 if (is_terminal_output(c->std_output))
288 return true;
289
290 if (is_terminal_output(c->std_error))
291 return true;
292
293 return !!c->tty_path;
3a1286b6
MS
294}
295
/* Opens /dev/null with the given flags (plus O_NOCTTY) and moves the result to descriptor 'nfd'.
 * Returns -errno if the open fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
307
91dd5f7c
LP
308static int connect_journal_socket(
309 int fd,
310 const char *log_namespace,
311 uid_t uid,
312 gid_t gid) {
313
524daa8c
ZJS
314 uid_t olduid = UID_INVALID;
315 gid_t oldgid = GID_INVALID;
91dd5f7c 316 const char *j;
524daa8c
ZJS
317 int r;
318
91dd5f7c
LP
319 j = log_namespace ?
320 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
321 "/run/systemd/journal/stdout";
91dd5f7c 322
cad93f29 323 if (gid_is_valid(gid)) {
524daa8c
ZJS
324 oldgid = getgid();
325
92a17af9 326 if (setegid(gid) < 0)
524daa8c
ZJS
327 return -errno;
328 }
329
cad93f29 330 if (uid_is_valid(uid)) {
524daa8c
ZJS
331 olduid = getuid();
332
92a17af9 333 if (seteuid(uid) < 0) {
524daa8c
ZJS
334 r = -errno;
335 goto restore_gid;
336 }
337 }
338
1861986a 339 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 340
1861986a
LP
341 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
342 an LSM interferes. */
524daa8c 343
cad93f29 344 if (uid_is_valid(uid))
524daa8c
ZJS
345 (void) seteuid(olduid);
346
347 restore_gid:
cad93f29 348 if (gid_is_valid(gid))
524daa8c
ZJS
349 (void) setegid(oldgid);
350
351 return r;
352}
353
fd1f9c89 354static int connect_logger_as(
34cf6c43 355 const Unit *unit,
fd1f9c89 356 const ExecContext *context,
af635cf3 357 const ExecParameters *params,
fd1f9c89
LP
358 ExecOutput output,
359 const char *ident,
fd1f9c89
LP
360 int nfd,
361 uid_t uid,
362 gid_t gid) {
363
254d1313 364 _cleanup_close_ int fd = -EBADF;
2ac1ff68 365 int r;
071830ff
LP
366
367 assert(context);
af635cf3 368 assert(params);
80876c20
LP
369 assert(output < _EXEC_OUTPUT_MAX);
370 assert(ident);
371 assert(nfd >= 0);
071830ff 372
54fe0cdb
LP
373 fd = socket(AF_UNIX, SOCK_STREAM, 0);
374 if (fd < 0)
80876c20 375 return -errno;
071830ff 376
91dd5f7c 377 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
378 if (r < 0)
379 return r;
071830ff 380
2ac1ff68 381 if (shutdown(fd, SHUT_RD) < 0)
80876c20 382 return -errno;
071830ff 383
fd1f9c89 384 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 385
2ac1ff68 386 if (dprintf(fd,
62bca2c6 387 "%s\n"
80876c20
LP
388 "%s\n"
389 "%i\n"
54fe0cdb
LP
390 "%i\n"
391 "%i\n"
392 "%i\n"
4f4a1dbf 393 "%i\n",
c867611e 394 context->syslog_identifier ?: ident,
af635cf3 395 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
396 context->syslog_priority,
397 !!context->syslog_level_prefix,
f3dc6af2 398 false,
aac8c0c3 399 is_kmsg_output(output),
2ac1ff68
EV
400 is_terminal_output(output)) < 0)
401 return -errno;
80876c20 402
2ac1ff68 403 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 404}
2ac1ff68 405
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves it to descriptor 'nfd'.
 * Returns the negative error of open_terminal() on failure, otherwise the result of move_fd(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 418
2038c3f5 419static int acquire_path(const char *path, int flags, mode_t mode) {
254d1313 420 _cleanup_close_ int fd = -EBADF;
86fca584 421 int r;
071830ff 422
80876c20 423 assert(path);
071830ff 424
2038c3f5
LP
425 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
426 flags |= O_CREAT;
427
428 fd = open(path, flags|O_NOCTTY, mode);
429 if (fd >= 0)
15a3e96f 430 return TAKE_FD(fd);
071830ff 431
2038c3f5
LP
432 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
433 return -errno;
2038c3f5
LP
434
435 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
436
437 fd = socket(AF_UNIX, SOCK_STREAM, 0);
438 if (fd < 0)
439 return -errno;
440
1861986a
LP
441 r = connect_unix_path(fd, AT_FDCWD, path);
442 if (IN_SET(r, -ENOTSOCK, -EINVAL))
443 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
444 * wasn't an AF_UNIX socket after all */
445 return -ENXIO;
446 if (r < 0)
447 return r;
071830ff 448
2038c3f5
LP
449 if ((flags & O_ACCMODE) == O_RDONLY)
450 r = shutdown(fd, SHUT_WR);
451 else if ((flags & O_ACCMODE) == O_WRONLY)
452 r = shutdown(fd, SHUT_RD);
453 else
86fca584 454 r = 0;
15a3e96f 455 if (r < 0)
2038c3f5 456 return -errno;
2038c3f5 457
15a3e96f 458 return TAKE_FD(fd);
80876c20 459}
071830ff 460
08f3be7a
LP
461static int fixup_input(
462 const ExecContext *context,
463 int socket_fd,
464 bool apply_tty_stdin) {
465
466 ExecInput std_input;
467
468 assert(context);
469
470 std_input = context->std_input;
1e3ad081
LP
471
472 if (is_terminal_input(std_input) && !apply_tty_stdin)
473 return EXEC_INPUT_NULL;
071830ff 474
03fd9c49 475 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
476 return EXEC_INPUT_NULL;
477
08f3be7a
LP
478 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
479 return EXEC_INPUT_NULL;
480
03fd9c49 481 return std_input;
4f2d528d
LP
482}
483
7966a916 484static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 485
7966a916 486 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
487 return EXEC_OUTPUT_INHERIT;
488
7966a916 489 return output;
4f2d528d
LP
490}
491
a34ceba6
LP
492static int setup_input(
493 const ExecContext *context,
494 const ExecParameters *params,
52c239d7 495 int socket_fd,
2caa38e9 496 const int named_iofds[static 3]) {
a34ceba6 497
4f2d528d 498 ExecInput i;
51462135 499 int r;
4f2d528d
LP
500
501 assert(context);
a34ceba6 502 assert(params);
2caa38e9 503 assert(named_iofds);
a34ceba6
LP
504
505 if (params->stdin_fd >= 0) {
506 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
507 return -errno;
508
509 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e 510 if (isatty(STDIN_FILENO)) {
4d62ee55
DDM
511 unsigned rows = context->tty_rows, cols = context->tty_cols;
512
513 (void) exec_context_tty_size(context, &rows, &cols);
1fb0682e
LP
514 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
515 (void) reset_terminal_fd(STDIN_FILENO, true);
4d62ee55 516 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
1fb0682e 517 }
a34ceba6
LP
518
519 return STDIN_FILENO;
520 }
4f2d528d 521
08f3be7a 522 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
523
524 switch (i) {
071830ff 525
80876c20
LP
526 case EXEC_INPUT_NULL:
527 return open_null_as(O_RDONLY, STDIN_FILENO);
528
529 case EXEC_INPUT_TTY:
530 case EXEC_INPUT_TTY_FORCE:
531 case EXEC_INPUT_TTY_FAIL: {
4d62ee55 532 unsigned rows, cols;
046a82c1 533 int fd;
071830ff 534
1e22b5cd 535 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
536 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
537 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
538 ACQUIRE_TERMINAL_WAIT,
3a43da28 539 USEC_INFINITY);
970edce6 540 if (fd < 0)
80876c20
LP
541 return fd;
542
4d62ee55
DDM
543 r = exec_context_tty_size(context, &rows, &cols);
544 if (r < 0)
545 return r;
546
547 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
51462135
DDM
548 if (r < 0)
549 return r;
550
046a82c1 551 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
552 }
553
4f2d528d 554 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
555 assert(socket_fd >= 0);
556
7c248223 557 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 558
52c239d7 559 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
560 assert(named_iofds[STDIN_FILENO] >= 0);
561
52c239d7 562 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 563 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 564
08f3be7a
LP
565 case EXEC_INPUT_DATA: {
566 int fd;
567
568 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
569 if (fd < 0)
570 return fd;
571
572 return move_fd(fd, STDIN_FILENO, false);
573 }
574
2038c3f5
LP
575 case EXEC_INPUT_FILE: {
576 bool rw;
577 int fd;
578
579 assert(context->stdio_file[STDIN_FILENO]);
580
581 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
582 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
583
584 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
585 if (fd < 0)
586 return fd;
587
588 return move_fd(fd, STDIN_FILENO, false);
589 }
590
80876c20 591 default:
04499a70 592 assert_not_reached();
80876c20
LP
593 }
594}
595
41fc585a
LP
596static bool can_inherit_stderr_from_stdout(
597 const ExecContext *context,
598 ExecOutput o,
599 ExecOutput e) {
600
601 assert(context);
602
603 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
604 * stderr fd */
605
606 if (e == EXEC_OUTPUT_INHERIT)
607 return true;
608 if (e != o)
609 return false;
610
611 if (e == EXEC_OUTPUT_NAMED_FD)
612 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
613
8d7dab1f 614 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
615 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
616
617 return true;
618}
619
a34ceba6 620static int setup_output(
34cf6c43 621 const Unit *unit,
a34ceba6
LP
622 const ExecContext *context,
623 const ExecParameters *params,
624 int fileno,
625 int socket_fd,
2caa38e9 626 const int named_iofds[static 3],
a34ceba6 627 const char *ident,
7bce046b
LP
628 uid_t uid,
629 gid_t gid,
630 dev_t *journal_stream_dev,
631 ino_t *journal_stream_ino) {
a34ceba6 632
4f2d528d
LP
633 ExecOutput o;
634 ExecInput i;
47c1d80d 635 int r;
4f2d528d 636
f2341e0a 637 assert(unit);
80876c20 638 assert(context);
a34ceba6 639 assert(params);
80876c20 640 assert(ident);
7bce046b
LP
641 assert(journal_stream_dev);
642 assert(journal_stream_ino);
80876c20 643
a34ceba6
LP
644 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
645
646 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
647 return -errno;
648
649 return STDOUT_FILENO;
650 }
651
652 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
653 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
654 return -errno;
655
656 return STDERR_FILENO;
657 }
658
08f3be7a 659 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 660 o = fixup_output(context->std_output, socket_fd);
4f2d528d 661
eb17e935
MS
662 if (fileno == STDERR_FILENO) {
663 ExecOutput e;
664 e = fixup_output(context->std_error, socket_fd);
80876c20 665
eb17e935
MS
666 /* This expects the input and output are already set up */
667
668 /* Don't change the stderr file descriptor if we inherit all
669 * the way and are not on a tty */
670 if (e == EXEC_OUTPUT_INHERIT &&
671 o == EXEC_OUTPUT_INHERIT &&
672 i == EXEC_INPUT_NULL &&
673 !is_terminal_input(context->std_input) &&
7966a916 674 getppid() != 1)
eb17e935
MS
675 return fileno;
676
677 /* Duplicate from stdout if possible */
41fc585a 678 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 679 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 680
eb17e935 681 o = e;
80876c20 682
eb17e935 683 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
684 /* If input got downgraded, inherit the original value */
685 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 686 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 687
08f3be7a
LP
688 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
689 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 690 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 691
acb591e4
LP
692 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
693 if (getppid() != 1)
eb17e935 694 return fileno;
94f04347 695
eb17e935
MS
696 /* We need to open /dev/null here anew, to get the right access mode. */
697 return open_null_as(O_WRONLY, fileno);
071830ff 698 }
94f04347 699
eb17e935 700 switch (o) {
80876c20
LP
701
702 case EXEC_OUTPUT_NULL:
eb17e935 703 return open_null_as(O_WRONLY, fileno);
80876c20
LP
704
705 case EXEC_OUTPUT_TTY:
4f2d528d 706 if (is_terminal_input(i))
7c248223 707 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
708
709 /* We don't reset the terminal if this is just about output */
1e22b5cd 710 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 711
9a6bca7a 712 case EXEC_OUTPUT_KMSG:
28dbc1e8 713 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
714 case EXEC_OUTPUT_JOURNAL:
715 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 716 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 717 if (r < 0) {
7966a916
ZJS
718 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
719 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 720 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
721 } else {
722 struct stat st;
723
724 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
725 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
726 * services to detect whether they are connected to the journal or not.
727 *
728 * If both stdout and stderr are connected to a stream then let's make sure to store the data
729 * about STDERR as that's usually the best way to do logging. */
7bce046b 730
ab2116b1
LP
731 if (fstat(fileno, &st) >= 0 &&
732 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
733 *journal_stream_dev = st.st_dev;
734 *journal_stream_ino = st.st_ino;
735 }
47c1d80d
MS
736 }
737 return r;
4f2d528d
LP
738
739 case EXEC_OUTPUT_SOCKET:
740 assert(socket_fd >= 0);
e75a9ed1 741
7c248223 742 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 743
52c239d7 744 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
745 assert(named_iofds[fileno] >= 0);
746
52c239d7 747 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 748 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 749
566b7d23 750 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
751 case EXEC_OUTPUT_FILE_APPEND:
752 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 753 bool rw;
566b7d23 754 int fd, flags;
2038c3f5
LP
755
756 assert(context->stdio_file[fileno]);
757
758 rw = context->std_input == EXEC_INPUT_FILE &&
759 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
760
761 if (rw)
7c248223 762 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 763
566b7d23
ZD
764 flags = O_WRONLY;
765 if (o == EXEC_OUTPUT_FILE_APPEND)
766 flags |= O_APPEND;
8d7dab1f
LW
767 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
768 flags |= O_TRUNC;
566b7d23
ZD
769
770 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
771 if (fd < 0)
772 return fd;
773
566b7d23 774 return move_fd(fd, fileno, 0);
2038c3f5
LP
775 }
776
94f04347 777 default:
04499a70 778 assert_not_reached();
94f04347 779 }
071830ff
LP
780}
781
02a51aba 782static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 783 int r;
02a51aba
LP
784
785 assert(fd >= 0);
02a51aba 786
1ff74fb6 787 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
788 if (isatty(fd) < 1) {
789 if (IN_SET(errno, EINVAL, ENOTTY))
790 return 0; /* not a tty */
1ff74fb6 791
02a51aba 792 return -errno;
4b3b5bc7 793 }
02a51aba 794
4b3b5bc7 795 /* This might fail. What matters are the results. */
f2df231f 796 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
797 if (r < 0)
798 return r;
02a51aba 799
4b3b5bc7 800 return 1;
02a51aba
LP
801}
802
aedec452 803static int setup_confirm_stdio(
51462135 804 const ExecContext *context,
aedec452
LP
805 const char *vc,
806 int *ret_saved_stdin,
807 int *ret_saved_stdout) {
808
254d1313 809 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
4d62ee55 810 unsigned rows, cols;
3d18b167 811 int r;
80876c20 812
aedec452
LP
813 assert(ret_saved_stdin);
814 assert(ret_saved_stdout);
80876c20 815
af6da548
LP
816 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
817 if (saved_stdin < 0)
818 return -errno;
80876c20 819
af6da548 820 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
821 if (saved_stdout < 0)
822 return -errno;
80876c20 823
8854d795 824 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
825 if (fd < 0)
826 return fd;
80876c20 827
af6da548
LP
828 r = chown_terminal(fd, getuid());
829 if (r < 0)
3d18b167 830 return r;
02a51aba 831
3d18b167
LP
832 r = reset_terminal_fd(fd, true);
833 if (r < 0)
834 return r;
80876c20 835
4d62ee55
DDM
836 r = exec_context_tty_size(context, &rows, &cols);
837 if (r < 0)
838 return r;
839
840 r = terminal_set_size_fd(fd, vc, rows, cols);
51462135
DDM
841 if (r < 0)
842 return r;
843
aedec452
LP
844 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
845 TAKE_FD(fd);
2b33ab09
LP
846 if (r < 0)
847 return r;
80876c20 848
aedec452
LP
849 *ret_saved_stdin = TAKE_FD(saved_stdin);
850 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 851 return 0;
80876c20
LP
852}
853
63d77c92 854static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
855 assert(err < 0);
856
857 if (err == -ETIMEDOUT)
63d77c92 858 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
859 else {
860 errno = -err;
63d77c92 861 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
862 }
863}
864
63d77c92 865static void write_confirm_error(int err, const char *vc, const Unit *u) {
254d1313 866 _cleanup_close_ int fd = -EBADF;
80876c20 867
3b20f877 868 assert(vc);
80876c20 869
7d5ceb64 870 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 871 if (fd < 0)
3b20f877 872 return;
80876c20 873
63d77c92 874 write_confirm_error_fd(err, fd, u);
af6da548 875}
80876c20 876
/* Releases the terminal and puts the saved stdin/stdout descriptors back in place (each only if it is
 * valid). Both saved descriptors are closed and invalidated afterwards, regardless of the outcome.
 *
 * Returns 0 on success, or -errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
898
3b20f877
FB
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1, /* execute the command */
};
904
51462135 905static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 906 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 907 _cleanup_free_ char *e = NULL;
3b20f877 908 char c;
af6da548 909
3b20f877 910 /* For any internal errors, assume a positive response. */
51462135 911 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 912 if (r < 0) {
63d77c92 913 write_confirm_error(r, vc, u);
3b20f877
FB
914 return CONFIRM_EXECUTE;
915 }
af6da548 916
b0eb2944
FB
917 /* confirm_spawn might have been disabled while we were sleeping. */
918 if (manager_is_confirm_spawn_disabled(u->manager)) {
919 r = 1;
920 goto restore_stdio;
921 }
af6da548 922
2bcd3c26
FB
923 e = ellipsize(cmdline, 60, 100);
924 if (!e) {
925 log_oom();
926 r = CONFIRM_EXECUTE;
927 goto restore_stdio;
928 }
af6da548 929
d172b175 930 for (;;) {
539622bd 931 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 932 if (r < 0) {
63d77c92 933 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
934 r = CONFIRM_EXECUTE;
935 goto restore_stdio;
936 }
af6da548 937
d172b175 938 switch (c) {
b0eb2944
FB
939 case 'c':
940 printf("Resuming normal execution.\n");
941 manager_disable_confirm_spawn();
942 r = 1;
943 break;
dd6f9ac0
FB
944 case 'D':
945 unit_dump(u, stdout, " ");
946 continue; /* ask again */
d172b175
FB
947 case 'f':
948 printf("Failing execution.\n");
949 r = CONFIRM_PRETEND_FAILURE;
950 break;
951 case 'h':
b0eb2944
FB
952 printf(" c - continue, proceed without asking anymore\n"
953 " D - dump, show the state of the unit\n"
dd6f9ac0 954 " f - fail, don't execute the command and pretend it failed\n"
d172b175 955 " h - help\n"
eedf223a 956 " i - info, show a short summary of the unit\n"
56fde33a 957 " j - jobs, show jobs that are in progress\n"
d172b175
FB
958 " s - skip, don't execute the command and pretend it succeeded\n"
959 " y - yes, execute the command\n");
dd6f9ac0 960 continue; /* ask again */
eedf223a
FB
961 case 'i':
962 printf(" Description: %s\n"
963 " Unit: %s\n"
964 " Command: %s\n",
965 u->id, u->description, cmdline);
966 continue; /* ask again */
56fde33a 967 case 'j':
d1d8786c 968 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
56fde33a 969 continue; /* ask again */
539622bd
FB
970 case 'n':
971 /* 'n' was removed in favor of 'f'. */
972 printf("Didn't understand 'n', did you mean 'f'?\n");
973 continue; /* ask again */
d172b175
FB
974 case 's':
975 printf("Skipping execution.\n");
976 r = CONFIRM_PRETEND_SUCCESS;
977 break;
978 case 'y':
979 r = CONFIRM_EXECUTE;
980 break;
981 default:
04499a70 982 assert_not_reached();
d172b175 983 }
3b20f877 984 break;
3b20f877 985 }
af6da548 986
3b20f877 987restore_stdio:
af6da548 988 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 989 return r;
80876c20
LP
990}
991
4d885bd3
DH
992static int get_fixed_user(const ExecContext *c, const char **user,
993 uid_t *uid, gid_t *gid,
994 const char **home, const char **shell) {
81a2b7ce 995 int r;
4d885bd3 996 const char *name;
81a2b7ce 997
4d885bd3 998 assert(c);
81a2b7ce 999
23deef88
LP
1000 if (!c->user)
1001 return 0;
1002
4d885bd3
DH
1003 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1004 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 1005
23deef88 1006 name = c->user;
fafff8f1 1007 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
1008 if (r < 0)
1009 return r;
81a2b7ce 1010
4d885bd3
DH
1011 *user = name;
1012 return 0;
1013}
1014
1015static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
1016 int r;
1017 const char *name;
1018
1019 assert(c);
1020
1021 if (!c->group)
1022 return 0;
1023
1024 name = c->group;
fafff8f1 1025 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
1026 if (r < 0)
1027 return r;
1028
1029 *group = name;
1030 return 0;
1031}
1032
cdc5d5c5
DH
1033static int get_supplementary_groups(const ExecContext *c, const char *user,
1034 const char *group, gid_t gid,
1035 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
1036 int r, k = 0;
1037 int ngroups_max;
1038 bool keep_groups = false;
1039 gid_t *groups = NULL;
1040 _cleanup_free_ gid_t *l_gids = NULL;
1041
1042 assert(c);
1043
bbeea271
DH
1044 /*
1045 * If user is given, then lookup GID and supplementary groups list.
1046 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1047 * here and as early as possible so we keep the list of supplementary
1048 * groups of the caller.
bbeea271
DH
1049 */
1050 if (user && gid_is_valid(gid) && gid != 0) {
1051 /* First step, initialize groups from /etc/groups */
1052 if (initgroups(user, gid) < 0)
1053 return -errno;
1054
1055 keep_groups = true;
1056 }
1057
ac6e8be6 1058 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1059 return 0;
1060
366ddd25
DH
1061 /*
1062 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1063 * be positive, otherwise fail.
1064 */
1065 errno = 0;
1066 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1067 if (ngroups_max <= 0)
1068 return errno_or_else(EOPNOTSUPP);
366ddd25 1069
4d885bd3
DH
1070 l_gids = new(gid_t, ngroups_max);
1071 if (!l_gids)
1072 return -ENOMEM;
81a2b7ce 1073
4d885bd3
DH
1074 if (keep_groups) {
1075 /*
1076 * Lookup the list of groups that the user belongs to, we
1077 * avoid NSS lookups here too for gid=0.
1078 */
1079 k = ngroups_max;
1080 if (getgrouplist(user, gid, l_gids, &k) < 0)
1081 return -EINVAL;
1082 } else
1083 k = 0;
81a2b7ce 1084
4d885bd3
DH
1085 STRV_FOREACH(i, c->supplementary_groups) {
1086 const char *g;
81a2b7ce 1087
4d885bd3
DH
1088 if (k >= ngroups_max)
1089 return -E2BIG;
81a2b7ce 1090
4d885bd3 1091 g = *i;
fafff8f1 1092 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1093 if (r < 0)
1094 return r;
81a2b7ce 1095
4d885bd3
DH
1096 k++;
1097 }
81a2b7ce 1098
4d885bd3
DH
1099 /*
1100 * Sets ngids to zero to drop all supplementary groups, happens
1101 * when we are under root and SupplementaryGroups= is empty.
1102 */
1103 if (k == 0) {
1104 *ngids = 0;
1105 return 0;
1106 }
81a2b7ce 1107
4d885bd3
DH
1108 /* Otherwise get the final list of supplementary groups */
1109 groups = memdup(l_gids, sizeof(gid_t) * k);
1110 if (!groups)
1111 return -ENOMEM;
1112
1113 *supplementary_gids = groups;
1114 *ngids = k;
1115
1116 groups = NULL;
1117
1118 return 0;
1119}
1120
/* Applies the previously computed group credentials to the calling process.
 *
 * If ngids is positive the supplementary group list is installed via maybe_setgroups(). Then, if
 * gid is valid, the real, effective and saved GID are all switched to it. Returns 0 on success or a
 * negative errno-style error. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1139
a954b249
LP
/* Updates the securebits of the calling process: all bits in 'mask' are cleared, then all bits in
 * 'bits' are set.
 *
 * Returns 1 if the securebits were changed, 0 if they already had the requested value (in which
 * case PR_SET_SECUREBITS is not invoked at all), or a negative errno if either prctl() call
 * fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int before;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = ((unsigned) before & ~mask) | bits;
        if (wanted != (unsigned) before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1158
638fd8cc
LP
1159static int enforce_user(
1160 const ExecContext *context,
1161 uid_t uid,
1162 uint64_t capability_ambient_set) {
81a2b7ce 1163 assert(context);
dbdc4098 1164 int r;
81a2b7ce 1165
4d885bd3
DH
1166 if (!uid_is_valid(uid))
1167 return 0;
1168
a954b249
LP
1169 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1170 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1171 * case. */
81a2b7ce 1172
638fd8cc 1173 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
81a2b7ce 1174
a954b249
LP
1175 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1176 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1177 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1178 if (r < 0)
1179 return r;
81a2b7ce
LP
1180 }
1181
479050b3 1182 /* Second step: actually set the uids */
81a2b7ce
LP
1183 if (setresuid(uid, uid, uid) < 0)
1184 return -errno;
1185
a954b249
LP
1186 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1187 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1188 * outside of this call. */
81a2b7ce
LP
1189 return 0;
1190}
1191
349cc4a5 1192#if HAVE_PAM
5b6319dc
LP
1193
1194static int null_conv(
1195 int num_msg,
1196 const struct pam_message **msg,
1197 struct pam_response **resp,
1198 void *appdata_ptr) {
1199
1200 /* We don't support conversations */
1201
1202 return PAM_CONV_ERR;
1203}
1204
cefc33ae
LP
1205#endif
1206
5b6319dc
LP
/* Opens a PAM session for the service and forks off a helper process ("(sd-pam)") that closes it
 * again once the calling process is gone.
 *
 * Parameters:
 *   name   - PAM service name passed to pam_start().
 *   user   - user name passed to pam_start().
 *   uid    - UID the helper process switches to before waiting.
 *   gid    - GID the helper process switches to before waiting.
 *   tty    - TTY to report to PAM, or NULL to derive it from stdin if that is a TTY.
 *   env    - environment block; its entries are exported to PAM, and on success it is replaced by
 *            the environment list PAM hands back.
 *   fds    - file descriptors that must not stay open in the helper process.
 *   n_fds  - number of entries in fds.
 *
 * Returns the result of strv_free_and_replace() on success, or a negative errno-style error
 * (-EPERM for any PAM-level failure, since PAM errors do not map to errno). When built without PAM
 * support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child will then stay around until killed
         * via PR_SET_PDEATHSIG or systemd via the cgroup logic. It will then remove the PAM session
         * again. The parent process will exec() the actual daemon. We do things this way to ensure
         * that the main PID of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a
                 * TTY. Let's figure out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we were given to the PAM modules. */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal. */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise
                 * only those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect
                 * sd-pam threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic to do the rest for
                 * us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding
                 * dropping privileges. Otherwise, unit setup might race against our setresuid(2)
                 * call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() returns 0 on success and a positive error number on
                                 * failure; it never returns a negative value and does not set errno,
                                 * hence check the return value directly. */
                                r = sigwait(&ss, &sig);
                                if (r == 0)
                                        break;
                                if (r == EINTR)
                                        continue;

                                goto child_finish;
                        }

                        assert(sig == SIGTERM);
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the
                 * module know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the
         * handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't
         * want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1423
5d6b1584 1424static void rename_process_from_path(const char *path) {
a99626c1 1425 _cleanup_free_ char *buf = NULL;
5d6b1584 1426 const char *p;
5d6b1584 1427
a99626c1
LP
1428 assert(path);
1429
1430 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1431 * /bin/ps */
5d6b1584 1432
a99626c1 1433 if (path_extract_filename(path, &buf) < 0) {
5d6b1584
LP
1434 rename_process("(...)");
1435 return;
1436 }
1437
a99626c1 1438 size_t l = strlen(buf);
5d6b1584 1439 if (l > 8) {
a99626c1 1440 /* The end of the process name is usually more interesting, since the first bit might just be
5d6b1584 1441 * "systemd-" */
a99626c1 1442 p = buf + l - 8;
5d6b1584 1443 l = 8;
a99626c1
LP
1444 } else
1445 p = buf;
5d6b1584 1446
a99626c1 1447 char process_name[11];
5d6b1584
LP
1448 process_name[0] = '(';
1449 memcpy(process_name+1, p, l);
1450 process_name[1+l] = ')';
1451 process_name[1+l+1] = 0;
1452
1453 rename_process(process_name);
1454}
1455
469830d1
LP
1456static bool context_has_address_families(const ExecContext *c) {
1457 assert(c);
1458
6b000af4 1459 return c->address_families_allow_list ||
469830d1
LP
1460 !set_isempty(c->address_families);
1461}
1462
1463static bool context_has_syscall_filters(const ExecContext *c) {
1464 assert(c);
1465
6b000af4 1466 return c->syscall_allow_list ||
8cfa775f 1467 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1468}
1469
9df2cdd8
TM
1470static bool context_has_syscall_logs(const ExecContext *c) {
1471 assert(c);
1472
1473 return c->syscall_log_allow_list ||
1474 !hashmap_isempty(c->syscall_log);
1475}
1476
469830d1
LP
1477static bool context_has_no_new_privileges(const ExecContext *c) {
1478 assert(c);
1479
1480 if (c->no_new_privileges)
1481 return true;
1482
26c45a6c 1483 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
469830d1
LP
1484 return false;
1485
1486 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1487 return c->lock_personality ||
469830d1 1488 c->memory_deny_write_execute ||
0538d2a8 1489 c->private_devices ||
fc64760d 1490 c->protect_clock ||
0538d2a8 1491 c->protect_hostname ||
469830d1
LP
1492 c->protect_kernel_tunables ||
1493 c->protect_kernel_modules ||
84703040 1494 c->protect_kernel_logs ||
0538d2a8
YW
1495 context_has_address_families(c) ||
1496 exec_context_restrict_namespaces_set(c) ||
1497 c->restrict_realtime ||
1498 c->restrict_suid_sgid ||
78e864e5 1499 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1500 context_has_syscall_filters(c) ||
1501 context_has_syscall_logs(c);
469830d1
LP
1502}
1503
349cc4a5 1504#if HAVE_SECCOMP
17df7223 1505
83f12b27 1506static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1507
1508 if (is_seccomp_available())
1509 return false;
1510
f673b62d 1511 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1512 return true;
83f12b27
FS
1513}
1514
165a31c0 1515static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1516 uint32_t negative_action, default_action, action;
165a31c0 1517 int r;
8351ceae 1518
469830d1 1519 assert(u);
c0467cf3 1520 assert(c);
8351ceae 1521
469830d1 1522 if (!context_has_syscall_filters(c))
83f12b27
FS
1523 return 0;
1524
469830d1
LP
1525 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1526 return 0;
e9642be2 1527
005bfaf1 1528 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1529
6b000af4 1530 if (c->syscall_allow_list) {
469830d1
LP
1531 default_action = negative_action;
1532 action = SCMP_ACT_ALLOW;
7c66bae2 1533 } else {
469830d1
LP
1534 default_action = SCMP_ACT_ALLOW;
1535 action = negative_action;
57183d11 1536 }
8351ceae 1537
165a31c0 1538 if (needs_ambient_hack) {
6b000af4 1539 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1540 if (r < 0)
1541 return r;
1542 }
1543
b54f36c6 1544 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1545}
1546
9df2cdd8
TM
1547static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1548#ifdef SCMP_ACT_LOG
1549 uint32_t default_action, action;
1550#endif
1551
1552 assert(u);
1553 assert(c);
1554
1555 if (!context_has_syscall_logs(c))
1556 return 0;
1557
1558#ifdef SCMP_ACT_LOG
1559 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1560 return 0;
1561
1562 if (c->syscall_log_allow_list) {
1563 /* Log nothing but the ones listed */
1564 default_action = SCMP_ACT_ALLOW;
1565 action = SCMP_ACT_LOG;
1566 } else {
1567 /* Log everything but the ones listed */
1568 default_action = SCMP_ACT_LOG;
1569 action = SCMP_ACT_ALLOW;
1570 }
1571
1572 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1573#else
1574 /* old libseccomp */
1575 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1576 return 0;
1577#endif
1578}
1579
469830d1
LP
1580static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1581 assert(u);
4298d0b5
LP
1582 assert(c);
1583
469830d1 1584 if (set_isempty(c->syscall_archs))
83f12b27
FS
1585 return 0;
1586
469830d1
LP
1587 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1588 return 0;
4298d0b5 1589
469830d1
LP
1590 return seccomp_restrict_archs(c->syscall_archs);
1591}
4298d0b5 1592
469830d1
LP
1593static int apply_address_families(const Unit* u, const ExecContext *c) {
1594 assert(u);
1595 assert(c);
4298d0b5 1596
469830d1
LP
1597 if (!context_has_address_families(c))
1598 return 0;
4298d0b5 1599
469830d1
LP
1600 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1601 return 0;
4298d0b5 1602
6b000af4 1603 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1604}
4298d0b5 1605
83f12b27 1606static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
7a114ed4
TM
1607 int r;
1608
469830d1 1609 assert(u);
f3e43635
TM
1610 assert(c);
1611
469830d1 1612 if (!c->memory_deny_write_execute)
83f12b27
FS
1613 return 0;
1614
7a114ed4
TM
1615 /* use prctl() if kernel supports it (6.3) */
1616 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1617 if (r == 0) {
1618 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1619 return 0;
1620 }
1621 if (r < 0 && errno != EINVAL)
1622 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1623 /* else use seccomp */
1624 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1625
469830d1
LP
1626 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1627 return 0;
f3e43635 1628
469830d1 1629 return seccomp_memory_deny_write_execute();
f3e43635
TM
1630}
1631
83f12b27 1632static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1633 assert(u);
f4170c67
LP
1634 assert(c);
1635
469830d1 1636 if (!c->restrict_realtime)
83f12b27
FS
1637 return 0;
1638
469830d1
LP
1639 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1640 return 0;
f4170c67 1641
469830d1 1642 return seccomp_restrict_realtime();
f4170c67
LP
1643}
1644
f69567cb
LP
1645static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1646 assert(u);
1647 assert(c);
1648
1649 if (!c->restrict_suid_sgid)
1650 return 0;
1651
1652 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1653 return 0;
1654
1655 return seccomp_restrict_suid_sgid();
1656}
1657
59e856c7 1658static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1659 assert(u);
59eeb84b
LP
1660 assert(c);
1661
1662 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1663 * let's protect even those systems where this is left on in the kernel. */
1664
469830d1 1665 if (!c->protect_kernel_tunables)
59eeb84b
LP
1666 return 0;
1667
469830d1
LP
1668 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1669 return 0;
59eeb84b 1670
469830d1 1671 return seccomp_protect_sysctl();
59eeb84b
LP
1672}
1673
59e856c7 1674static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1675 assert(u);
502d704e
DH
1676 assert(c);
1677
25a8d8a0 1678 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1679
469830d1
LP
1680 if (!c->protect_kernel_modules)
1681 return 0;
1682
502d704e
DH
1683 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1684 return 0;
1685
b54f36c6 1686 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1687}
1688
84703040
KK
1689static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1690 assert(u);
1691 assert(c);
1692
1693 if (!c->protect_kernel_logs)
1694 return 0;
1695
1696 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1697 return 0;
1698
1699 return seccomp_protect_syslog();
1700}
1701
daf8f72b 1702static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1703 assert(u);
1704 assert(c);
1705
1706 if (!c->protect_clock)
1707 return 0;
1708
1709 if (skip_seccomp_unavailable(u, "ProtectClock="))
1710 return 0;
1711
1712 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1713}
1714
59e856c7 1715static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1716 assert(u);
ba128bb8
LP
1717 assert(c);
1718
8f81a5f6 1719 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1720
469830d1
LP
1721 if (!c->private_devices)
1722 return 0;
1723
ba128bb8
LP
1724 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1725 return 0;
1726
b54f36c6 1727 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1728}
1729
34cf6c43 1730static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1731 assert(u);
add00535
LP
1732 assert(c);
1733
1734 if (!exec_context_restrict_namespaces_set(c))
1735 return 0;
1736
1737 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1738 return 0;
1739
1740 return seccomp_restrict_namespaces(c->restrict_namespaces);
1741}
1742
78e864e5 1743static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1744 unsigned long personality;
1745 int r;
78e864e5
TM
1746
1747 assert(u);
1748 assert(c);
1749
1750 if (!c->lock_personality)
1751 return 0;
1752
1753 if (skip_seccomp_unavailable(u, "LockPersonality="))
1754 return 0;
1755
e8132d63
LP
1756 personality = c->personality;
1757
1758 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1759 if (personality == PERSONALITY_INVALID) {
1760
1761 r = opinionated_personality(&personality);
1762 if (r < 0)
1763 return r;
1764 }
78e864e5
TM
1765
1766 return seccomp_lock_personality(personality);
1767}
1768
c0467cf3 1769#endif
8351ceae 1770
7a8288f6 1771#if HAVE_LIBBPF
7a8288f6
DM
1772static int apply_restrict_filesystems(Unit *u, const ExecContext *c) {
1773 assert(u);
1774 assert(c);
1775
1776 if (!exec_context_restrict_filesystems_set(c))
1777 return 0;
1778
46004616
ZJS
1779 if (!u->manager->restrict_fs) {
1780 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1781 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1782 return 0;
46004616 1783 }
7a8288f6
DM
1784
1785 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, c->restrict_filesystems_allow_list);
1786}
1787#endif
1788
daf8f72b 1789static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1790 assert(u);
1791 assert(c);
1792
1793 if (!c->protect_hostname)
1794 return 0;
1795
1796 if (ns_type_supported(NAMESPACE_UTS)) {
1797 if (unshare(CLONE_NEWUTS) < 0) {
1798 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1799 *ret_exit_status = EXIT_NAMESPACE;
1800 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1801 }
1802
1803 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1804 }
1805 } else
1806 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1807
1808#if HAVE_SECCOMP
8f3e342f
ZJS
1809 int r;
1810
daf8f72b
LP
1811 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1812 return 0;
1813
1814 r = seccomp_protect_hostname();
1815 if (r < 0) {
1816 *ret_exit_status = EXIT_SECCOMP;
1817 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1818 }
1819#endif
1820
1821 return 0;
1822}
1823
3042bbeb 1824static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1825 assert(idle_pipe);
1826
54eb2300
LP
1827 idle_pipe[1] = safe_close(idle_pipe[1]);
1828 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1829
1830 if (idle_pipe[0] >= 0) {
1831 int r;
1832
1833 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1834
1835 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1836 ssize_t n;
1837
31a7eb86 1838 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1839 n = write(idle_pipe[3], "x", 1);
1840 if (n > 0)
cd972d69 1841 /* Wait for systemd to react to the signal above. */
54756dce 1842 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1843 }
1844
54eb2300 1845 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1846
1847 }
1848
54eb2300 1849 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1850}
1851
fb2042dd
YW
1852static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1853
7cae38c4 1854static int build_environment(
34cf6c43 1855 const Unit *u,
9fa95f85 1856 const ExecContext *c,
1e22b5cd 1857 const ExecParameters *p,
6bb00842 1858 const CGroupContext *cgroup_context,
da6053d0 1859 size_t n_fds,
cd48e23f 1860 char **fdnames,
7cae38c4
LP
1861 const char *home,
1862 const char *username,
1863 const char *shell,
7bce046b
LP
1864 dev_t journal_stream_dev,
1865 ino_t journal_stream_ino,
6bb00842 1866 const char *memory_pressure_path,
7cae38c4
LP
1867 char ***ret) {
1868
1869 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1870 size_t n_env = 0;
7cae38c4 1871 char *x;
4d62ee55 1872 int r;
7cae38c4 1873
4b58153d 1874 assert(u);
7cae38c4 1875 assert(c);
7c1cb6f1 1876 assert(p);
7cae38c4
LP
1877 assert(ret);
1878
6bb00842 1879#define N_ENV_VARS 19
8d5bb13d 1880 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1881 if (!our_env)
1882 return -ENOMEM;
1883
1884 if (n_fds > 0) {
8dd4c05b
LP
1885 _cleanup_free_ char *joined = NULL;
1886
df0ff127 1887 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1888 return -ENOMEM;
1889 our_env[n_env++] = x;
1890
da6053d0 1891 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1892 return -ENOMEM;
1893 our_env[n_env++] = x;
8dd4c05b 1894
cd48e23f 1895 joined = strv_join(fdnames, ":");
8dd4c05b
LP
1896 if (!joined)
1897 return -ENOMEM;
1898
605405c6 1899 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1900 if (!x)
1901 return -ENOMEM;
1902 our_env[n_env++] = x;
7cae38c4
LP
1903 }
1904
b08af3b1 1905 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1906 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1907 return -ENOMEM;
1908 our_env[n_env++] = x;
1909
1e22b5cd 1910 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1911 return -ENOMEM;
1912 our_env[n_env++] = x;
1913 }
1914
de90700f
LP
1915 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1916 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1917 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1918 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1919 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1920 if (!x)
1921 return -ENOMEM;
1922 our_env[n_env++] = x;
1923 }
1924
7cae38c4 1925 if (home) {
b910cc72 1926 x = strjoin("HOME=", home);
7cae38c4
LP
1927 if (!x)
1928 return -ENOMEM;
7bbead1d 1929
4ff361cc 1930 path_simplify(x + 5);
7cae38c4
LP
1931 our_env[n_env++] = x;
1932 }
1933
1934 if (username) {
b910cc72 1935 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1936 if (!x)
1937 return -ENOMEM;
1938 our_env[n_env++] = x;
1939
b910cc72 1940 x = strjoin("USER=", username);
7cae38c4
LP
1941 if (!x)
1942 return -ENOMEM;
1943 our_env[n_env++] = x;
1944 }
1945
1946 if (shell) {
b910cc72 1947 x = strjoin("SHELL=", shell);
7cae38c4
LP
1948 if (!x)
1949 return -ENOMEM;
7bbead1d 1950
4ff361cc 1951 path_simplify(x + 6);
7cae38c4
LP
1952 our_env[n_env++] = x;
1953 }
1954
4b58153d
LP
1955 if (!sd_id128_is_null(u->invocation_id)) {
1956 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1957 return -ENOMEM;
1958
1959 our_env[n_env++] = x;
1960 }
1961
6af760f3 1962 if (exec_context_needs_term(c)) {
4d62ee55 1963 _cleanup_free_ char *cmdline = NULL;
6af760f3
LP
1964 const char *tty_path, *term = NULL;
1965
1966 tty_path = exec_context_tty_path(c);
1967
e8cf09b2
LP
1968 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1969 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1970 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1971
e8cf09b2 1972 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1973 term = getenv("TERM");
4d62ee55
DDM
1974 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1975 _cleanup_free_ char *key = NULL;
1976
1977 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1978 if (!key)
1979 return -ENOMEM;
1980
1981 r = proc_cmdline_get_key(key, 0, &cmdline);
1982 if (r < 0)
1983 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
1984 else if (r > 0)
1985 term = cmdline;
1986 }
e8cf09b2 1987
6af760f3
LP
1988 if (!term)
1989 term = default_term_for_tty(tty_path);
7cae38c4 1990
b910cc72 1991 x = strjoin("TERM=", term);
7cae38c4
LP
1992 if (!x)
1993 return -ENOMEM;
1994 our_env[n_env++] = x;
1995 }
1996
7bce046b
LP
1997 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1998 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1999 return -ENOMEM;
2000
2001 our_env[n_env++] = x;
2002 }
2003
91dd5f7c
LP
2004 if (c->log_namespace) {
2005 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2006 if (!x)
2007 return -ENOMEM;
2008
2009 our_env[n_env++] = x;
2010 }
2011
5b10116e 2012 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 2013 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
2014 const char *n;
2015
2016 if (!p->prefix[t])
2017 continue;
2018
211a3d87 2019 if (c->directories[t].n_items == 0)
fb2042dd
YW
2020 continue;
2021
2022 n = exec_directory_env_name_to_string(t);
2023 if (!n)
2024 continue;
2025
211a3d87
LB
2026 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2027 _cleanup_free_ char *prefixed = NULL;
fb2042dd 2028
211a3d87
LB
2029 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2030 if (!prefixed)
2031 return -ENOMEM;
2032
2033 if (!strextend_with_separator(&joined, ":", prefixed))
2034 return -ENOMEM;
2035 }
fb2042dd
YW
2036
2037 x = strjoin(n, "=", joined);
2038 if (!x)
2039 return -ENOMEM;
2040
2041 our_env[n_env++] = x;
2042 }
2043
133e4de2
YW
2044 _cleanup_free_ char *creds_dir = NULL;
2045 r = exec_context_get_credential_directory(c, p, u->id, &creds_dir);
2046 if (r < 0)
2047 return r;
2048 if (r > 0) {
2049 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
bb0c0d6f
LP
2050 if (!x)
2051 return -ENOMEM;
2052
2053 our_env[n_env++] = x;
2054 }
2055
dc4e2940
YW
2056 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2057 return -ENOMEM;
2058
2059 our_env[n_env++] = x;
2060
6bb00842
LP
2061 if (memory_pressure_path) {
2062 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2063 if (!x)
2064 return -ENOMEM;
2065
2066 our_env[n_env++] = x;
2067
2068 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2069 _cleanup_free_ char *b = NULL, *e = NULL;
2070
2071 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2072 MEMORY_PRESSURE_DEFAULT_TYPE,
2073 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2074 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2075 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2076 return -ENOMEM;
2077
2078 if (base64mem(b, strlen(b) + 1, &e) < 0)
2079 return -ENOMEM;
2080
2081 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2082 if (!x)
2083 return -ENOMEM;
2084
2085 our_env[n_env++] = x;
2086 }
2087 }
2088
2089 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
8d5bb13d 2090#undef N_ENV_VARS
7cae38c4 2091
ae2a15bc 2092 *ret = TAKE_PTR(our_env);
7cae38c4
LP
2093
2094 return 0;
2095}
2096
b4c14404
FB
2097static int build_pass_environment(const ExecContext *c, char ***ret) {
2098 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2099 size_t n_env = 0;
b4c14404
FB
2100
2101 STRV_FOREACH(i, c->pass_environment) {
2102 _cleanup_free_ char *x = NULL;
2103 char *v;
2104
2105 v = getenv(*i);
2106 if (!v)
2107 continue;
605405c6 2108 x = strjoin(*i, "=", v);
b4c14404
FB
2109 if (!x)
2110 return -ENOMEM;
00819cc1 2111
319a4f4b 2112 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2113 return -ENOMEM;
00819cc1 2114
1cc6c93a 2115 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2116 pass_env[n_env] = NULL;
b4c14404
FB
2117 }
2118
ae2a15bc 2119 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2120
2121 return 0;
2122}
2123
fbbb9697
YW
2124bool exec_needs_network_namespace(const ExecContext *context) {
2125 assert(context);
2126
2127 return context->private_network || context->network_namespace_path;
2128}
2129
9c0c6701
DDM
2130static bool exec_needs_ephemeral(const ExecContext *context) {
2131 return (context->root_image || context->root_directory) && context->root_ephemeral;
2132}
2133
fde36d25
YW
2134static bool exec_needs_ipc_namespace(const ExecContext *context) {
2135 assert(context);
2136
2137 return context->private_ipc || context->ipc_namespace_path;
2138}
2139
5e8deb94 2140bool exec_needs_mount_namespace(
8b44a3d2
LP
2141 const ExecContext *context,
2142 const ExecParameters *params,
28135da3 2143 const ExecRuntime *runtime) {
8b44a3d2
LP
2144
2145 assert(context);
8b44a3d2 2146
915e6d16
LP
2147 if (context->root_image)
2148 return true;
2149
2a624c36
AP
2150 if (!strv_isempty(context->read_write_paths) ||
2151 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2152 !strv_isempty(context->inaccessible_paths) ||
2153 !strv_isempty(context->exec_paths) ||
2154 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2155 return true;
2156
42b1d8e0 2157 if (context->n_bind_mounts > 0)
d2d6c096
LP
2158 return true;
2159
2abd4e38
YW
2160 if (context->n_temporary_filesystems > 0)
2161 return true;
2162
b3d13314
LB
2163 if (context->n_mount_images > 0)
2164 return true;
2165
93f59701
LB
2166 if (context->n_extension_images > 0)
2167 return true;
2168
a07b9926
LB
2169 if (!strv_isempty(context->extension_directories))
2170 return true;
2171
874cdcbc 2172 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
8b44a3d2
LP
2173 return true;
2174
28135da3 2175 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
8b44a3d2
LP
2176 return true;
2177
8b44a3d2 2178 if (context->private_devices ||
24002121 2179 context->private_mounts > 0 ||
c2da3bf2 2180 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
8b44a3d2 2181 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2182 context->protect_home != PROTECT_HOME_NO ||
2183 context->protect_kernel_tunables ||
c575770b 2184 context->protect_kernel_modules ||
94a7b275 2185 context->protect_kernel_logs ||
4e399953
LP
2186 context->protect_control_groups ||
2187 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44 2188 context->proc_subset != PROC_SUBSET_ALL ||
fde36d25 2189 exec_needs_ipc_namespace(context))
8b44a3d2
LP
2190 return true;
2191
37c56f89 2192 if (context->root_directory) {
5e98086d 2193 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2194 return true;
2195
5b10116e 2196 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2197 if (params && !params->prefix[t])
37c56f89
YW
2198 continue;
2199
211a3d87 2200 if (context->directories[t].n_items > 0)
37c56f89
YW
2201 return true;
2202 }
2203 }
5d997827 2204
42b1d8e0 2205 if (context->dynamic_user &&
211a3d87
LB
2206 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2207 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2208 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2209 return true;
2210
91dd5f7c
LP
2211 if (context->log_namespace)
2212 return true;
2213
8b44a3d2
LP
2214 return false;
2215}
2216
/* Moves the calling process into a new user namespace via unshare(CLONE_NEWUSER) and lets a forked
 * helper, which stays in the original namespace, write the "setgroups", "gid_map" and "uid_map"
 * procfs files of the caller. ouid/ogid are always mapped to themselves; uid/gid are mapped in
 * addition only if they are valid, differ from ouid/ogid and the respective CAP_SETUID/CAP_SETGID
 * check succeeds. Returns 0 on success and a negative errno-style value on failure. */
5749f855 2217static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d 2218 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
19ee48a6 2219 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
254d1313 2220 _cleanup_close_ int unshare_ready_fd = -EBADF;
d251207d
LP
2221 _cleanup_(sigkill_waitp) pid_t pid = 0;
2222 uint64_t c = 1;
d251207d
LP
2223 ssize_t n;
2224 int r;
2225
5749f855
AZ
2226 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2227 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2228 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2229 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2230 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2231 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2232 * continues execution normally.
2233 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2234 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2235
5749f855 2236 /* Can only set up multiple mappings with CAP_SETUID. */
26c45a6c 2237 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
587ab01b 2238 r = asprintf(&uid_map,
5749f855 2239 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2240 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2241 ouid, ouid, uid, uid);
2242 else
2243 r = asprintf(&uid_map,
2244 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2245 ouid, ouid);
d251207d 2246
5749f855
AZ
2247 if (r < 0)
2248 return -ENOMEM;
2249
2250 /* Can only set up multiple mappings with CAP_SETGID. */
26c45a6c 2251 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
587ab01b 2252 r = asprintf(&gid_map,
5749f855 2253 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2254 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2255 ogid, ogid, gid, gid);
2256 else
2257 r = asprintf(&gid_map,
2258 GID_FMT " " GID_FMT " 1\n", /* Map $OGID → $OGID */
2259 ogid, ogid);
2260
2261 if (r < 0)
2262 return -ENOMEM;
d251207d
LP
2263
2264 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2265 * namespace. */
2266 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2267 if (unshare_ready_fd < 0)
2268 return -errno;
2269
2270 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2271 * failed. */
2272 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2273 return -errno;
2274
4c253ed1
LP
2275 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2276 if (r < 0)
2277 return r;
2278 if (r == 0) {
254d1313 2279 _cleanup_close_ int fd = -EBADF;
d251207d
LP
2280 const char *a;
2281 pid_t ppid;
2282
2283 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2284 * here, after the parent opened its own user namespace. */
2285
2286 ppid = getppid();
/* The child only ever writes to the error pipe, hence close the read end here. */
2287 errno_pipe[0] = safe_close(errno_pipe[0]);
2288
2289 /* Wait until the parent unshared the user namespace */
2290 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2291 r = -errno;
2292 goto child_fail;
2293 }
2294
2295 /* Disable the setgroups() system call in the child user namespace, for good. */
2296 a = procfs_file_alloca(ppid, "setgroups");
2297 fd = open(a, O_WRONLY|O_CLOEXEC);
2298 if (fd < 0) {
2299 if (errno != ENOENT) {
2300 r = -errno;
2301 goto child_fail;
2302 }
2303
2304 /* If the file is missing the kernel is too old, let's continue anyway. */
2305 } else {
2306 if (write(fd, "deny\n", 5) < 0) {
2307 r = -errno;
2308 goto child_fail;
2309 }
2310
2311 fd = safe_close(fd);
2312 }
2313
2314 /* First write the GID map */
2315 a = procfs_file_alloca(ppid, "gid_map");
2316 fd = open(a, O_WRONLY|O_CLOEXEC);
2317 if (fd < 0) {
2318 r = -errno;
2319 goto child_fail;
2320 }
2321 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2322 r = -errno;
2323 goto child_fail;
2324 }
2325 fd = safe_close(fd);
2326
2327 /* Then write the UID map */
2328 a = procfs_file_alloca(ppid, "uid_map");
2329 fd = open(a, O_WRONLY|O_CLOEXEC);
2330 if (fd < 0) {
2331 r = -errno;
2332 goto child_fail;
2333 }
2334 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2335 r = -errno;
2336 goto child_fail;
2337 }
2338
2339 _exit(EXIT_SUCCESS);
2340
2341 child_fail:
/* Report the negative errno to the parent through the pipe; a failure of this write is ignored. */
2342 (void) write(errno_pipe[1], &r, sizeof(r));
2343 _exit(EXIT_FAILURE);
2344 }
2345
/* The parent only reads from the error pipe. With our write end closed, the read() below sees
 * end-of-file (0 bytes) once the child is gone without having reported an error. */
2346 errno_pipe[1] = safe_close(errno_pipe[1]);
2347
2348 if (unshare(CLONE_NEWUSER) < 0)
2349 return -errno;
2350
2351 /* Let the child know that the namespace is ready now */
2352 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2353 return -errno;
2354
2355 /* Try to read an error code from the child */
2356 n = read(errno_pipe[0], &r, sizeof(r));
2357 if (n < 0)
2358 return -errno;
2359 if (n == sizeof(r)) { /* an error code was sent to us */
2360 if (r < 0)
2361 return r;
2362 return -EIO;
2363 }
2364 if (n != 0) /* on success we should have read 0 bytes */
2365 return -EIO;
2366
8f03de53 2367 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2368 if (r < 0)
2369 return r;
2e87a1fd 2370 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2371 return -EIO;
2372
2373 return 0;
2374}
2375
494d0247 2376static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
e43911a7
LP
2377 assert(context);
2378
494d0247
YW
2379 if (!context->dynamic_user)
2380 return false;
2381
2382 if (type == EXEC_DIRECTORY_CONFIGURATION)
2383 return false;
2384
2385 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2386 return false;
2387
2388 return true;
2389}
2390
211a3d87
LB
2391static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2392 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2393 int r;
2394
2395 assert(source);
2396
2397 src_abs = path_join(root, source);
2398 if (!src_abs)
2399 return -ENOMEM;
2400
2401 STRV_FOREACH(dst, symlinks) {
2402 _cleanup_free_ char *dst_abs = NULL;
2403
2404 dst_abs = path_join(root, *dst);
2405 if (!dst_abs)
2406 return -ENOMEM;
2407
2408 r = mkdir_parents_label(dst_abs, 0755);
2409 if (r < 0)
2410 return r;
2411
2412 r = symlink_idempotent(src_abs, dst_abs, true);
2413 if (r < 0)
2414 return r;
2415 }
2416
2417 return 0;
2418}
2419
/* Creates (or migrates) every directory configured in context->directories[type] below
 * params->prefix[type]. Unless an item is finished early via 'continue', the configured access mode
 * is then applied and, when running with RUNTIME_SCOPE_SYSTEM, the tree is recursively chowned to
 * uid/gid. Does nothing if no prefix is set for this directory type. If needs_mount_namespace is
 * false, the per-item symlinks are created here as well.
 *
 * Returns 0 on success. On failure a negative errno-style value is returned and the exit status
 * matching the directory type (see exit_status_table) is stored in *exit_status. */
3536f49e 2420static int setup_exec_directory(
59dd2bbb 2421 Unit *u,
07689d5d
LP
2422 const ExecContext *context,
2423 const ExecParameters *params,
2424 uid_t uid,
3536f49e 2425 gid_t gid,
3536f49e 2426 ExecDirectoryType type,
211a3d87 2427 bool needs_mount_namespace,
3536f49e 2428 int *exit_status) {
07689d5d 2429
72fd1768 2430 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2431 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2432 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2433 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2434 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2435 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2436 };
07689d5d
LP
2437 int r;
2438
2439 assert(context);
2440 assert(params);
72fd1768 2441 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2442 assert(exit_status);
07689d5d 2443
3536f49e
YW
2444 if (!params->prefix[type])
2445 return 0;
2446
/* If EXEC_CHOWN_DIRECTORIES is set, an invalid UID/GID falls back to 0. */
8679efde 2447 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2448 if (!uid_is_valid(uid))
2449 uid = 0;
2450 if (!gid_is_valid(gid))
2451 gid = 0;
2452 }
2453
211a3d87 2454 for (size_t i = 0; i < context->directories[type].n_items; i++) {
/* p is the regular path below the prefix; pp is only set in the private (DynamicUser=) case and
 * then refers to the path below "<prefix>/private". */
6c47cd7d 2455 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2456
211a3d87 2457 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2458 if (!p) {
2459 r = -ENOMEM;
2460 goto fail;
2461 }
07689d5d 2462
23a7448e
YW
2463 r = mkdir_parents_label(p, 0755);
2464 if (r < 0)
3536f49e 2465 goto fail;
23a7448e 2466
f9c91932
LP
2467 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2468
2469 /* If we are in user mode, and a configuration directory exists but a state directory
2470 * doesn't exist, then we likely are upgrading from an older systemd version that
2471 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2472 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2473 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME it is now
627cdcc7 2474 * separated. If a service has both dirs configured but only the configuration dir
f9c91932
LP
2475 * exists and the state dir does not, we assume we are looking at an update
2476 * situation. Hence, create a compatibility symlink, so that all expectations are
2477 * met.
2478 *
2479 * (We also do something similar with the log directory, which still doesn't exist in
2480 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2481
2482 /* this assumes the state dir is always created before the configuration dir */
2483 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2484 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2485
2486 r = laccess(p, F_OK);
2487 if (r == -ENOENT) {
2488 _cleanup_free_ char *q = NULL;
2489
2490 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2491 * under the configuration hierarchy. */
2492
2493 if (type == EXEC_DIRECTORY_STATE)
2494 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2495 else if (type == EXEC_DIRECTORY_LOGS)
2496 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2497 else
2498 assert_not_reached();
2499 if (!q) {
2500 r = -ENOMEM;
2501 goto fail;
2502 }
2503
2504 r = laccess(q, F_OK);
2505 if (r >= 0) {
2506 /* It does exist! This hence looks like an update. Symlink the
2507 * configuration directory into the state directory. */
2508
2509 r = symlink_idempotent(q, p, /* make_relative= */ true);
2510 if (r < 0)
2511 goto fail;
2512
59dd2bbb 2513 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
f9c91932
LP
/* The compatibility symlink is all we set up for this item: no mkdir, chmod or chown follows. */
2514 continue;
2515 } else if (r != -ENOENT)
59dd2bbb 2516 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
f9c91932
LP
2517
2518 } else if (r < 0)
59dd2bbb 2519 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
f9c91932
LP
2520 }
2521
494d0247 2522 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2523 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2524 * case we want to avoid leaving a directory around fully accessible that is owned by
2525 * a dynamic user whose UID is later on reused. To lock this down we use the same
2526 * trick used by container managers to prohibit host users to get access to files of
2527 * the same UID in containers: we place everything inside a directory that has an
2528 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2529 * for unprivileged host code. We then use fs namespacing to make this directory
2530 * permeable for the service itself.
6c47cd7d 2531 *
3f5b1508
LP
2532 * Specifically: for a service which wants a special directory "foo/" we first create
2533 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2534 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2535 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2536 * unprivileged host users can't look into it. Inside of the namespace of the unit
2537 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2538 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2539 * for the service and making sure it only gets access to the dirs it needs but no
2540 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2541 *
3f5b1508
LP
2542 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2543 * to be owned by the service itself.
2544 *
2545 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2546 * for sharing files or sockets with other services. */
6c47cd7d 2547
4ede9802
LP
2548 pp = path_join(params->prefix[type], "private");
2549 if (!pp) {
6c47cd7d
LP
2550 r = -ENOMEM;
2551 goto fail;
2552 }
2553
2554 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2555 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2556 if (r < 0)
2557 goto fail;
2558
/* From here on pp is "<prefix>/private/<configured path>". */
211a3d87 2559 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2560 r = -ENOMEM;
2561 goto fail;
2562 }
2563
2564 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2565 r = mkdir_parents_label(pp, 0755);
2566 if (r < 0)
2567 goto fail;
2568
949befd3 2569 if (is_dir(p, false) > 0 &&
b93d24e0 2570 (laccess(pp, F_OK) == -ENOENT)) {
949befd3
LP
2571
2572 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2573 * it over. Most likely the service has been upgraded from one that didn't use
2574 * DynamicUser=1, to one that does. */
2575
59dd2bbb
LP
2576 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2577 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2578 exec_directory_type_to_string(type), p, pp);
cf52c45d 2579
db58f5de
LP
2580 r = RET_NERRNO(rename(p, pp));
2581 if (r < 0)
949befd3 2582 goto fail;
949befd3
LP
2583 } else {
2584 /* Otherwise, create the actual directory for the service */
2585
2586 r = mkdir_label(pp, context->directories[type].mode);
2587 if (r < 0 && r != -EEXIST)
2588 goto fail;
2589 }
6c47cd7d 2590
a2ab603c
YW
2591 if (!context->directories[type].items[i].only_create) {
2592 /* And link it up from the original place.
2593 * Notes
2594 * 1) If a mount namespace is going to be used, then this symlink remains on
2595 * the host, and a new one for the child namespace will be created later.
2596 * 2) It is not necessary to create this symlink when one of its parent
2597 * directories is specified and already created. E.g.
2598 * StateDirectory=foo foo/bar
2599 * In that case, the inode points to pp and p for "foo/bar" are the same:
2600 * pp = "/var/lib/private/foo/bar"
2601 * p = "/var/lib/foo/bar"
2602 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2603 * we do not need to create the symlink, but we cannot create the symlink.
2604 * See issue #24783. */
2605 r = symlink_idempotent(pp, p, true);
2606 if (r < 0)
2607 goto fail;
2608 }
6c47cd7d 2609
6c47cd7d 2610 } else {
5c6d40d1
LP
2611 _cleanup_free_ char *target = NULL;
2612
2613 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2614 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2615 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2616
2617 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2618 * by DynamicUser=1 (see above)?
2619 *
2620 * We do this for all directory types except for ConfigurationDirectory=,
2621 * since they all support the private/ symlink logic at least in some
2622 * configurations, see above. */
5c6d40d1 2623
f461a28d 2624 r = chase(target, NULL, 0, &target_resolved, NULL);
578dc69f
YW
2625 if (r < 0)
2626 goto fail;
2627
211a3d87 2628 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2629 if (!q) {
2630 r = -ENOMEM;
2631 goto fail;
2632 }
2633
578dc69f 2634 /* /var/lib or friends may be symlinks. So, let's chase them also. */
f461a28d 2635 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
578dc69f
YW
2636 if (r < 0)
2637 goto fail;
2638
2639 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2640
2641 /* Hmm, apparently DynamicUser= was once turned on for this service,
2642 * but is no longer. Let's move the directory back up. */
2643
59dd2bbb
LP
2644 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2645 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2646 exec_directory_type_to_string(type), q, p);
cf52c45d 2647
/* Remove the symlink at p first, so that the private directory can be renamed into its place. */
db58f5de
LP
2648 r = RET_NERRNO(unlink(p));
2649 if (r < 0)
5c6d40d1 2650 goto fail;
5c6d40d1 2651
db58f5de
LP
2652 r = RET_NERRNO(rename(q, p));
2653 if (r < 0)
5c6d40d1 2654 goto fail;
5c6d40d1
LP
2655 }
2656 }
2657
6c47cd7d 2658 r = mkdir_label(p, context->directories[type].mode);
d484580c 2659 if (r < 0) {
d484580c
LP
2660 if (r != -EEXIST)
2661 goto fail;
2662
206e9864
LP
2663 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2664 struct stat st;
2665
2666 /* Don't change the owner/access mode of the configuration directory,
2667 * as in the common case it is not written to by a service, and shall
2668 * not be writable. */
2669
db58f5de
LP
2670 r = RET_NERRNO(stat(p, &st));
2671 if (r < 0)
206e9864 2672 goto fail;
206e9864
LP
2673
2674 /* Still complain if the access mode doesn't match */
2675 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
59dd2bbb
LP
2676 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2677 "(File system: %o %sMode: %o)",
2678 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2679 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
206e9864 2680
6cff72eb 2681 continue;
206e9864 2682 }
6cff72eb 2683 }
a1164ae3 2684 }
07689d5d 2685
206e9864 2686 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2687 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2688 * current UID/GID ownership.) */
2689 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2690 if (r < 0)
2691 goto fail;
c71b2eb7 2692
f5bb36dc
LP
2693 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2694 * available to user code anyway */
2695 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2696 continue;
2697
607b358e
LP
2698 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2699 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2700 * assignments to exist. */
d5602c16 2701 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
07689d5d 2702 if (r < 0)
3536f49e 2703 goto fail;
07689d5d
LP
2704 }
2705
211a3d87
LB
2706 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2707 * they are set up later, to allow configuring empty var/run/etc. */
2708 if (!needs_mount_namespace)
2709 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2710 r = create_many_symlinks(params->prefix[type],
2711 context->directories[type].items[i].path,
2712 context->directories[type].items[i].symlinks);
2713 if (r < 0)
2714 goto fail;
2715 }
2716
07689d5d 2717 return 0;
3536f49e
YW
2718
2719fail:
/* Tell the caller which exit status corresponds to a failure for this directory type. */
2720 *exit_status = exit_status_table[type];
3536f49e 2721 return r;
07689d5d
LP
2722}
2723
#if ENABLE_SMACK
/* Applies a SMACK label to PID 0 via mac_smack_apply_pid(). The label configured in the context
 * wins. Otherwise, if the manager has a default process label, the SMACK_ATTR_EXEC label read from
 * executable_fd is used, falling back to that manager default if none was read. If neither the
 * context nor the manager defines a label, nothing is done. Returns 0 on success, negative on
 * failure. */
static int setup_smack(
                const Manager *manager,
                const ExecContext *context,
                int executable_fd) {

        _cleanup_free_ char *exec_label = NULL;
        const char *label;
        int r;

        assert(context);
        assert(executable_fd >= 0);

        label = context->smack_process_label;
        if (!label) {
                if (!manager->defaults.smack_process_label)
                        return 0;

                /* A missing xattr is fine, we then use the manager's default label */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
                        return r;

                label = exec_label ?: manager->defaults.smack_process_label;
        }

        r = mac_smack_apply_pid(0, label);
        if (r < 0)
                return r;

        return 0;
}
#endif
cefc33ae 2753
6c47cd7d
LP
2754static int compile_bind_mounts(
2755 const ExecContext *context,
2756 const ExecParameters *params,
2757 BindMount **ret_bind_mounts,
da6053d0 2758 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2759 char ***ret_empty_directories) {
2760
2761 _cleanup_strv_free_ char **empty_directories = NULL;
ed8267c7 2762 BindMount *bind_mounts = NULL;
5b10116e 2763 size_t n, h = 0;
6c47cd7d
LP
2764 int r;
2765
2766 assert(context);
2767 assert(params);
2768 assert(ret_bind_mounts);
2769 assert(ret_n_bind_mounts);
2770 assert(ret_empty_directories);
2771
ed8267c7
DT
2772 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2773
6c47cd7d 2774 n = context->n_bind_mounts;
5b10116e 2775 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2776 if (!params->prefix[t])
2777 continue;
2778
a2ab603c
YW
2779 for (size_t i = 0; i < context->directories[t].n_items; i++)
2780 n += !context->directories[t].items[i].only_create;
6c47cd7d
LP
2781 }
2782
2783 if (n <= 0) {
2784 *ret_bind_mounts = NULL;
2785 *ret_n_bind_mounts = 0;
2786 *ret_empty_directories = NULL;
2787 return 0;
2788 }
2789
2790 bind_mounts = new(BindMount, n);
2791 if (!bind_mounts)
2792 return -ENOMEM;
2793
5b10116e 2794 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d 2795 BindMount *item = context->bind_mounts + i;
93404d34 2796 _cleanup_free_ char *s = NULL, *d = NULL;
6c47cd7d
LP
2797
2798 s = strdup(item->source);
ed8267c7
DT
2799 if (!s)
2800 return -ENOMEM;
6c47cd7d
LP
2801
2802 d = strdup(item->destination);
93404d34 2803 if (!d)
ed8267c7 2804 return -ENOMEM;
6c47cd7d
LP
2805
2806 bind_mounts[h++] = (BindMount) {
93404d34
DT
2807 .source = TAKE_PTR(s),
2808 .destination = TAKE_PTR(d),
6c47cd7d
LP
2809 .read_only = item->read_only,
2810 .recursive = item->recursive,
2811 .ignore_enoent = item->ignore_enoent,
2812 };
2813 }
2814
5b10116e 2815 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2816 if (!params->prefix[t])
2817 continue;
2818
211a3d87 2819 if (context->directories[t].n_items == 0)
6c47cd7d
LP
2820 continue;
2821
494d0247 2822 if (exec_directory_is_private(context, t) &&
74e12520 2823 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
2824 char *private_root;
2825
2826 /* So this is for a dynamic user, and we need to make sure the process can access its own
2827 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2828 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2829
657ee2d8 2830 private_root = path_join(params->prefix[t], "private");
ed8267c7
DT
2831 if (!private_root)
2832 return -ENOMEM;
6c47cd7d
LP
2833
2834 r = strv_consume(&empty_directories, private_root);
a635a7ae 2835 if (r < 0)
ed8267c7 2836 return r;
6c47cd7d
LP
2837 }
2838
211a3d87 2839 for (size_t i = 0; i < context->directories[t].n_items; i++) {
93404d34 2840 _cleanup_free_ char *s = NULL, *d = NULL;
6c47cd7d 2841
a2ab603c
YW
2842 /* When one of the parent directories is in the list, we cannot create the symlink
2843 * for the child directory. See also the comments in setup_exec_directory(). */
2844 if (context->directories[t].items[i].only_create)
2845 continue;
2846
494d0247 2847 if (exec_directory_is_private(context, t))
211a3d87 2848 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 2849 else
211a3d87 2850 s = path_join(params->prefix[t], context->directories[t].items[i].path);
ed8267c7
DT
2851 if (!s)
2852 return -ENOMEM;
6c47cd7d 2853
494d0247 2854 if (exec_directory_is_private(context, t) &&
74e12520 2855 exec_context_with_rootfs(context))
5609f688
YW
2856 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2857 * directory is not created on the root directory. So, let's bind-mount the directory
2858 * on the 'non-private' place. */
211a3d87 2859 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
2860 else
2861 d = strdup(s);
93404d34 2862 if (!d)
ed8267c7 2863 return -ENOMEM;
6c47cd7d
LP
2864
2865 bind_mounts[h++] = (BindMount) {
93404d34
DT
2866 .source = TAKE_PTR(s),
2867 .destination = TAKE_PTR(d),
6c47cd7d 2868 .read_only = false,
9ce4e4b0 2869 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
2870 .recursive = true,
2871 .ignore_enoent = false,
2872 };
2873 }
2874 }
2875
2876 assert(h == n);
2877
ed8267c7 2878 *ret_bind_mounts = TAKE_PTR(bind_mounts);
6c47cd7d 2879 *ret_n_bind_mounts = n;
ae2a15bc 2880 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
2881
2882 return (int) n;
6c47cd7d
LP
2883}
2884
df61e79a
LB
2885/* ret_symlinks will contain a list of pairs src:dest that describes
2886 * the symlinks to create later on. For example, the symlinks needed
2887 * to safely give private directories to DynamicUser=1 users. */
2888static int compile_symlinks(
2889 const ExecContext *context,
2890 const ExecParameters *params,
663e2756 2891 bool setup_os_release_symlink,
df61e79a
LB
2892 char ***ret_symlinks) {
2893
2894 _cleanup_strv_free_ char **symlinks = NULL;
2895 int r;
2896
2897 assert(context);
2898 assert(params);
2899 assert(ret_symlinks);
2900
2901 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
2902 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2903 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 2904
211a3d87
LB
2905 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2906 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 2907
211a3d87
LB
2908 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2909 dst_abs = path_join(params->prefix[dt], *symlink);
2910 if (!src_abs || !dst_abs)
2911 return -ENOMEM;
df61e79a 2912
211a3d87
LB
2913 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2914 if (r < 0)
2915 return r;
2916 }
2917
a2ab603c
YW
2918 if (!exec_directory_is_private(context, dt) ||
2919 exec_context_with_rootfs(context) ||
2920 context->directories[dt].items[i].only_create)
211a3d87
LB
2921 continue;
2922
2923 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
2924 if (!private_path)
2925 return -ENOMEM;
2926
211a3d87 2927 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
2928 if (!path)
2929 return -ENOMEM;
2930
2931 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2932 if (r < 0)
2933 return r;
2934 }
2935 }
2936
663e2756
LB
2937 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2938 * and readers will never get a half-written version. Note that, while the paths specified here are
2939 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2940 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2941 if (setup_os_release_symlink) {
2942 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2943 if (r < 0)
2944 return r;
2945
2946 r = strv_extend(&symlinks, "/run/host/os-release");
2947 if (r < 0)
2948 return r;
2949 }
2950
df61e79a
LB
2951 *ret_symlinks = TAKE_PTR(symlinks);
2952
2953 return 0;
2954}
2955
4e677599
LP
2956static bool insist_on_sandboxing(
2957 const ExecContext *context,
2958 const char *root_dir,
2959 const char *root_image,
2960 const BindMount *bind_mounts,
2961 size_t n_bind_mounts) {
2962
4e677599
LP
2963 assert(context);
2964 assert(n_bind_mounts == 0 || bind_mounts);
2965
2966 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 2967 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
2968 * rearrange stuff in a way we cannot ignore gracefully. */
2969
2970 if (context->n_temporary_filesystems > 0)
2971 return true;
2972
2973 if (root_dir || root_image)
2974 return true;
2975
b3d13314
LB
2976 if (context->n_mount_images > 0)
2977 return true;
2978
4e677599
LP
2979 if (context->dynamic_user)
2980 return true;
2981
4355c04f
LB
2982 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
2983 return true;
2984
4e677599
LP
2985 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
2986 * essential. */
5b10116e 2987 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
2988 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
2989 return true;
2990
91dd5f7c
LP
2991 if (context->log_namespace)
2992 return true;
2993
4e677599
LP
2994 return false;
2995}
2996
9c0c6701
DDM
2997static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
2998 _cleanup_close_ int fd = -EBADF;
2999 int r;
3000
3001 if (!runtime || !runtime->ephemeral_copy)
3002 return 0;
3003
3004 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3005 if (r < 0)
3006 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3007
3008 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3009
3010 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3011 if (fd >= 0)
3012 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3013 return 0;
3014
3015 if (fd != -EAGAIN)
3016 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3017
3018 log_debug("Making ephemeral snapshot of %s to %s",
3019 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3020
3021 if (context->root_image)
3022 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3023 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3024 else
3025 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3026 AT_FDCWD, runtime->ephemeral_copy,
3027 BTRFS_SNAPSHOT_FALLBACK_COPY |
3028 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3029 BTRFS_SNAPSHOT_RECURSIVE |
3030 BTRFS_SNAPSHOT_LOCK_BSD);
3031 if (fd < 0)
3032 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3033 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3034
3035 if (context->root_image) {
3036 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3037 * which tends to not perform well in combination with lots of random writes.
3038 *
3039 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3040 * copy, but we at least want to make the intention clear.
3041 */
3042 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3043 if (r < 0)
3044 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3045 }
3046
3047 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3048 if (r < 0)
3049 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3050
3051 return 1;
3052}
3053
66130f0a
DDM
3054static int verity_settings_prepare(
3055 VeritySettings *verity,
3056 const char *root_image,
3057 const void *root_hash,
3058 size_t root_hash_size,
3059 const char *root_hash_path,
3060 const void *root_hash_sig,
3061 size_t root_hash_sig_size,
3062 const char *root_hash_sig_path,
3063 const char *verity_data_path) {
3064
3065 int r;
3066
3067 assert(verity);
3068
3069 if (root_hash) {
3070 void *d;
3071
3072 d = memdup(root_hash, root_hash_size);
3073 if (!d)
3074 return -ENOMEM;
3075
3076 free_and_replace(verity->root_hash, d);
3077 verity->root_hash_size = root_hash_size;
3078 verity->designator = PARTITION_ROOT;
3079 }
3080
3081 if (root_hash_sig) {
3082 void *d;
3083
3084 d = memdup(root_hash_sig, root_hash_sig_size);
3085 if (!d)
3086 return -ENOMEM;
3087
3088 free_and_replace(verity->root_hash_sig, d);
3089 verity->root_hash_sig_size = root_hash_sig_size;
3090 verity->designator = PARTITION_ROOT;
3091 }
3092
3093 if (verity_data_path) {
3094 r = free_and_strdup(&verity->data_path, verity_data_path);
3095 if (r < 0)
3096 return r;
3097 }
3098
3099 r = verity_settings_load(
3100 verity,
3101 root_image,
3102 root_hash_path,
3103 root_hash_sig_path);
3104 if (r < 0)
3105 return log_debug_errno(r, "Failed to load root hash: %m");
3106
3107 return 0;
3108}
3109
6818c54c 3110static int apply_mount_namespace(
34cf6c43 3111 const Unit *u,
9f71ba8d 3112 ExecCommandFlags command_flags,
6818c54c
LP
3113 const ExecContext *context,
3114 const ExecParameters *params,
9c0c6701 3115 ExecRuntime *runtime,
d4b6ec98 3116 const char *memory_pressure_path,
7cc5ef5f 3117 char **error_path) {
6818c54c 3118
66130f0a 3119 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
d4b6ec98
LB
3120 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3121 **read_write_paths_cleanup = NULL;
73ff4d48 3122 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
663e2756 3123 *extension_dir = NULL, *host_os_release_stage = NULL;
66130f0a 3124 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
d4b6ec98 3125 char **read_write_paths;
228af36f 3126 NamespaceInfo ns_info;
663e2756 3127 bool needs_sandboxing, setup_os_release_symlink;
6c47cd7d 3128 BindMount *bind_mounts = NULL;
da6053d0 3129 size_t n_bind_mounts = 0;
6818c54c 3130 int r;
93c6bb51 3131
2b3c1b9e
DH
3132 assert(context);
3133
29933daf
DT
3134 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3135
915e6d16 3136 if (params->flags & EXEC_APPLY_CHROOT) {
9c0c6701
DDM
3137 r = setup_ephemeral(context, runtime);
3138 if (r < 0)
3139 return r;
915e6d16 3140
9c0c6701
DDM
3141 if (context->root_image)
3142 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3143 else
3144 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
915e6d16 3145 }
93c6bb51 3146
6c47cd7d
LP
3147 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3148 if (r < 0)
3149 return r;
3150
d4b6ec98
LB
3151 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3152 * service will need to write to it in order to start the notifications. */
3153 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3154 read_write_paths_cleanup = strv_copy(context->read_write_paths);
29933daf
DT
3155 if (!read_write_paths_cleanup)
3156 return -ENOMEM;
d4b6ec98
LB
3157
3158 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3159 if (r < 0)
29933daf 3160 return r;
d4b6ec98
LB
3161
3162 read_write_paths = read_write_paths_cleanup;
3163 } else
3164 read_write_paths = context->read_write_paths;
3165
9f71ba8d 3166 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3167 if (needs_sandboxing) {
3168 /* The runtime struct only contains the parent of the private /tmp,
3169 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3170 * that is sticky, and that's the one we want to use here.
3171 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91 3172
28135da3
DDM
3173 if (context->private_tmp && runtime && runtime->shared) {
3174 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3175 tmp_dir = runtime->shared->tmp_dir;
3176 else if (runtime->shared->tmp_dir)
3177 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
56a13a49 3178
28135da3
DDM
3179 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3180 var_tmp_dir = runtime->shared->var_tmp_dir;
3181 else if (runtime->shared->var_tmp_dir)
3182 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
ecf63c91
NJ
3183 }
3184
b5a33299
YW
3185 ns_info = (NamespaceInfo) {
3186 .ignore_protect_paths = false,
3187 .private_dev = context->private_devices,
3188 .protect_control_groups = context->protect_control_groups,
3189 .protect_kernel_tunables = context->protect_kernel_tunables,
3190 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3191 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3192 .protect_hostname = context->protect_hostname,
5e98086d 3193 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
52b3d652
LP
3194 .protect_home = context->protect_home,
3195 .protect_system = context->protect_system,
4e399953
LP
3196 .protect_proc = context->protect_proc,
3197 .proc_subset = context->proc_subset,
c2da3bf2 3198 .private_network = exec_needs_network_namespace(context),
fde36d25 3199 .private_ipc = exec_needs_ipc_namespace(context),
6720e356 3200 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3201 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3202 };
ecf63c91 3203 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3204 /*
3205 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3206 * sandbox info, otherwise enforce it, don't ignore protected paths and
3207 * fail if we are enable to apply the sandbox inside the mount namespace.
3208 */
3209 ns_info = (NamespaceInfo) {
3210 .ignore_protect_paths = true,
3211 };
3212 else
3213 ns_info = (NamespaceInfo) {};
b5a33299 3214
663e2756
LB
3215 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
3216 setup_os_release_symlink = ns_info.mount_apivfs && (root_dir || root_image);
3217 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3218 if (r < 0)
3219 return r;
3220
874cdcbc 3221 if (context->mount_propagation_flag == MS_SHARED)
37ed15d7
FB
3222 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3223
133e4de2
YW
3224 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3225 r = exec_context_get_credential_directory(context, params, u->id, &creds_path);
3226 if (r < 0)
3227 return r;
73ff4d48
YW
3228 }
3229
170d978b 3230 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
5e8deb94 3231 propagate_dir = path_join("/run/systemd/propagate/", u->id);
29933daf
DT
3232 if (!propagate_dir)
3233 return -ENOMEM;
f2550b98 3234
5e8deb94 3235 incoming_dir = strdup("/run/systemd/incoming");
29933daf
DT
3236 if (!incoming_dir)
3237 return -ENOMEM;
24759d8f
LB
3238
3239 extension_dir = strdup("/run/systemd/unit-extensions");
29933daf
DT
3240 if (!extension_dir)
3241 return -ENOMEM;
3f37a825
LB
3242
3243 /* If running under a different root filesystem, propagate the host's os-release. We make a
3244 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
663e2756
LB
3245 if (setup_os_release_symlink) {
3246 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3247 if (!host_os_release_stage)
3f37a825
LB
3248 return -ENOMEM;
3249 }
170d978b
LP
3250 } else {
3251 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3252
29933daf
DT
3253 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3254 return -ENOMEM;
3f37a825 3255
663e2756
LB
3256 if (setup_os_release_symlink) {
3257 if (asprintf(&host_os_release_stage,
3258 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3259 geteuid()) < 0)
3f37a825
LB
3260 return -ENOMEM;
3261 }
170d978b 3262 }
5e8deb94 3263
66130f0a
DDM
3264 if (root_image) {
3265 r = verity_settings_prepare(
3266 &verity,
3267 root_image,
3268 context->root_hash, context->root_hash_size, context->root_hash_path,
3269 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3270 context->root_verity);
3271 if (r < 0)
3272 return r;
3273 }
3274
84be0c71
LP
3275 r = setup_namespace(
3276 root_dir,
3277 root_image,
3278 context->root_image_options,
3279 context->root_image_policy ?: &image_policy_service,
3280 &ns_info,
3281 read_write_paths,
3282 needs_sandboxing ? context->read_only_paths : NULL,
3283 needs_sandboxing ? context->inaccessible_paths : NULL,
3284 needs_sandboxing ? context->exec_paths : NULL,
3285 needs_sandboxing ? context->no_exec_paths : NULL,
3286 empty_directories,
3287 symlinks,
3288 bind_mounts,
3289 n_bind_mounts,
3290 context->temporary_filesystems,
3291 context->n_temporary_filesystems,
3292 context->mount_images,
3293 context->n_mount_images,
3294 context->mount_image_policy ?: &image_policy_service,
3295 tmp_dir,
3296 var_tmp_dir,
3297 creds_path,
3298 context->log_namespace,
3299 context->mount_propagation_flag,
66130f0a 3300 &verity,
84be0c71
LP
3301 context->extension_images,
3302 context->n_extension_images,
3303 context->extension_image_policy ?: &image_policy_sysext,
3304 context->extension_directories,
3305 propagate_dir,
3306 incoming_dir,
3307 extension_dir,
3308 root_dir || root_image ? params->notify_socket : NULL,
663e2756 3309 host_os_release_stage,
84be0c71 3310 error_path);
93c6bb51 3311
1beab8b0 3312 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3313 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3314 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3315 * completely different execution environment. */
aca835ed 3316 if (r == -ENOANO) {
4e677599
LP
3317 if (insist_on_sandboxing(
3318 context,
3319 root_dir, root_image,
3320 bind_mounts,
29933daf
DT
3321 n_bind_mounts))
3322 return log_unit_debug_errno(u,
3323 SYNTHETIC_ERRNO(EOPNOTSUPP),
3324 "Failed to set up namespace, and refusing to continue since "
3325 "the selected namespacing options alter mount environment non-trivially.\n"
3326 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3327 n_bind_mounts,
3328 context->n_temporary_filesystems,
3329 yes_no(root_dir),
3330 yes_no(root_image),
3331 yes_no(context->dynamic_user));
3332
3333 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3334 return 0;
93c6bb51
DH
3335 }
3336
3337 return r;
3338}
3339
915e6d16
LP
3340static int apply_working_directory(
3341 const ExecContext *context,
3342 const ExecParameters *params,
9c0c6701 3343 ExecRuntime *runtime,
915e6d16 3344 const char *home,
376fecf6 3345 int *exit_status) {
915e6d16 3346
6732edab 3347 const char *d, *wd;
2b3c1b9e
DH
3348
3349 assert(context);
376fecf6 3350 assert(exit_status);
2b3c1b9e 3351
6732edab
LP
3352 if (context->working_directory_home) {
3353
376fecf6
LP
3354 if (!home) {
3355 *exit_status = EXIT_CHDIR;
6732edab 3356 return -ENXIO;
376fecf6 3357 }
6732edab 3358
2b3c1b9e 3359 wd = home;
6732edab 3360
14eb3285
LP
3361 } else
3362 wd = empty_to_root(context->working_directory);
e7f1e7c6 3363
fa97f630 3364 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3365 d = wd;
fa97f630 3366 else
9c0c6701 3367 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
e7f1e7c6 3368
376fecf6
LP
3369 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3370 *exit_status = EXIT_CHDIR;
2b3c1b9e 3371 return -errno;
376fecf6 3372 }
e7f1e7c6
DH
3373
3374 return 0;
3375}
3376
fa97f630
JB
3377static int apply_root_directory(
3378 const ExecContext *context,
3379 const ExecParameters *params,
9c0c6701 3380 ExecRuntime *runtime,
fa97f630
JB
3381 const bool needs_mount_ns,
3382 int *exit_status) {
3383
3384 assert(context);
3385 assert(exit_status);
3386
5b10116e 3387 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630 3388 if (!needs_mount_ns && context->root_directory)
9c0c6701 3389 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
fa97f630
JB
3390 *exit_status = EXIT_CHROOT;
3391 return -errno;
3392 }
fa97f630
JB
3393
3394 return 0;
3395}
3396
b1edf445 3397static int setup_keyring(
34cf6c43 3398 const Unit *u,
b1edf445
LP
3399 const ExecContext *context,
3400 const ExecParameters *p,
3401 uid_t uid, gid_t gid) {
3402
74dd6b51 3403 key_serial_t keyring;
e64c2d0b
DJL
3404 int r = 0;
3405 uid_t saved_uid;
3406 gid_t saved_gid;
74dd6b51
LP
3407
3408 assert(u);
b1edf445 3409 assert(context);
74dd6b51
LP
3410 assert(p);
3411
3412 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3413 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3414 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3415 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3416 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3417 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3418
b1edf445
LP
3419 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3420 return 0;
3421
e64c2d0b
DJL
3422 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3423 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3424 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3425 * & group is just as nasty as acquiring a reference to the user keyring. */
3426
3427 saved_uid = getuid();
3428 saved_gid = getgid();
3429
3430 if (gid_is_valid(gid) && gid != saved_gid) {
3431 if (setregid(gid, -1) < 0)
3432 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3433 }
3434
3435 if (uid_is_valid(uid) && uid != saved_uid) {
3436 if (setreuid(uid, -1) < 0) {
3437 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3438 goto out;
3439 }
3440 }
3441
74dd6b51
LP
3442 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3443 if (keyring == -1) {
3444 if (errno == ENOSYS)
8002fb97 3445 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3446 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3447 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3448 else if (errno == EDQUOT)
8002fb97 3449 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3450 else
e64c2d0b 3451 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3452
e64c2d0b 3453 goto out;
74dd6b51
LP
3454 }
3455
e64c2d0b
DJL
3456 /* When requested link the user keyring into the session keyring. */
3457 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3458
3459 if (keyctl(KEYCTL_LINK,
3460 KEY_SPEC_USER_KEYRING,
3461 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3462 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3463 goto out;
3464 }
3465 }
3466
3467 /* Restore uid/gid back */
3468 if (uid_is_valid(uid) && uid != saved_uid) {
3469 if (setreuid(saved_uid, -1) < 0) {
3470 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3471 goto out;
3472 }
3473 }
3474
3475 if (gid_is_valid(gid) && gid != saved_gid) {
3476 if (setregid(saved_gid, -1) < 0)
3477 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3478 }
3479
3480 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3481 if (!sd_id128_is_null(u->invocation_id)) {
3482 key_serial_t key;
3483
3484 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3485 if (key == -1)
8002fb97 3486 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3487 else {
3488 if (keyctl(KEYCTL_SETPERM, key,
3489 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3490 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3491 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3492 }
3493 }
3494
e64c2d0b 3495out:
37b22b3b 3496 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3497 /* no extra logging, as only the first already reported error matters */
3498 if (getuid() != saved_uid)
3499 (void) setreuid(saved_uid, -1);
b1edf445 3500
e64c2d0b
DJL
3501 if (getgid() != saved_gid)
3502 (void) setregid(saved_gid, -1);
b1edf445 3503
e64c2d0b 3504 return r;
74dd6b51
LP
3505}
3506
3042bbeb 3507static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
29206d46
LP
3508 assert(array);
3509 assert(n);
2caa38e9 3510 assert(pair);
29206d46
LP
3511
3512 if (pair[0] >= 0)
3513 array[(*n)++] = pair[0];
3514 if (pair[1] >= 0)
3515 array[(*n)++] = pair[1];
3516}
3517
a34ceba6
LP
3518static int close_remaining_fds(
3519 const ExecParameters *params,
28135da3 3520 const ExecRuntime *runtime,
00d9ef85 3521 int user_lookup_fd,
a34ceba6 3522 int socket_fd,
5b8d1f6b 3523 const int *fds, size_t n_fds) {
a34ceba6 3524
da6053d0 3525 size_t n_dont_close = 0;
9c0c6701 3526 int dont_close[n_fds + 14];
a34ceba6
LP
3527
3528 assert(params);
3529
3530 if (params->stdin_fd >= 0)
3531 dont_close[n_dont_close++] = params->stdin_fd;
3532 if (params->stdout_fd >= 0)
3533 dont_close[n_dont_close++] = params->stdout_fd;
3534 if (params->stderr_fd >= 0)
3535 dont_close[n_dont_close++] = params->stderr_fd;
3536
3537 if (socket_fd >= 0)
3538 dont_close[n_dont_close++] = socket_fd;
3539 if (n_fds > 0) {
3540 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3541 n_dont_close += n_fds;
3542 }
3543
9c0c6701
DDM
3544 if (runtime)
3545 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3546
28135da3
DDM
3547 if (runtime && runtime->shared) {
3548 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3549 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
a70581ff 3550 }
29206d46 3551
15220772
DDM
3552 if (runtime && runtime->dynamic_creds) {
3553 if (runtime->dynamic_creds->user)
3554 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3555 if (runtime->dynamic_creds->group)
3556 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
a34ceba6
LP
3557 }
3558
00d9ef85
LP
3559 if (user_lookup_fd >= 0)
3560 dont_close[n_dont_close++] = user_lookup_fd;
3561
a34ceba6
LP
3562 return close_all_fds(dont_close, n_dont_close);
3563}
3564
00d9ef85
LP
3565static int send_user_lookup(
3566 Unit *unit,
3567 int user_lookup_fd,
3568 uid_t uid,
3569 gid_t gid) {
3570
3571 assert(unit);
3572
3573 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3574 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3575 * specified. */
3576
3577 if (user_lookup_fd < 0)
3578 return 0;
3579
3580 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3581 return 0;
3582
3583 if (writev(user_lookup_fd,
3584 (struct iovec[]) {
ce16d177
YW
3585 IOVEC_MAKE(&uid, sizeof(uid)),
3586 IOVEC_MAKE(&gid, sizeof(gid)),
3587 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3588 return -errno;
3589
3590 return 0;
3591}
3592
6732edab
LP
3593static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3594 int r;
3595
3596 assert(c);
3597 assert(home);
3598 assert(buf);
3599
3600 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3601
3602 if (*home)
3603 return 0;
3604
3605 if (!c->working_directory_home)
3606 return 0;
3607
6732edab
LP
3608 r = get_home_dir(buf);
3609 if (r < 0)
3610 return r;
3611
3612 *home = *buf;
3613 return 1;
3614}
3615
da50b85a
LP
3616static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3617 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3618 int r;
3619
3620 assert(c);
3621 assert(p);
3622 assert(ret);
3623
3624 assert(c->dynamic_user);
3625
3626 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3627 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3628 * directories. */
3629
5b10116e 3630 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3631 if (t == EXEC_DIRECTORY_CONFIGURATION)
3632 continue;
3633
3634 if (!p->prefix[t])
3635 continue;
3636
211a3d87 3637 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3638 char *e;
3639
494d0247 3640 if (exec_directory_is_private(c, t))
211a3d87 3641 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3642 else
211a3d87 3643 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3644 if (!e)
3645 return -ENOMEM;
3646
3647 r = strv_consume(&list, e);
3648 if (r < 0)
3649 return r;
3650 }
3651 }
3652
ae2a15bc 3653 *ret = TAKE_PTR(list);
da50b85a
LP
3654
3655 return 0;
3656}
3657
a8b993dc
LP
3658static int exec_parameters_get_cgroup_path(
3659 const ExecParameters *params,
3660 const CGroupContext *c,
3661 char **ret) {
3662
3663 const char *subgroup = NULL;
78f93209
LP
3664 char *p;
3665
3666 assert(params);
3667 assert(ret);
3668
3669 if (!params->cgroup_path)
3670 return -EINVAL;
3671
3672 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3673 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3674 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3675 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3676 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3677 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3678 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3679 * flag, which is only passed for the former statements, not for the latter. */
3680
a8b993dc
LP
3681 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3682 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3683 subgroup = ".control";
3684 else
3685 subgroup = c->delegate_subgroup;
3686 }
3687
3688 if (subgroup)
3689 p = path_join(params->cgroup_path, subgroup);
78f93209
LP
3690 else
3691 p = strdup(params->cgroup_path);
3692 if (!p)
3693 return -ENOMEM;
3694
3695 *ret = p;
a8b993dc 3696 return !!subgroup;
78f93209
LP
3697}
3698
e2b2fb7f
MS
3699static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3700 _cleanup_(cpu_set_reset) CPUSet s = {};
3701 int r;
3702
3703 assert(c);
3704 assert(ret);
3705
3706 if (!c->numa_policy.nodes.set) {
3707 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3708 return 0;
3709 }
3710
3711 r = numa_to_cpu_set(&c->numa_policy, &s);
3712 if (r < 0)
3713 return r;
3714
3715 cpu_set_reset(ret);
3716
3717 return cpu_set_add_all(ret, &s);
3718}
3719
3720bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3721 assert(c);
3722
3723 return c->cpu_affinity_from_numa;
3724}
3725
1da37e58
ZJS
3726static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3727 int r;
3728
3729 assert(fds);
3730 assert(n_fds);
3731 assert(*n_fds < fds_size);
3732 assert(ret_fd);
3733
3734 if (fd < 0) {
254d1313 3735 *ret_fd = -EBADF;
1da37e58
ZJS
3736 return 0;
3737 }
3738
3739 if (fd < 3 + (int) *n_fds) {
3740 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3741 * the fds we pass to the process (or which are closed only during execve). */
3742
3743 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3744 if (r < 0)
3745 return -errno;
3746
ee3455cf 3747 close_and_replace(fd, r);
1da37e58
ZJS
3748 }
3749
3750 *ret_fd = fds[*n_fds] = fd;
3751 (*n_fds) ++;
3752 return 1;
3753}
3754
cd48e23f
RP
3755static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3756 union sockaddr_union addr = {
3757 .un.sun_family = AF_UNIX,
3758 };
3759 socklen_t sa_len;
3760 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3761 int r;
3762
3763 assert(u);
3764 assert(of);
3765 assert(ofd >= 0);
3766
3767 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3768 if (r < 0)
3769 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3770
3771 sa_len = r;
3772
3773 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3774 _cleanup_close_ int fd = -EBADF;
3775
3776 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3777 if (fd < 0)
3778 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3779
3780 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3781 if (r == -EPROTOTYPE)
3782 continue;
3783 if (r < 0)
3784 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3785
3786 return TAKE_FD(fd);
3787 }
3788
3789 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3790}
3791
3792static int get_open_file_fd(Unit *u, const OpenFile *of) {
3793 struct stat st;
3794 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3795
3796 assert(u);
3797 assert(of);
3798
3799 ofd = open(of->path, O_PATH | O_CLOEXEC);
3800 if (ofd < 0)
dcebb015
DDM
3801 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3802
cd48e23f 3803 if (fstat(ofd, &st) < 0)
dcebb015 3804 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
cd48e23f
RP
3805
3806 if (S_ISSOCK(st.st_mode)) {
3807 fd = connect_unix_harder(u, of, ofd);
3808 if (fd < 0)
3809 return fd;
3810
3811 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
dcebb015
DDM
3812 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3813 of->path);
cd48e23f
RP
3814
3815 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3816 } else {
3817 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3818 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3819 flags |= O_APPEND;
3820 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3821 flags |= O_TRUNC;
3822
3823 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3824 if (fd < 0)
3825 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3826
3827 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3828 }
3829
3830 return TAKE_FD(fd);
3831}
3832
3833static int collect_open_file_fds(
3834 Unit *u,
3835 OpenFile* open_files,
3836 int **fds,
3837 char ***fdnames,
3838 size_t *n_fds) {
3839 int r;
3840
3841 assert(u);
3842 assert(fds);
3843 assert(fdnames);
3844 assert(n_fds);
3845
3846 LIST_FOREACH(open_files, of, open_files) {
3847 _cleanup_close_ int fd = -EBADF;
3848
3849 fd = get_open_file_fd(u, of);
3850 if (fd < 0) {
3851 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3852 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3853 continue;
3854 }
3855
3856 return fd;
3857 }
3858
3859 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3860 return -ENOMEM;
3861
3862 r = strv_extend(fdnames, of->fdname);
3863 if (r < 0)
3864 return r;
3865
3866 (*fds)[*n_fds] = TAKE_FD(fd);
3867
3868 (*n_fds)++;
3869 }
3870
3871 return 0;
3872}
3873
3ff67ec4
ZJS
3874static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3875 assert(unit);
3876 assert(msg);
3877 assert(executable);
3878
3879 if (!DEBUG_LOGGING)
3880 return;
3881
3882 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3883
3884 log_unit_struct(unit, LOG_DEBUG,
3885 "EXECUTABLE=%s", executable,
3886 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3887 LOG_UNIT_INVOCATION_ID(unit));
3888}
3889
170d978b
LP
3890static bool exec_context_need_unprivileged_private_users(
3891 const ExecContext *context,
3892 const ExecParameters *params) {
3893
6ef721cb 3894 assert(context);
170d978b 3895 assert(params);
6ef721cb
LB
3896
3897 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3898 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3899 * (system manager) then we have privileges and don't need this. */
170d978b 3900 if (params->runtime_scope != RUNTIME_SCOPE_USER)
6ef721cb
LB
3901 return false;
3902
3903 return context->private_users ||
3904 context->private_tmp ||
3905 context->private_devices ||
3906 context->private_network ||
3907 context->network_namespace_path ||
3908 context->private_ipc ||
3909 context->ipc_namespace_path ||
adeff822 3910 context->private_mounts > 0 ||
6ef721cb
LB
3911 context->mount_apivfs ||
3912 context->n_bind_mounts > 0 ||
3913 context->n_temporary_filesystems > 0 ||
3914 context->root_directory ||
3915 !strv_isempty(context->extension_directories) ||
3916 context->protect_system != PROTECT_SYSTEM_NO ||
3917 context->protect_home != PROTECT_HOME_NO ||
3918 context->protect_kernel_tunables ||
3919 context->protect_kernel_modules ||
3920 context->protect_kernel_logs ||
3921 context->protect_control_groups ||
3922 context->protect_clock ||
3923 context->protect_hostname ||
3924 !strv_isempty(context->read_write_paths) ||
3925 !strv_isempty(context->read_only_paths) ||
3926 !strv_isempty(context->inaccessible_paths) ||
3927 !strv_isempty(context->exec_paths) ||
3928 !strv_isempty(context->no_exec_paths);
3929}
3930
ff0af2a1 3931static int exec_child(
f2341e0a 3932 Unit *unit,
34cf6c43 3933 const ExecCommand *command,
ff0af2a1
LP
3934 const ExecContext *context,
3935 const ExecParameters *params,
28135da3 3936 ExecRuntime *runtime,
6bb00842 3937 const CGroupContext *cgroup_context,
ff0af2a1 3938 int socket_fd,
2caa38e9 3939 const int named_iofds[static 3],
cd48e23f 3940 int *params_fds,
da6053d0 3941 size_t n_socket_fds,
25b583d7 3942 size_t n_storage_fds,
ff0af2a1 3943 char **files_env,
00d9ef85 3944 int user_lookup_fd,
12145637 3945 int *exit_status) {
d35fbf6b 3946
8c35c10d 3947 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3948 int r, ngids = 0, exec_fd;
4d885bd3
DH
3949 _cleanup_free_ gid_t *supplementary_gids = NULL;
3950 const char *username = NULL, *groupname = NULL;
73ff4d48 3951 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
2b3c1b9e 3952 const char *home = NULL, *shell = NULL;
7ca69792 3953 char **final_argv = NULL;
7bce046b
LP
3954 dev_t journal_stream_dev = 0;
3955 ino_t journal_stream_ino = 0;
5749f855 3956 bool userns_set_up = false;
165a31c0
LP
3957 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3958 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3959 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3960 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3961#if HAVE_SELINUX
7f59dd35 3962 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3963 bool use_selinux = false;
ecfbc84f 3964#endif
f9fa32f0 3965#if ENABLE_SMACK
43b1f709 3966 bool use_smack = false;
ecfbc84f 3967#endif
349cc4a5 3968#if HAVE_APPARMOR
43b1f709 3969 bool use_apparmor = false;
ecfbc84f 3970#endif
5749f855
AZ
3971 uid_t saved_uid = getuid();
3972 gid_t saved_gid = getgid();
fed1e721
LP
3973 uid_t uid = UID_INVALID;
3974 gid_t gid = GID_INVALID;
1da37e58
ZJS
3975 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3976 n_keep_fds; /* total number of fds not to close */
165a31c0 3977 int secure_bits;
afb11bf1
DG
3978 _cleanup_free_ gid_t *gids_after_pam = NULL;
3979 int ngids_after_pam = 0;
cd48e23f
RP
3980 _cleanup_free_ int *fds = NULL;
3981 _cleanup_strv_free_ char **fdnames = NULL;
034c6ed7 3982
f2341e0a 3983 assert(unit);
5cb5a6ff
LP
3984 assert(command);
3985 assert(context);
d35fbf6b 3986 assert(params);
ff0af2a1 3987 assert(exit_status);
d35fbf6b 3988
69339ae9
LP
3989 /* Explicitly test for CVE-2021-4034 inspired invocations */
3990 assert(command->path);
3991 assert(!strv_isempty(command->argv));
3992
d35fbf6b
DM
3993 rename_process_from_path(command->path);
3994
9c274488
LP
3995 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3996 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3997 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3998 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3999 SIGNALS_IGNORE);
d35fbf6b
DM
4000
4001 if (context->ignore_sigpipe)
9c274488 4002 (void) ignore_signals(SIGPIPE);
d35fbf6b 4003
ff0af2a1
LP
4004 r = reset_signal_mask();
4005 if (r < 0) {
4006 *exit_status = EXIT_SIGNAL_MASK;
12145637 4007 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4008 }
034c6ed7 4009
d35fbf6b
DM
4010 if (params->idle_pipe)
4011 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4012
2c027c62
LP
4013 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4014 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4015 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4016 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4017
d35fbf6b 4018 log_forget_fds();
2c027c62 4019 log_set_open_when_needed(true);
a3b00f91 4020 log_settle_target();
4f2d528d 4021
40a80078
LP
4022 /* In case anything used libc syslog(), close this here, too */
4023 closelog();
4024
cd48e23f
RP
4025 fds = newdup(int, params_fds, n_fds);
4026 if (!fds) {
4027 *exit_status = EXIT_MEMORY;
4028 return log_oom();
4029 }
4030
4031 fdnames = strv_copy((char**) params->fd_names);
4032 if (!fdnames) {
4033 *exit_status = EXIT_MEMORY;
4034 return log_oom();
4035 }
4036
4037 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4038 if (r < 0) {
4039 *exit_status = EXIT_FDS;
4040 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4041 }
4042
b1994387 4043 int keep_fds[n_fds + 3];
1da37e58
ZJS
4044 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4045 n_keep_fds = n_fds;
4046
4047 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4048 if (r < 0) {
4049 *exit_status = EXIT_FDS;
4050 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4051 }
4052
b1994387 4053#if HAVE_LIBBPF
46004616
ZJS
4054 if (unit->manager->restrict_fs) {
4055 int bpf_map_fd = lsm_bpf_map_restrict_fs_fd(unit);
b1994387
ILG
4056 if (bpf_map_fd < 0) {
4057 *exit_status = EXIT_FDS;
46004616 4058 return log_unit_error_errno(unit, bpf_map_fd, "Failed to get restrict filesystems BPF map fd: %m");
b1994387
ILG
4059 }
4060
4061 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, bpf_map_fd, &bpf_map_fd);
4062 if (r < 0) {
4063 *exit_status = EXIT_FDS;
4064 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4065 }
4066 }
4067#endif
4068
15220772 4069 r = close_remaining_fds(params, runtime, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4070 if (r < 0) {
4071 *exit_status = EXIT_FDS;
12145637 4072 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4073 }
4074
0af07108
ZJS
4075 if (!context->same_pgrp &&
4076 setsid() < 0) {
4077 *exit_status = EXIT_SETSID;
4078 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4079 }
9e2f7c11 4080
1e22b5cd 4081 exec_context_tty_reset(context, params);
d35fbf6b 4082
c891efaf 4083 if (unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4084 _cleanup_free_ char *cmdline = NULL;
4085
4ef15008 4086 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4087 if (!cmdline) {
0460aa5c 4088 *exit_status = EXIT_MEMORY;
12145637 4089 return log_oom();
3b20f877 4090 }
d35fbf6b 4091
4ef15008 4092 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4093 if (r != CONFIRM_EXECUTE) {
4094 if (r == CONFIRM_PRETEND_SUCCESS) {
4095 *exit_status = EXIT_SUCCESS;
4096 return 0;
4097 }
5fa01ac0 4098
ff0af2a1 4099 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4100 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4101 "Execution cancelled by the user");
d35fbf6b
DM
4102 }
4103 }
1a63a750 4104
d521916d
LP
4105 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4106 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4107 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4108 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4109 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4110 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
170d978b 4111 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
d521916d
LP
4112 *exit_status = EXIT_MEMORY;
4113 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4114 }
4115
15220772 4116 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
da50b85a 4117 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4118
d521916d 4119 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4120 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4121 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4122 *exit_status = EXIT_USER;
12145637 4123 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4124 }
4125
da50b85a
LP
4126 r = compile_suggested_paths(context, params, &suggested_paths);
4127 if (r < 0) {
4128 *exit_status = EXIT_MEMORY;
4129 return log_oom();
4130 }
4131
15220772 4132 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4133 if (r < 0) {
4134 *exit_status = EXIT_USER;
d85ff944
YW
4135 if (r == -EILSEQ)
4136 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4137 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4138 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4139 }
524daa8c 4140
70dd455c 4141 if (!uid_is_valid(uid)) {
29206d46 4142 *exit_status = EXIT_USER;
d85ff944 4143 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4144 }
4145
4146 if (!gid_is_valid(gid)) {
4147 *exit_status = EXIT_USER;
d85ff944 4148 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4149 }
5bc7452b 4150
15220772
DDM
4151 if (runtime->dynamic_creds->user)
4152 username = runtime->dynamic_creds->user->name;
29206d46
LP
4153
4154 } else {
4d885bd3
DH
4155 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
4156 if (r < 0) {
4157 *exit_status = EXIT_USER;
12145637 4158 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 4159 }
5bc7452b 4160
4d885bd3
DH
4161 r = get_fixed_group(context, &groupname, &gid);
4162 if (r < 0) {
4163 *exit_status = EXIT_GROUP;
12145637 4164 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 4165 }
cdc5d5c5 4166 }
29206d46 4167
cdc5d5c5
DH
4168 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4169 r = get_supplementary_groups(context, username, groupname, gid,
4170 &supplementary_gids, &ngids);
4171 if (r < 0) {
4172 *exit_status = EXIT_GROUP;
12145637 4173 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4174 }
5bc7452b 4175
00d9ef85
LP
4176 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
4177 if (r < 0) {
4178 *exit_status = EXIT_USER;
12145637 4179 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4180 }
4181
4182 user_lookup_fd = safe_close(user_lookup_fd);
4183
6732edab
LP
4184 r = acquire_home(context, uid, &home, &home_buffer);
4185 if (r < 0) {
4186 *exit_status = EXIT_CHDIR;
12145637 4187 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4188 }
4189
4a055e5a 4190 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
d35fbf6b 4191 if (socket_fd >= 0)
a34ceba6 4192 (void) fd_nonblock(socket_fd, false);
acbb0225 4193
4c70a4a7
MS
4194 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4195 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4196 if (params->cgroup_path) {
4197 _cleanup_free_ char *p = NULL;
4198
a8b993dc 4199 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4c70a4a7
MS
4200 if (r < 0) {
4201 *exit_status = EXIT_CGROUP;
4202 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4203 }
4204
4205 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4206 if (r == -EUCLEAN) {
4207 *exit_status = EXIT_CGROUP;
4208 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4209 "because the cgroup or one of its parents or "
4210 "siblings is in the threaded mode: %m", p);
4211 }
4c70a4a7
MS
4212 if (r < 0) {
4213 *exit_status = EXIT_CGROUP;
4214 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4215 }
4216 }
4217
28135da3
DDM
4218 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4219 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4220 if (r < 0) {
4221 *exit_status = EXIT_NETWORK;
4222 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4223 }
4224 }
4225
28135da3
DDM
4226 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4227 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
a70581ff
XR
4228 if (r < 0) {
4229 *exit_status = EXIT_NAMESPACE;
4230 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4231 }
4232 }
4233
52c239d7 4234 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4235 if (r < 0) {
4236 *exit_status = EXIT_STDIN;
12145637 4237 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4238 }
034c6ed7 4239
52c239d7 4240 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4241 if (r < 0) {
4242 *exit_status = EXIT_STDOUT;
12145637 4243 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4244 }
4245
52c239d7 4246 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4247 if (r < 0) {
4248 *exit_status = EXIT_STDERR;
12145637 4249 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4250 }
4251
d35fbf6b 4252 if (context->oom_score_adjust_set) {
bb44fd07
ZJS
4253 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4254 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
9f8168eb 4255 r = set_oom_score_adjust(context->oom_score_adjust);
bb44fd07
ZJS
4256 if (ERRNO_IS_NEG_PRIVILEGE(r))
4257 log_unit_debug_errno(unit, r,
4258 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4259 else if (r < 0) {
4260 *exit_status = EXIT_OOM_ADJUST;
4261 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4262 }
d35fbf6b
DM
4263 }
4264
ad21e542
ZJS
4265 if (context->coredump_filter_set) {
4266 r = set_coredump_filter(context->coredump_filter);
bb44fd07
ZJS
4267 if (ERRNO_IS_NEG_PRIVILEGE(r))
4268 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5fa01ac0
ZJS
4269 else if (r < 0) {
4270 *exit_status = EXIT_LIMITS;
bb44fd07 4271 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5cf4c468 4272 }
ad21e542
ZJS
4273 }
4274
39090201
DJL
4275 if (context->nice_set) {
4276 r = setpriority_closest(context->nice);
5fa01ac0
ZJS
4277 if (r < 0) {
4278 *exit_status = EXIT_NICE;
39090201 4279 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5fa01ac0 4280 }
39090201 4281 }
613b411c 4282
d35fbf6b
DM
4283 if (context->cpu_sched_set) {
4284 struct sched_param param = {
4285 .sched_priority = context->cpu_sched_priority,
4286 };
4287
ff0af2a1
LP
4288 r = sched_setscheduler(0,
4289 context->cpu_sched_policy |
4290 (context->cpu_sched_reset_on_fork ?
4291 SCHED_RESET_ON_FORK : 0),
4292 &param);
4293 if (r < 0) {
4294 *exit_status = EXIT_SETSCHEDULER;
12145637 4295 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4296 }
d35fbf6b 4297 }
fc9b2a84 4298
e2b2fb7f
MS
4299 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4300 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4301 const CPUSet *cpu_set;
4302
4303 if (context->cpu_affinity_from_numa) {
4304 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4305 if (r < 0) {
4306 *exit_status = EXIT_CPUAFFINITY;
4307 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4308 }
4309
4310 cpu_set = &converted_cpu_set;
4311 } else
4312 cpu_set = &context->cpu_set;
4313
4314 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4315 *exit_status = EXIT_CPUAFFINITY;
12145637 4316 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4317 }
e2b2fb7f 4318 }
034c6ed7 4319
b070c7c0
MS
4320 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4321 r = apply_numa_policy(&context->numa_policy);
bb44fd07
ZJS
4322 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4323 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4324 else if (r < 0) {
4325 *exit_status = EXIT_NUMA_POLICY;
4326 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
b070c7c0
MS
4327 }
4328 }
4329
d35fbf6b
DM
4330 if (context->ioprio_set)
4331 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4332 *exit_status = EXIT_IOPRIO;
12145637 4333 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4334 }
da726a4d 4335
d35fbf6b
DM
4336 if (context->timer_slack_nsec != NSEC_INFINITY)
4337 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4338 *exit_status = EXIT_TIMERSLACK;
12145637 4339 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4340 }
9eba9da4 4341
21022b9d
LP
4342 if (context->personality != PERSONALITY_INVALID) {
4343 r = safe_personality(context->personality);
4344 if (r < 0) {
ff0af2a1 4345 *exit_status = EXIT_PERSONALITY;
12145637 4346 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4347 }
21022b9d 4348 }
94f04347 4349
33331d11
VB
4350 if (context->utmp_id) {
4351 const char *line = context->tty_path ?
4352 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4353 NULL;
df0ff127 4354 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4355 line,
023a4f67
LP
4356 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4357 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4358 USER_PROCESS,
6a93917d 4359 username);
33331d11 4360 }
d35fbf6b 4361
08f67696 4362 if (uid_is_valid(uid)) {
ff0af2a1
LP
4363 r = chown_terminal(STDIN_FILENO, uid);
4364 if (r < 0) {
4365 *exit_status = EXIT_STDIN;
12145637 4366 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4367 }
d35fbf6b 4368 }
8e274523 4369
6bb00842
LP
4370 if (params->cgroup_path) {
4371 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4372 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4373 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4374 * touch a single hierarchy too. */
4375
4376 if (params->flags & EXEC_CGROUP_DELEGATE) {
a8b993dc
LP
4377 _cleanup_free_ char *p = NULL;
4378
6bb00842
LP
4379 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4380 if (r < 0) {
4381 *exit_status = EXIT_CGROUP;
4382 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4383 }
a8b993dc
LP
4384
4385 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4386 if (r < 0) {
4387 *exit_status = EXIT_CGROUP;
4388 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4389 }
4390 if (r > 0) {
bcd9b981 4391 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
a8b993dc
LP
4392 if (r < 0) {
4393 *exit_status = EXIT_CGROUP;
4394 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4395 }
4396 }
6bb00842
LP
4397 }
4398
4399 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4400 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4401 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4402 if (r < 0) {
4403 *exit_status = EXIT_MEMORY;
4404 return log_oom();
4405 }
4406
4407 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4408 if (r < 0) {
4409 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4410 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4411 memory_pressure_path = mfree(memory_pressure_path);
4412 }
4413 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4414 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4415 if (!memory_pressure_path) {
4416 *exit_status = EXIT_MEMORY;
4417 return log_oom();
4418 }
4419 }
034c6ed7 4420 }
d35fbf6b 4421 }
034c6ed7 4422
211a3d87
LB
4423 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4424
5b10116e 4425 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
59dd2bbb 4426 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4427 if (r < 0)
4428 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4429 }
94f04347 4430
bb0c0d6f 4431 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
43962c30 4432 r = exec_setup_credentials(context, params, unit->id, uid, gid);
bb0c0d6f
LP
4433 if (r < 0) {
4434 *exit_status = EXIT_CREDENTIALS;
4435 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4436 }
4437 }
4438
7bce046b 4439 r = build_environment(
fd63e712 4440 unit,
7bce046b
LP
4441 context,
4442 params,
6bb00842 4443 cgroup_context,
7bce046b 4444 n_fds,
cd48e23f 4445 fdnames,
7bce046b
LP
4446 home,
4447 username,
4448 shell,
4449 journal_stream_dev,
4450 journal_stream_ino,
6bb00842 4451 memory_pressure_path,
7bce046b 4452 &our_env);
2065ca69
JW
4453 if (r < 0) {
4454 *exit_status = EXIT_MEMORY;
12145637 4455 return log_oom();
2065ca69
JW
4456 }
4457
4458 r = build_pass_environment(context, &pass_env);
4459 if (r < 0) {
4460 *exit_status = EXIT_MEMORY;
12145637 4461 return log_oom();
2065ca69
JW
4462 }
4463
adf769b0
ZJS
4464 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4465 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4466 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4467 if (!strv_isempty(context->exec_search_path)) {
4468 _cleanup_free_ char *joined = NULL;
4469
4470 joined = strv_join(context->exec_search_path, ":");
4471 if (!joined) {
4472 *exit_status = EXIT_MEMORY;
4473 return log_oom();
4474 }
4475
4476 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4477 if (r < 0) {
4478 *exit_status = EXIT_MEMORY;
4479 return log_oom();
4480 }
4481 }
4482
4ab3d29f 4483 accum_env = strv_env_merge(params->environment,
2065ca69 4484 our_env,
8c35c10d 4485 joined_exec_search_path,
2065ca69
JW
4486 pass_env,
4487 context->environment,
44e5d006 4488 files_env);
2065ca69
JW
4489 if (!accum_env) {
4490 *exit_status = EXIT_MEMORY;
12145637 4491 return log_oom();
2065ca69 4492 }
1280503b 4493 accum_env = strv_env_clean(accum_env);
2065ca69 4494
096424d1 4495 (void) umask(context->umask);
b213e1c1 4496
b1edf445 4497 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4498 if (r < 0) {
4499 *exit_status = EXIT_KEYRING;
12145637 4500 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4501 }
4502
adf769b0
ZJS
4503 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4504 * from it. */
1703fa41 4505 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4506
adf769b0
ZJS
4507 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4508 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4509 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4510
adf769b0
ZJS
4511 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4512 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4513 * desired. */
165a31c0
LP
4514 if (needs_ambient_hack)
4515 needs_setuid = false;
4516 else
4517 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4518
638fd8cc
LP
4519 uint64_t capability_ambient_set = context->capability_ambient_set;
4520
165a31c0 4521 if (needs_sandboxing) {
adf769b0
ZJS
4522 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4523 * /sys being present. The actual MAC context application will happen later, as late as
4524 * possible, to avoid impacting our own code paths. */
7f18ef0a 4525
349cc4a5 4526#if HAVE_SELINUX
43b1f709 4527 use_selinux = mac_selinux_use();
7f18ef0a 4528#endif
f9fa32f0 4529#if ENABLE_SMACK
43b1f709 4530 use_smack = mac_smack_use();
7f18ef0a 4531#endif
349cc4a5 4532#if HAVE_APPARMOR
43b1f709 4533 use_apparmor = mac_apparmor_use();
7f18ef0a 4534#endif
165a31c0 4535 }
7f18ef0a 4536
ce932d2d
LP
4537 if (needs_sandboxing) {
4538 int which_failed;
4539
4540 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4541 * is set here. (See below.) */
4542
4543 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4544 if (r < 0) {
4545 *exit_status = EXIT_LIMITS;
4546 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4547 }
4548 }
4549
0af07108 4550 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4551 /* Let's call into PAM after we set up our own idea of resource limits to that pam_limits
4552 * wins here. (See above.) */
4553
1da37e58 4554 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4555 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4556 if (r < 0) {
4557 *exit_status = EXIT_PAM;
4558 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4559 }
ac45f971 4560
638fd8cc
LP
4561 if (ambient_capabilities_supported()) {
4562 uint64_t ambient_after_pam;
4563
4564 /* PAM modules might have set some ambient caps. Query them here and merge them into
4565 * the caps we want to set in the end, so that we don't end up unsetting them. */
4566 r = capability_get_ambient(&ambient_after_pam);
4567 if (r < 0) {
4568 *exit_status = EXIT_CAPABILITIES;
4569 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4570 }
4571
4572 capability_ambient_set |= ambient_after_pam;
4573 }
4574
0af07108
ZJS
4575 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4576 if (ngids_after_pam < 0) {
4577 *exit_status = EXIT_MEMORY;
4578 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4579 }
b213e1c1 4580 }
5749f855 4581
170d978b 4582 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5749f855
AZ
4583 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4584 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4585 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108 4586
0af07108 4587 r = setup_private_users(saved_uid, saved_gid, uid, gid);
6ef721cb
LB
4588 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4589 * the actual requested operations fail (or silently continue). */
4590 if (r < 0 && context->private_users) {
0af07108
ZJS
4591 *exit_status = EXIT_USER;
4592 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855 4593 }
6ef721cb
LB
4594 if (r < 0)
4595 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4596 else
4597 userns_set_up = true;
5749f855
AZ
4598 }
4599
28135da3 4600 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
a8d08f39 4601
5a3627e5
LP
4602 /* Try to enable network namespacing if network namespacing is available and we have
4603 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4604 * new network namespace. And if we don't have that, then we could only create a network
4605 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4606 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
28135da3 4607 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
bb44fd07
ZJS
4608 if (ERRNO_IS_NEG_PRIVILEGE(r))
4609 log_unit_notice_errno(unit, r,
4610 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4611 else if (r < 0) {
4612 *exit_status = EXIT_NETWORK;
4613 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
6e2d7c4f 4614 }
a8d08f39
LP
4615 } else if (context->network_namespace_path) {
4616 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4617 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4618 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f 4619 } else
5a3627e5 4620 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
d35fbf6b 4621 }
169c1bda 4622
28135da3 4623 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
a70581ff
XR
4624
4625 if (ns_type_supported(NAMESPACE_IPC)) {
28135da3 4626 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
a70581ff
XR
4627 if (r == -EPERM)
4628 log_unit_warning_errno(unit, r,
4629 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4630 else if (r < 0) {
4631 *exit_status = EXIT_NAMESPACE;
4632 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4633 }
4634 } else if (context->ipc_namespace_path) {
4635 *exit_status = EXIT_NAMESPACE;
4636 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4637 "IPCNamespacePath= is not supported, refusing.");
4638 } else
4639 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4640 }
4641
ee818b89 4642 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4643 _cleanup_free_ char *error_path = NULL;
4644
73ff4d48 4645 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
3fbe8dbe
LP
4646 if (r < 0) {
4647 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4648 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4649 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4650 }
d35fbf6b 4651 }
81a2b7ce 4652
daf8f72b
LP
4653 if (needs_sandboxing) {
4654 r = apply_protect_hostname(unit, context, exit_status);
4655 if (r < 0)
4656 return r;
aecd5ac6
TM
4657 }
4658
85614c6e
SR
4659 if (context->memory_ksm >= 0)
4660 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4661 if (ERRNO_IS_NOT_SUPPORTED(errno))
4662 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4663 else {
4664 *exit_status = EXIT_KSM;
4665 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4666 }
4667 }
4668
5749f855
AZ
4669 /* Drop groups as early as possible.
4670 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4671 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4672 if (needs_setuid) {
afb11bf1
DG
4673 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4674 int ngids_to_enforce = 0;
4675
4676 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4677 ngids,
4678 gids_after_pam,
4679 ngids_after_pam,
4680 &gids_to_enforce);
4681 if (ngids_to_enforce < 0) {
4682 *exit_status = EXIT_MEMORY;
4683 return log_unit_error_errno(unit,
4684 ngids_to_enforce,
4685 "Failed to merge group lists. Group membership might be incorrect: %m");
4686 }
4687
4688 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4689 if (r < 0) {
4690 *exit_status = EXIT_GROUP;
12145637 4691 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4692 }
165a31c0 4693 }
096424d1 4694
5749f855
AZ
4695 /* If the user namespace was not set up above, try to do it now.
4696 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
d09df6b9 4697 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5749f855
AZ
4698 * case of mount namespaces being less privileged when the mount point list is copied from a
4699 * different user namespace). */
9008e1ac 4700
5749f855
AZ
4701 if (needs_sandboxing && context->private_users && !userns_set_up) {
4702 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4703 if (r < 0) {
4704 *exit_status = EXIT_USER;
4705 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4706 }
4707 }
4708
9f71ba8d
ZJS
4709 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4710 * shall execute. */
4711
4712 _cleanup_free_ char *executable = NULL;
254d1313 4713 _cleanup_close_ int executable_fd = -EBADF;
8c35c10d 4714 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4715 if (r < 0) {
4716 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4717 log_unit_struct_errno(unit, LOG_INFO, r,
4718 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4719 LOG_UNIT_INVOCATION_ID(unit),
4720 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4721 command->path),
4722 "EXECUTABLE=%s", command->path);
5fa01ac0 4723 *exit_status = EXIT_SUCCESS;
9f71ba8d
ZJS
4724 return 0;
4725 }
4726
4727 *exit_status = EXIT_EXEC;
c2503e35
RH
4728 return log_unit_struct_errno(unit, LOG_INFO, r,
4729 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4730 LOG_UNIT_INVOCATION_ID(unit),
4731 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4732 command->path),
4733 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4734 }
4735
b83d5050
ZJS
4736 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4737 if (r < 0) {
4738 *exit_status = EXIT_FDS;
4739 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4740 }
4741
9f71ba8d 4742#if HAVE_SELINUX
49590d67 4743 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
254d1313 4744 int fd = -EBADF;
49590d67
MS
4745
4746 if (socket_fd >= 0)
4747 fd = socket_fd;
4748 else if (params->n_socket_fds == 1)
4749 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4750 * use context from that fd to compute the label. */
4751 fd = params->fds[0];
4752
4753 if (fd >= 0) {
4754 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4755 if (r < 0) {
4756 if (!context->selinux_context_ignore) {
4757 *exit_status = EXIT_SELINUX_CONTEXT;
4758 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4759 }
4760 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4761 }
9f71ba8d
ZJS
4762 }
4763 }
4764#endif
4765
4a055e5a
ZJS
4766 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4767 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4768 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4769 * execve(). */
5686391b 4770
1da37e58 4771 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4772 if (r >= 0)
4773 r = shift_fds(fds, n_fds);
4774 if (r >= 0)
cd48e23f 4775 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
ff0af2a1
LP
4776 if (r < 0) {
4777 *exit_status = EXIT_FDS;
12145637 4778 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4779 }
e66cf1a3 4780
5686391b
LP
4781 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4782 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4783 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4784 * came this far. */
4785
165a31c0 4786 secure_bits = context->secure_bits;
e66cf1a3 4787
165a31c0
LP
4788 if (needs_sandboxing) {
4789 uint64_t bset;
e66cf1a3 4790
4a055e5a
ZJS
4791 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4792 * (Note this is placed after the general resource limit initialization, see above, in order
4793 * to take precedence.) */
f4170c67
LP
4794 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4795 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4796 *exit_status = EXIT_LIMITS;
12145637 4797 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4798 }
4799 }
4800
37ac2744
JB
4801#if ENABLE_SMACK
4802 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4803 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4804 if (use_smack) {
aa5ae971 4805 r = setup_smack(unit->manager, context, executable_fd);
29ff6247 4806 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4807 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4808 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4809 }
4810 }
4811#endif
4812
165a31c0
LP
4813 bset = context->capability_bounding_set;
4814 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4815 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4816 * instead of us doing that */
4817 if (needs_ambient_hack)
4818 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4819 (UINT64_C(1) << CAP_SETUID) |
4820 (UINT64_C(1) << CAP_SETGID);
4821
4822 if (!cap_test_all(bset)) {
638fd8cc 4823 r = capability_bounding_set_drop(bset, /* right_now= */ false);
ff0af2a1
LP
4824 if (r < 0) {
4825 *exit_status = EXIT_CAPABILITIES;
12145637 4826 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4827 }
4c2630eb 4828 }
3b8bddde 4829
16fcb191
TK
4830 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4831 * keep-caps set.
a954b249
LP
4832 *
4833 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4834 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4835 * the ambient capabilities can be raised as they are present in the permitted and
4836 * inheritable set. However it is possible that someone wants to set ambient capabilities
4837 * without changing the user, so we also set the ambient capabilities here.
4838 *
4839 * The requested ambient capabilities are raised in the inheritable set if the second
4840 * argument is true. */
943800f4 4841 if (!needs_ambient_hack) {
638fd8cc 4842 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
755d4b67
IP
4843 if (r < 0) {
4844 *exit_status = EXIT_CAPABILITIES;
12145637 4845 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4846 }
755d4b67 4847 }
165a31c0 4848 }
755d4b67 4849
fa97f630 4850 /* chroot to root directory first, before we lose the ability to chroot */
9c0c6701 4851 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
fa97f630
JB
4852 if (r < 0)
4853 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4854
165a31c0 4855 if (needs_setuid) {
08f67696 4856 if (uid_is_valid(uid)) {
638fd8cc 4857 r = enforce_user(context, uid, capability_ambient_set);
ff0af2a1
LP
4858 if (r < 0) {
4859 *exit_status = EXIT_USER;
12145637 4860 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4861 }
165a31c0 4862
638fd8cc 4863 if (!needs_ambient_hack && capability_ambient_set != 0) {
755d4b67 4864
16fcb191 4865 /* Raise the ambient capabilities after user change. */
638fd8cc 4866 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
755d4b67
IP
4867 if (r < 0) {
4868 *exit_status = EXIT_CAPABILITIES;
12145637 4869 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4870 }
755d4b67 4871 }
5b6319dc 4872 }
165a31c0 4873 }
d35fbf6b 4874
56ef8db9
JB
4875 /* Apply working directory here, because the working directory might be on NFS and only the user running
4876 * this service might have the correct privilege to change to the working directory */
9c0c6701 4877 r = apply_working_directory(context, params, runtime, home, exit_status);
56ef8db9
JB
4878 if (r < 0)
4879 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4880
165a31c0 4881 if (needs_sandboxing) {
37ac2744 4882 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4883 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4884 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4885 * are restricted. */
4886
349cc4a5 4887#if HAVE_SELINUX
43b1f709 4888 if (use_selinux) {
5cd9cd35
LP
4889 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4890
4891 if (exec_context) {
4892 r = setexeccon(exec_context);
006d1864
TM
4893 if (r < 0) {
4894 if (!context->selinux_context_ignore) {
4895 *exit_status = EXIT_SELINUX_CONTEXT;
4896 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4897 }
4898 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4899 }
4900 }
4901 }
4902#endif
4903
349cc4a5 4904#if HAVE_APPARMOR
43b1f709 4905 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4906 r = aa_change_onexec(context->apparmor_profile);
4907 if (r < 0 && !context->apparmor_profile_ignore) {
4908 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4909 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4910 }
4911 }
4912#endif
4913
a954b249
LP
4914 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4915 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4916 * requires CAP_SETPCAP. */
dbdc4098 4917 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4918 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098 4919 * effective set here.
a954b249
LP
4920 *
4921 * The effective set is overwritten during execve() with the following values:
4922 *
dbdc4098 4923 * - ambient set (for non-root processes)
a954b249 4924 *
dbdc4098
TK
4925 * - (inheritable | bounding) set (for root processes)
4926 *
4927 * Hence there is no security impact to raise it in the effective set before execve
4928 */
a954b249 4929 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
dbdc4098
TK
4930 if (r < 0) {
4931 *exit_status = EXIT_CAPABILITIES;
4932 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4933 }
755d4b67 4934 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4935 *exit_status = EXIT_SECUREBITS;
12145637 4936 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4937 }
dbdc4098 4938 }
5b6319dc 4939
59eeb84b 4940 if (context_has_no_new_privileges(context))
d35fbf6b 4941 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4942 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4943 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4944 }
4945
349cc4a5 4946#if HAVE_SECCOMP
469830d1
LP
4947 r = apply_address_families(unit, context);
4948 if (r < 0) {
4949 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4950 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4951 }
04aa0cb9 4952
469830d1
LP
4953 r = apply_memory_deny_write_execute(unit, context);
4954 if (r < 0) {
4955 *exit_status = EXIT_SECCOMP;
12145637 4956 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4957 }
f4170c67 4958
469830d1
LP
4959 r = apply_restrict_realtime(unit, context);
4960 if (r < 0) {
4961 *exit_status = EXIT_SECCOMP;
12145637 4962 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4963 }
4964
f69567cb
LP
4965 r = apply_restrict_suid_sgid(unit, context);
4966 if (r < 0) {
4967 *exit_status = EXIT_SECCOMP;
4968 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4969 }
4970
add00535
LP
4971 r = apply_restrict_namespaces(unit, context);
4972 if (r < 0) {
4973 *exit_status = EXIT_SECCOMP;
12145637 4974 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4975 }
4976
469830d1
LP
4977 r = apply_protect_sysctl(unit, context);
4978 if (r < 0) {
4979 *exit_status = EXIT_SECCOMP;
12145637 4980 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4981 }
4982
469830d1
LP
4983 r = apply_protect_kernel_modules(unit, context);
4984 if (r < 0) {
4985 *exit_status = EXIT_SECCOMP;
12145637 4986 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4987 }
4988
84703040
KK
4989 r = apply_protect_kernel_logs(unit, context);
4990 if (r < 0) {
4991 *exit_status = EXIT_SECCOMP;
4992 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4993 }
4994
fc64760d
KK
4995 r = apply_protect_clock(unit, context);
4996 if (r < 0) {
4997 *exit_status = EXIT_SECCOMP;
4998 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4999 }
5000
469830d1
LP
5001 r = apply_private_devices(unit, context);
5002 if (r < 0) {
5003 *exit_status = EXIT_SECCOMP;
12145637 5004 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5005 }
5006
5007 r = apply_syscall_archs(unit, context);
5008 if (r < 0) {
5009 *exit_status = EXIT_SECCOMP;
12145637 5010 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5011 }
5012
78e864e5
TM
5013 r = apply_lock_personality(unit, context);
5014 if (r < 0) {
5015 *exit_status = EXIT_SECCOMP;
12145637 5016 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5017 }
5018
9df2cdd8
TM
5019 r = apply_syscall_log(unit, context);
5020 if (r < 0) {
5021 *exit_status = EXIT_SECCOMP;
5022 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5023 }
5024
5cd9cd35
LP
5025 /* This really should remain the last step before the execve(), to make sure our own code is affected
5026 * by the filter as little as possible. */
165a31c0 5027 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5028 if (r < 0) {
5029 *exit_status = EXIT_SECCOMP;
12145637 5030 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5031 }
5032#endif
b1994387
ILG
5033
5034#if HAVE_LIBBPF
5035 r = apply_restrict_filesystems(unit, context);
5036 if (r < 0) {
5037 *exit_status = EXIT_BPF;
5038 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5039 }
5040#endif
5041
d35fbf6b 5042 }
034c6ed7 5043
00819cc1
LP
5044 if (!strv_isempty(context->unset_environment)) {
5045 char **ee = NULL;
5046
5047 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5048 if (!ee) {
5049 *exit_status = EXIT_MEMORY;
12145637 5050 return log_oom();
00819cc1
LP
5051 }
5052
130d3d22 5053 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5054 }
5055
7ca69792 5056 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
f331434d
LP
5057 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5058
5059 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5060 if (r < 0) {
7ca69792 5061 *exit_status = EXIT_MEMORY;
f331434d 5062 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
7ca69792
AZ
5063 }
5064 final_argv = replaced_argv;
f331434d
LP
5065
5066 if (!strv_isempty(unset_variables)) {
5067 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5068 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5069 }
5070
5071 if (!strv_isempty(bad_variables)) {
5072 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5073 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5074 }
7ca69792
AZ
5075 } else
5076 final_argv = command->argv;
034c6ed7 5077
3ff67ec4 5078 log_command_line(unit, "Executing", executable, final_argv);
dd305ec9 5079
5686391b
LP
5080 if (exec_fd >= 0) {
5081 uint8_t hot = 1;
5082
5083 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5084 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5085
5086 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5087 *exit_status = EXIT_EXEC;
5088 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5089 }
5090 }
5091
a6d9111c 5092 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5093
5094 if (exec_fd >= 0) {
5095 uint8_t hot = 0;
5096
5097 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5098 * that POLLHUP on it no longer means execve() succeeded. */
5099
5100 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5101 *exit_status = EXIT_EXEC;
5102 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5103 }
5104 }
12145637 5105
ff0af2a1 5106 *exit_status = EXIT_EXEC;
9f71ba8d 5107 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5108}
81a2b7ce 5109
34cf6c43 5110static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 5111static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 5112
f2341e0a
LP
/* Forks off a child process that sets itself up and executes 'command' through exec_child().
 *
 * Before forking, the calling process selects which fds to forward, resolves the named stdio fds via
 * exec_context_named_iofds(), loads the environment files via exec_context_load_environment() and, if
 * params->cgroup_path is set, acquires (and when needed creates) the subcgroup path.
 *
 * unit, command, context, params and ret must be non-NULL (asserted). runtime and cgroup_context are
 * only passed on to the helpers and to exec_child().
 *
 * On success the parent stores the child's PID in *ret and returns 0. Any failure up to and including
 * fork() is logged with log_unit_error_errno() and that call's return value is returned (presumably the
 * negative errno-style code handed to it; confirm against the log macro). Failures inside the child are
 * not visible in the return value: the child logs them itself and always ends in _exit(). */
5113int exec_spawn(Unit *unit,
5114 ExecCommand *command,
d35fbf6b
DM
5115 const ExecContext *context,
5116 const ExecParameters *params,
28135da3 5117 ExecRuntime *runtime,
6bb00842 5118 const CGroupContext *cgroup_context,
d35fbf6b 5119 pid_t *ret) {
8351ceae 5120
ee39ca20 5121 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 5122 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5123 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 5124 size_t n_storage_fds = 0, n_socket_fds = 0;
d35fbf6b 5125 pid_t pid;
8351ceae 5126
f2341e0a 5127 assert(unit);
d35fbf6b
DM
5128 assert(command);
5129 assert(context);
5130 assert(ret);
5131 assert(params);
25b583d7 5132 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5133
4b2af439
DDM
5134 LOG_CONTEXT_PUSH_UNIT(unit);
5135
d35fbf6b
DM
/* If stdin, stdout or stderr is configured as a socket, exactly one socket fd must have been passed
 * in: it becomes socket_fd, while fds, n_socket_fds and n_storage_fds keep their NULL/0 initializers.
 * Otherwise there is no socket_fd and the complete params->fds array is forwarded to exec_child(). */
5136 if (context->std_input == EXEC_INPUT_SOCKET ||
5137 context->std_output == EXEC_OUTPUT_SOCKET ||
5138 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 5139
d85ff944
YW
5140 if (params->n_socket_fds > 1)
5141 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 5142
d85ff944
YW
5143 if (params->n_socket_fds == 0)
5144 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 5145
d35fbf6b
DM
5146 socket_fd = params->fds[0];
5147 } else {
254d1313 5148 socket_fd = -EBADF;
d35fbf6b 5149 fds = params->fds;
9b141911 5150 n_socket_fds = params->n_socket_fds;
25b583d7 5151 n_storage_fds = params->n_storage_fds;
d35fbf6b 5152 }
94f04347 5153
34cf6c43 5154 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
5155 if (r < 0)
5156 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
5157
f2341e0a 5158 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 5159 if (r < 0)
f2341e0a 5160 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5161
9f71ba8d
ZJS
5162 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5163 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5164 mac_selinux_maybe_reload();
5165
3ff67ec4
ZJS
5166 /* We won't know the real executable path until we create the mount namespace in the child, but we
5167 want to log from the parent, so we use the possibly inaccurate path here. */
5168 log_command_line(unit, "About to execute", command->path, command->argv);
12145637 5169
78f93209 5170 if (params->cgroup_path) {
a8b993dc 5171 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
78f93209
LP
5172 if (r < 0)
5173 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
18c1e481
LP
5174 if (r > 0) {
5175 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5176 * realized by the unit logic) */
5177
78f93209
LP
5178 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5179 if (r < 0)
a8b993dc 5180 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
78f93209
LP
5181 }
5182 }
5183
d35fbf6b
DM
5184 pid = fork();
5185 if (pid < 0)
74129a12 5186 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5187
/* Child side. exec_child() does not come back here after a successful exec; it returns only on a
 * failure (r < 0, with the failing step in exit_status) or when it decided to skip the command
 * (r >= 0, exit_status == EXIT_SUCCESS). Either way the child terminates with _exit(). */
5188 if (pid == 0) {
5fa01ac0 5189 int exit_status;
ff0af2a1 5190
f2341e0a
LP
5191 r = exec_child(unit,
5192 command,
ff0af2a1
LP
5193 context,
5194 params,
5195 runtime,
6bb00842 5196 cgroup_context,
ff0af2a1 5197 socket_fd,
52c239d7 5198 named_iofds,
4c47affc 5199 fds,
9b141911 5200 n_socket_fds,
25b583d7 5201 n_storage_fds,
ff0af2a1 5202 files_env,
00d9ef85 5203 unit->manager->user_lookup_fds[1],
12145637
LP
5204 &exit_status);
5205
e1714f02 5206 if (r < 0) {
5fa01ac0
ZJS
5207 const char *status = ASSERT_PTR(
5208 exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
e1714f02 5209
c2503e35
RH
5210 log_unit_struct_errno(unit, LOG_ERR, r,
5211 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5212 LOG_UNIT_INVOCATION_ID(unit),
5213 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5214 status, command->path),
5215 "EXECUTABLE=%s", command->path);
5fa01ac0
ZJS
5216 } else
5217 assert(exit_status == EXIT_SUCCESS);
4c2630eb 5218
ff0af2a1 5219 _exit(exit_status);
034c6ed7
LP
5220 }
5221
f2341e0a 5222 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5223
78f93209
LP
5224 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5225 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5226 * process will be killed too). */
5227 if (subcgroup_path)
5228 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5229
/* Parent side: hand the new PID to exec_status_start() for this command and report it to the caller. */
b58b4116 5230 exec_status_start(&command->exec_status, pid);
9fb86720 5231
034c6ed7 5232 *ret = pid;
5cb5a6ff
LP
5233 return 0;
5234}
5235
034c6ed7
LP
/* Assigns the default values of an ExecContext.
 *
 * Only the fields written below are touched; every other field keeps whatever the caller put there
 * (presumably the structure is zero-initialized beforehand; confirm at the call sites). c must be
 * non-NULL (asserted). */
5236void exec_context_init(ExecContext *c) {
5237 assert(c);
5238
4c12626c 5239 c->umask = 0022;
0692548c 5240 c->ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO;
94f04347 5241 c->cpu_sched_policy = SCHED_OTHER;
071830ff 5242 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 5243 c->syslog_level_prefix = true;
353e12c2 5244 c->ignore_sigpipe = true;
3a43da28 5245 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 5246 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
/* Every exec directory type starts out with mode 0755. */
5247 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5248 c->directories[t].mode = 0755;
12213aed 5249 c->timeout_clean_usec = USEC_INFINITY;
3fd5190b 5250 c->capability_bounding_set = CAP_MASK_UNSET;
aa9d574d
YW
/* The initial marker must differ from NAMESPACE_FLAGS_ALL, which the assert_cc() below checks. */
5251 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
5252 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 5253 c->log_level_max = -1;
005bfaf1
TM
5254#if HAVE_SECCOMP
5255 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
5256#endif
51462135
DDM
/* NOTE(review): UINT_MAX presumably means "no terminal size configured"; confirm against the users
 * of tty_rows/tty_cols. */
5257 c->tty_rows = UINT_MAX;
5258 c->tty_cols = UINT_MAX;
b070c7c0 5259 numa_policy_reset(&c->numa_policy);
24002121 5260 c->private_mounts = -1;
/* exec_child() only issues prctl(PR_SET_MEMORY_MERGE) when memory_ksm is >= 0, so -1 leaves the
 * process' KSM setting alone. */
85614c6e 5261 c->memory_ksm = -1;
034c6ed7
LP
5262}
5263
613b411c 5264void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5265 assert(c);
5266
6796073e
LP
5267 c->environment = strv_free(c->environment);
5268 c->environment_files = strv_free(c->environment_files);
b4c14404 5269 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5270 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5271
31ce987c 5272 rlimit_free_all(c->rlimit);
034c6ed7 5273
5b10116e 5274 for (size_t l = 0; l < 3; l++) {
52c239d7 5275 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5276 c->stdio_file[l] = mfree(c->stdio_file[l]);
5277 }
52c239d7 5278
a1e58e8e
LP
5279 c->working_directory = mfree(c->working_directory);
5280 c->root_directory = mfree(c->root_directory);
915e6d16 5281 c->root_image = mfree(c->root_image);
18d73705 5282 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5283 c->root_hash = mfree(c->root_hash);
5284 c->root_hash_size = 0;
5285 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5286 c->root_hash_sig = mfree(c->root_hash_sig);
5287 c->root_hash_sig_size = 0;
5288 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5289 c->root_verity = mfree(c->root_verity);
93f59701 5290 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5291 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5292 c->tty_path = mfree(c->tty_path);
5293 c->syslog_identifier = mfree(c->syslog_identifier);
5294 c->user = mfree(c->user);
5295 c->group = mfree(c->group);
034c6ed7 5296
6796073e 5297 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5298
a1e58e8e 5299 c->pam_name = mfree(c->pam_name);
5b6319dc 5300
2a624c36
AP
5301 c->read_only_paths = strv_free(c->read_only_paths);
5302 c->read_write_paths = strv_free(c->read_write_paths);
5303 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5304 c->exec_paths = strv_free(c->exec_paths);
5305 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5306 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5307
d2d6c096 5308 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5309 c->bind_mounts = NULL;
5310 c->n_bind_mounts = 0;
2abd4e38
YW
5311 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5312 c->temporary_filesystems = NULL;
5313 c->n_temporary_filesystems = 0;
b3d13314 5314 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5315
0985c7c4 5316 cpu_set_reset(&c->cpu_set);
b070c7c0 5317 numa_policy_reset(&c->numa_policy);
86a3475b 5318
a1e58e8e
LP
5319 c->utmp_id = mfree(c->utmp_id);
5320 c->selinux_context = mfree(c->selinux_context);
5321 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5322 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5323
9b412709 5324 c->restrict_filesystems = set_free_free(c->restrict_filesystems);
b1994387 5325
8cfa775f 5326 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5327 c->syscall_archs = set_free(c->syscall_archs);
5328 c->address_families = set_free(c->address_families);
e66cf1a3 5329
5b10116e 5330 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5331 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5332
5333 c->log_level_max = -1;
5334
5335 exec_context_free_log_extra_fields(c);
9b412709
FS
5336 c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5337 c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
08f3be7a 5338
5ac1530e
ZJS
5339 c->log_ratelimit_interval_usec = 0;
5340 c->log_ratelimit_burst = 0;
90fc172e 5341
08f3be7a
LP
5342 c->stdin_data = mfree(c->stdin_data);
5343 c->stdin_data_size = 0;
a8d08f39
LP
5344
5345 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5346 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5347
5348 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5349
43144be4 5350 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5351 c->set_credentials = hashmap_free(c->set_credentials);
9b412709 5352 c->import_credentials = set_free_free(c->import_credentials);
84be0c71
LP
5353
5354 c->root_image_policy = image_policy_free(c->root_image_policy);
5355 c->mount_image_policy = image_policy_free(c->mount_image_policy);
5356 c->extension_image_policy = image_policy_free(c->extension_image_policy);
e66cf1a3
LP
5357}
5358
34cf6c43 5359int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5360 assert(c);
5361
5362 if (!runtime_prefix)
5363 return 0;
5364
211a3d87 5365 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5366 _cleanup_free_ char *p = NULL;
e66cf1a3 5367
494d0247 5368 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5369 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5370 else
211a3d87 5371 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5372 if (!p)
5373 return -ENOMEM;
5374
7bc4bf4a
LP
5375 /* We execute this synchronously, since we need to be sure this is gone when we start the
5376 * service next. */
c6878637 5377 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5378
211a3d87
LB
5379 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5380 _cleanup_free_ char *symlink_abs = NULL;
5381
5382 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5383 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5384 else
5385 symlink_abs = path_join(runtime_prefix, *symlink);
5386 if (!symlink_abs)
5387 return -ENOMEM;
5388
5389 (void) unlink(symlink_abs);
5390 }
e66cf1a3
LP
5391 }
5392
5393 return 0;
5cb5a6ff
LP
5394}
5395
b9f976fb
MK
5396int exec_context_destroy_mount_ns_dir(Unit *u) {
5397 _cleanup_free_ char *p = NULL;
5398
5399 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5400 return 0;
5401
5402 p = path_join("/run/systemd/propagate/", u->id);
5403 if (!p)
5404 return -ENOMEM;
5405
5406 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5407 if (rmdir(p) < 0 && errno != ENOENT)
5408 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5409
5410 return 0;
5411}
5412
34cf6c43 5413static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5414 assert(c);
5415
a1e58e8e 5416 c->path = mfree(c->path);
6796073e 5417 c->argv = strv_free(c->argv);
43d0fcbd
LP
5418}
5419
da6053d0 5420void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5421 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5422 exec_command_done(c+i);
5423}
5424
f1acf85a 5425ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5426 ExecCommand *i;
5427
52e3671b 5428 while ((i = LIST_POP(command, c))) {
43d0fcbd 5429 exec_command_done(i);
5cb5a6ff
LP
5430 free(i);
5431 }
f1acf85a
ZJS
5432
5433 return NULL;
5cb5a6ff
LP
5434}
5435
da6053d0 5436void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5437 for (size_t i = 0; i < n; i++)
f1acf85a 5438 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5439}
5440
6a1d4d9f 5441void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5442 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5443 exec_status_reset(&c[i].exec_status);
5444}
5445
5446void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5447 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5448 LIST_FOREACH(command, z, c[i])
5449 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5450}
5451
039f0e70 5452typedef struct InvalidEnvInfo {
34cf6c43 5453 const Unit *unit;
039f0e70
LP
5454 const char *path;
5455} InvalidEnvInfo;
5456
5457static void invalid_env(const char *p, void *userdata) {
5458 InvalidEnvInfo *info = userdata;
5459
f2341e0a 5460 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5461}
5462
52c239d7
LB
5463const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5464 assert(c);
5465
5466 switch (fd_index) {
5073ff6b 5467
52c239d7
LB
5468 case STDIN_FILENO:
5469 if (c->std_input != EXEC_INPUT_NAMED_FD)
5470 return NULL;
5073ff6b 5471
52c239d7 5472 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5473
52c239d7
LB
5474 case STDOUT_FILENO:
5475 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5476 return NULL;
5073ff6b 5477
52c239d7 5478 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5479
52c239d7
LB
5480 case STDERR_FILENO:
5481 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5482 return NULL;
5073ff6b 5483
52c239d7 5484 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5485
52c239d7
LB
5486 default:
5487 return NULL;
5488 }
5489}
5490
2caa38e9
LP
5491static int exec_context_named_iofds(
5492 const ExecContext *c,
5493 const ExecParameters *p,
5494 int named_iofds[static 3]) {
5495
5b10116e 5496 size_t targets;
56fbd561 5497 const char* stdio_fdname[3];
da6053d0 5498 size_t n_fds;
52c239d7
LB
5499
5500 assert(c);
5501 assert(p);
2caa38e9 5502 assert(named_iofds);
52c239d7
LB
5503
5504 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5505 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5506 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5507
5b10116e 5508 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5509 stdio_fdname[i] = exec_context_fdname(c, i);
5510
4c47affc
FB
5511 n_fds = p->n_storage_fds + p->n_socket_fds;
5512
5b10116e 5513 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5514 if (named_iofds[STDIN_FILENO] < 0 &&
5515 c->std_input == EXEC_INPUT_NAMED_FD &&
5516 stdio_fdname[STDIN_FILENO] &&
5517 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5518
52c239d7
LB
5519 named_iofds[STDIN_FILENO] = p->fds[i];
5520 targets--;
56fbd561
ZJS
5521
5522 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5523 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5524 stdio_fdname[STDOUT_FILENO] &&
5525 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5526
52c239d7
LB
5527 named_iofds[STDOUT_FILENO] = p->fds[i];
5528 targets--;
56fbd561
ZJS
5529
5530 } else if (named_iofds[STDERR_FILENO] < 0 &&
5531 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5532 stdio_fdname[STDERR_FILENO] &&
5533 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5534
52c239d7
LB
5535 named_iofds[STDERR_FILENO] = p->fds[i];
5536 targets--;
5537 }
5538
56fbd561 5539 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5540}
5541
398a5009
ZJS
5542static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5543 _cleanup_strv_free_ char **v = NULL;
398a5009 5544 int r;
8c7be95e
LP
5545
5546 assert(c);
398a5009 5547 assert(ret);
8c7be95e
LP
5548
5549 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5550 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5551 bool ignore = false;
5552 char *fn = *i;
8c7be95e
LP
5553
5554 if (fn[0] == '-') {
5555 ignore = true;
313cefa1 5556 fn++;
8c7be95e
LP
5557 }
5558
5559 if (!path_is_absolute(fn)) {
8c7be95e
LP
5560 if (ignore)
5561 continue;
8c7be95e
LP
5562 return -EINVAL;
5563 }
5564
2bef10ab 5565 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5566 r = safe_glob(fn, 0, &pglob);
5567 if (r < 0) {
2bef10ab
PL
5568 if (ignore)
5569 continue;
398a5009 5570 return r;
2bef10ab 5571 }
8c7be95e 5572
d8c92e8b
ZJS
5573 /* When we don't match anything, -ENOENT should be returned */
5574 assert(pglob.gl_pathc > 0);
5575
fcc06682 5576 for (size_t n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5577 _cleanup_strv_free_ char **p = NULL;
5578
5579 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5580 if (r < 0) {
2bef10ab
PL
5581 if (ignore)
5582 continue;
398a5009 5583 return r;
e9c1ea9d 5584 }
398a5009 5585
ebc05a09 5586 /* Log invalid environment variables with filename */
039f0e70
LP
5587 if (p) {
5588 InvalidEnvInfo info = {
f2341e0a 5589 .unit = unit,
039f0e70
LP
5590 .path = pglob.gl_pathv[n]
5591 };
5592
5593 p = strv_env_clean_with_callback(p, invalid_env, &info);
5594 }
8c7be95e 5595
398a5009
ZJS
5596 if (!v)
5597 v = TAKE_PTR(p);
2bef10ab 5598 else {
398a5009 5599 char **m = strv_env_merge(v, p);
c84a9488 5600 if (!m)
2bef10ab 5601 return -ENOMEM;
2bef10ab 5602
398a5009 5603 strv_free_and_replace(v, m);
2bef10ab 5604 }
8c7be95e
LP
5605 }
5606 }
5607
398a5009 5608 *ret = TAKE_PTR(v);
8c7be95e
LP
5609
5610 return 0;
5611}
5612
6ac8fdc9 5613static bool tty_may_match_dev_console(const char *tty) {
7b912648 5614 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5615
1e22b5cd
LP
5616 if (!tty)
5617 return true;
5618
a119ec7c 5619 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5620
5621 /* trivial identity? */
5622 if (streq(tty, "console"))
5623 return true;
5624
7b912648
LP
5625 if (resolve_dev_console(&resolved) < 0)
5626 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5627
5628 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5629 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5630}
5631
6c0ae739
LP
5632static bool exec_context_may_touch_tty(const ExecContext *ec) {
5633 assert(ec);
1e22b5cd 5634
6c0ae739 5635 return ec->tty_reset ||
1e22b5cd
LP
5636 ec->tty_vhangup ||
5637 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5638 is_terminal_input(ec->std_input) ||
5639 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5640 is_terminal_output(ec->std_error);
5641}
5642
5643bool exec_context_may_touch_console(const ExecContext *ec) {
5644
5645 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5646 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5647}
5648
/* Writes every string of the NULL-terminated vector l to f, each one preceded by a single space.
 * A NULL vector writes nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **item = l; item && *item; item++) {
                fputc(' ', f);
                fputs(*item, f);
        }
}
5655
ddc155b2
TM
/* Writes "<prefix><name>: <item> <item> ..." followed by a newline to f. Nothing is written when the
 * vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5667
34cf6c43 5668void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5669 int r;
9eba9da4 5670
5cb5a6ff
LP
5671 assert(c);
5672 assert(f);
5673
4ad49000 5674 prefix = strempty(prefix);
5cb5a6ff
LP
5675
5676 fprintf(f,
94f04347
LP
5677 "%sUMask: %04o\n"
5678 "%sWorkingDirectory: %s\n"
451a074f 5679 "%sRootDirectory: %s\n"
9c0c6701 5680 "%sRootEphemeral: %s\n"
15ae422b 5681 "%sNonBlocking: %s\n"
64747e2d 5682 "%sPrivateTmp: %s\n"
7f112f50 5683 "%sPrivateDevices: %s\n"
59eeb84b 5684 "%sProtectKernelTunables: %s\n"
e66a2f65 5685 "%sProtectKernelModules: %s\n"
84703040 5686 "%sProtectKernelLogs: %s\n"
fc64760d 5687 "%sProtectClock: %s\n"
59eeb84b 5688 "%sProtectControlGroups: %s\n"
d251207d
LP
5689 "%sPrivateNetwork: %s\n"
5690 "%sPrivateUsers: %s\n"
1b8689f9
LP
5691 "%sProtectHome: %s\n"
5692 "%sProtectSystem: %s\n"
5d997827 5693 "%sMountAPIVFS: %s\n"
f3e43635 5694 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5695 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5696 "%sRestrictRealtime: %s\n"
f69567cb 5697 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5698 "%sKeyringMode: %s\n"
4e399953
LP
5699 "%sProtectHostname: %s\n"
5700 "%sProtectProc: %s\n"
5701 "%sProcSubset: %s\n",
5cb5a6ff 5702 prefix, c->umask,
14eb3285
LP
5703 prefix, empty_to_root(c->working_directory),
5704 prefix, empty_to_root(c->root_directory),
9c0c6701 5705 prefix, yes_no(c->root_ephemeral),
15ae422b 5706 prefix, yes_no(c->non_blocking),
64747e2d 5707 prefix, yes_no(c->private_tmp),
7f112f50 5708 prefix, yes_no(c->private_devices),
59eeb84b 5709 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5710 prefix, yes_no(c->protect_kernel_modules),
84703040 5711 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5712 prefix, yes_no(c->protect_clock),
59eeb84b 5713 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5714 prefix, yes_no(c->private_network),
5715 prefix, yes_no(c->private_users),
1b8689f9
LP
5716 prefix, protect_home_to_string(c->protect_home),
5717 prefix, protect_system_to_string(c->protect_system),
5e98086d 5718 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5719 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5720 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5721 prefix, yes_no(c->restrict_realtime),
f69567cb 5722 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5723 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5724 prefix, yes_no(c->protect_hostname),
5725 prefix, protect_proc_to_string(c->protect_proc),
5726 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5727
915e6d16
LP
5728 if (c->root_image)
5729 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5730
18d73705 5731 if (c->root_image_options) {
18d73705
LB
5732 fprintf(f, "%sRootImageOptions:", prefix);
5733 LIST_FOREACH(mount_options, o, c->root_image_options)
5734 if (!isempty(o->options))
9ece6444
LB
5735 fprintf(f, " %s:%s",
5736 partition_designator_to_string(o->partition_designator),
5737 o->options);
18d73705
LB
5738 fprintf(f, "\n");
5739 }
5740
0389f4fa
LB
5741 if (c->root_hash) {
5742 _cleanup_free_ char *encoded = NULL;
5743 encoded = hexmem(c->root_hash, c->root_hash_size);
5744 if (encoded)
5745 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5746 }
5747
5748 if (c->root_hash_path)
5749 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5750
d4d55b0d
LB
5751 if (c->root_hash_sig) {
5752 _cleanup_free_ char *encoded = NULL;
5753 ssize_t len;
5754 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5755 if (len)
5756 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5757 }
5758
5759 if (c->root_hash_sig_path)
5760 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5761
0389f4fa
LB
5762 if (c->root_verity)
5763 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5764
8c7be95e
LP
5765 STRV_FOREACH(e, c->environment)
5766 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5767
5768 STRV_FOREACH(e, c->environment_files)
5769 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5770
b4c14404
FB
5771 STRV_FOREACH(e, c->pass_environment)
5772 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5773
00819cc1
LP
5774 STRV_FOREACH(e, c->unset_environment)
5775 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5776
53f47dfc
YW
5777 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5778
5b10116e 5779 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5780 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5781
211a3d87
LB
5782 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5783 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5784
5785 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5786 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5787 }
3536f49e 5788 }
c2bbd90b 5789
5291f26d 5790 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5791
fb33a393 5792 if (c->nice_set)
5291f26d 5793 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5794
dd6c17b1 5795 if (c->oom_score_adjust_set)
5291f26d 5796 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5797
ad21e542 5798 if (c->coredump_filter_set)
5291f26d 5799 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5800
5b10116e 5801 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5802 if (c->rlimit[i]) {
4c3a2b84 5803 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5804 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5805 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5806 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5807 }
94f04347 5808
f8b69d1d 5809 if (c->ioprio_set) {
1756a011 5810 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5811
5bead76e 5812 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5813 if (r >= 0)
5814 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5815
5bead76e 5816 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5817 }
94f04347 5818
f8b69d1d 5819 if (c->cpu_sched_set) {
1756a011 5820 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5821
837df140
YW
5822 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5823 if (r >= 0)
5824 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5825
94f04347 5826 fprintf(f,
38b48754
LP
5827 "%sCPUSchedulingPriority: %i\n"
5828 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5829 prefix, c->cpu_sched_priority,
5830 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5831 }
94f04347 5832
0985c7c4 5833 if (c->cpu_set.set) {
e7fca352
MS
5834 _cleanup_free_ char *affinity = NULL;
5835
5836 affinity = cpu_set_to_range_string(&c->cpu_set);
5837 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5838 }
5839
b070c7c0
MS
5840 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5841 _cleanup_free_ char *nodes = NULL;
5842
5843 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5844 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5845 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5846 }
5847
3a43da28 5848 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5849 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5850
5851 fprintf(f,
80876c20
LP
5852 "%sStandardInput: %s\n"
5853 "%sStandardOutput: %s\n"
5854 "%sStandardError: %s\n",
5855 prefix, exec_input_to_string(c->std_input),
5856 prefix, exec_output_to_string(c->std_output),
5857 prefix, exec_output_to_string(c->std_error));
5858
befc4a80
LP
5859 if (c->std_input == EXEC_INPUT_NAMED_FD)
5860 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5861 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5862 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5863 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5864 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5865
5866 if (c->std_input == EXEC_INPUT_FILE)
5867 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5868 if (c->std_output == EXEC_OUTPUT_FILE)
5869 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5870 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5871 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5872 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5873 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5874 if (c->std_error == EXEC_OUTPUT_FILE)
5875 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5876 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5877 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5878 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5879 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5880
80876c20
LP
5881 if (c->tty_path)
5882 fprintf(f,
6ea832a2
LP
5883 "%sTTYPath: %s\n"
5884 "%sTTYReset: %s\n"
5885 "%sTTYVHangup: %s\n"
51462135
DDM
5886 "%sTTYVTDisallocate: %s\n"
5887 "%sTTYRows: %u\n"
5888 "%sTTYColumns: %u\n",
6ea832a2
LP
5889 prefix, c->tty_path,
5890 prefix, yes_no(c->tty_reset),
5891 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5892 prefix, yes_no(c->tty_vt_disallocate),
5893 prefix, c->tty_rows,
5894 prefix, c->tty_cols);
94f04347 5895
9f6444eb 5896 if (IN_SET(c->std_output,
9f6444eb
LP
5897 EXEC_OUTPUT_KMSG,
5898 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5899 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5900 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5901 IN_SET(c->std_error,
9f6444eb
LP
5902 EXEC_OUTPUT_KMSG,
5903 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5904 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5905 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5906
5ce70e5b 5907 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5908
837df140
YW
5909 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5910 if (r >= 0)
5911 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5912
837df140
YW
5913 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5914 if (r >= 0)
5915 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5916 }
94f04347 5917
d3070fbd
LP
5918 if (c->log_level_max >= 0) {
5919 _cleanup_free_ char *t = NULL;
5920
5921 (void) log_level_to_string_alloc(c->log_level_max, &t);
5922
5923 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5924 }
5925
5291f26d 5926 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5927 fprintf(f,
5928 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5929 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5930
5ac1530e
ZJS
5931 if (c->log_ratelimit_burst > 0)
5932 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5933
523ea123
QD
5934 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5935 fprintf(f, "%sLogFilterPatterns:", prefix);
5936
5937 char *pattern;
5938 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
5939 fprintf(f, " %s", pattern);
5940 SET_FOREACH(pattern, c->log_filter_denied_patterns)
5941 fprintf(f, " ~%s", pattern);
5942 fputc('\n', f);
5943 }
5944
5b10116e
ZJS
5945 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5946 fprintf(f, "%sLogExtraFields: ", prefix);
5947 fwrite(c->log_extra_fields[j].iov_base,
5948 1, c->log_extra_fields[j].iov_len,
5949 f);
5950 fputc('\n', f);
d3070fbd
LP
5951 }
5952
91dd5f7c
LP
5953 if (c->log_namespace)
5954 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5955
07d46372
YW
5956 if (c->secure_bits) {
5957 _cleanup_free_ char *str = NULL;
5958
5959 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5960 if (r >= 0)
5961 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5962 }
94f04347 5963
3fd5190b 5964 if (c->capability_bounding_set != CAP_MASK_UNSET) {
dd1f5bd0 5965 _cleanup_free_ char *str = NULL;
94f04347 5966
8142d735 5967 r = capability_set_to_string(c->capability_bounding_set, &str);
dd1f5bd0
YW
5968 if (r >= 0)
5969 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5970 }
5971
5972 if (c->capability_ambient_set != 0) {
dd1f5bd0 5973 _cleanup_free_ char *str = NULL;
755d4b67 5974
8142d735 5975 r = capability_set_to_string(c->capability_ambient_set, &str);
dd1f5bd0
YW
5976 if (r >= 0)
5977 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5978 }
5979
5980 if (c->user)
f2d3769a 5981 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5982 if (c->group)
f2d3769a 5983 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5984
29206d46
LP
5985 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5986
ddc155b2 5987 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5988
5b6319dc 5989 if (c->pam_name)
f2d3769a 5990 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5991
ddc155b2
TM
5992 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5993 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5994 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5995 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5996 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 5997 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 5998
5b10116e
ZJS
5999 for (size_t i = 0; i < c->n_bind_mounts; i++)
6000 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6001 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6002 c->bind_mounts[i].ignore_enoent ? "-": "",
6003 c->bind_mounts[i].source,
6004 c->bind_mounts[i].destination,
6005 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6006
5b10116e
ZJS
6007 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6008 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6009
5b10116e
ZJS
6010 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6011 t->path,
6012 isempty(t->options) ? "" : ":",
6013 strempty(t->options));
6014 }
2abd4e38 6015
169c1bda
LP
6016 if (c->utmp_id)
6017 fprintf(f,
6018 "%sUtmpIdentifier: %s\n",
6019 prefix, c->utmp_id);
7b52a628
MS
6020
6021 if (c->selinux_context)
6022 fprintf(f,
5f8640fb
LP
6023 "%sSELinuxContext: %s%s\n",
6024 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6025
80c21aea
WC
6026 if (c->apparmor_profile)
6027 fprintf(f,
6028 "%sAppArmorProfile: %s%s\n",
6029 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6030
6031 if (c->smack_process_label)
6032 fprintf(f,
6033 "%sSmackProcessLabel: %s%s\n",
6034 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6035
050f7277 6036 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6037 fprintf(f,
6038 "%sPersonality: %s\n",
6039 prefix, strna(personality_to_string(c->personality)));
6040
78e864e5
TM
6041 fprintf(f,
6042 "%sLockPersonality: %s\n",
6043 prefix, yes_no(c->lock_personality));
6044
17df7223 6045 if (c->syscall_filter) {
17df7223 6046 fprintf(f,
57183d11 6047 "%sSystemCallFilter: ",
17df7223
LP
6048 prefix);
6049
6b000af4 6050 if (!c->syscall_allow_list)
17df7223
LP
6051 fputc('~', f);
6052
349cc4a5 6053#if HAVE_SECCOMP
d5a99b7c
JJ
6054 void *id, *val;
6055 bool first = true;
90e74a66 6056 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6057 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6058 const char *errno_name = NULL;
6059 int num = PTR_TO_INT(val);
17df7223
LP
6060
6061 if (first)
6062 first = false;
6063 else
6064 fputc(' ', f);
6065
57183d11 6066 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6067 fputs(strna(name), f);
8cfa775f
YW
6068
6069 if (num >= 0) {
005bfaf1 6070 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6071 if (errno_name)
6072 fprintf(f, ":%s", errno_name);
6073 else
6074 fprintf(f, ":%d", num);
6075 }
17df7223 6076 }
351a19b1 6077#endif
17df7223
LP
6078
6079 fputc('\n', f);
6080 }
6081
57183d11 6082 if (c->syscall_archs) {
57183d11
LP
6083 fprintf(f,
6084 "%sSystemCallArchitectures:",
6085 prefix);
6086
349cc4a5 6087#if HAVE_SECCOMP
d5a99b7c 6088 void *id;
90e74a66 6089 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6090 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6091#endif
6092 fputc('\n', f);
6093 }
6094
add00535
LP
6095 if (exec_context_restrict_namespaces_set(c)) {
6096 _cleanup_free_ char *s = NULL;
6097
86c2a9f1 6098 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6099 if (r >= 0)
6100 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6101 prefix, strna(s));
add00535
LP
6102 }
6103
b1994387 6104#if HAVE_LIBBPF
8fe84dc8
YW
6105 if (exec_context_restrict_filesystems_set(c)) {
6106 char *fs;
6107 SET_FOREACH(fs, c->restrict_filesystems)
6108 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6109 }
b1994387
ILG
6110#endif
6111
a8d08f39
LP
6112 if (c->network_namespace_path)
6113 fprintf(f,
6114 "%sNetworkNamespacePath: %s\n",
6115 prefix, c->network_namespace_path);
6116
3df90f24 6117 if (c->syscall_errno > 0) {
3df90f24
YW
6118 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6119
005bfaf1 6120#if HAVE_SECCOMP
d5a99b7c 6121 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6122 if (errno_name)
005bfaf1 6123 fputs(errno_name, f);
3df90f24 6124 else
005bfaf1
TM
6125 fprintf(f, "%d", c->syscall_errno);
6126#endif
6127 fputc('\n', f);
3df90f24 6128 }
b3d13314 6129
5b10116e 6130 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6131 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6132 c->mount_images[i].ignore_enoent ? "-": "",
6133 c->mount_images[i].source,
79e20ceb 6134 c->mount_images[i].destination);
427353f6 6135 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6136 fprintf(f, ":%s:%s",
427353f6 6137 partition_designator_to_string(o->partition_designator),
79e20ceb 6138 strempty(o->options));
427353f6
LB
6139 fprintf(f, "\n");
6140 }
93f59701
LB
6141
6142 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6143 fprintf(f, "%sExtensionImages: %s%s", prefix,
6144 c->extension_images[i].ignore_enoent ? "-": "",
6145 c->extension_images[i].source);
6146 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6147 fprintf(f, ":%s:%s",
6148 partition_designator_to_string(o->partition_designator),
6149 strempty(o->options));
6150 fprintf(f, "\n");
6151 }
a07b9926
LB
6152
6153 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6154}
6155
34cf6c43 6156bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6157 assert(c);
6158
61233823 6159 /* Returns true if the process forked off would run under
a931ad47
LP
6160 * an unchanged UID or as root. */
6161
6162 if (!c->user)
6163 return true;
6164
6165 if (streq(c->user, "root") || streq(c->user, "0"))
6166 return true;
6167
6168 return false;
6169}
6170
34cf6c43 6171int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6172 int p;
6173
6174 assert(c);
6175
6176 if (c->ioprio_set)
6177 return c->ioprio;
6178
6179 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6180 if (p < 0)
0692548c 6181 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6182
8b330d7d 6183 return ioprio_normalize(p);
7f452159
LP
6184}
6185
5e98086d
ZJS
6186bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6187 assert(c);
6188
61198784 6189 /* Explicit setting wins */
5e98086d
ZJS
6190 if (c->mount_apivfs_set)
6191 return c->mount_apivfs;
6192
61198784 6193 /* Default to "yes" if root directory or image are specified */
74e12520 6194 if (exec_context_with_rootfs(c))
61198784
ZJS
6195 return true;
6196
5e98086d
ZJS
6197 return false;
6198}
6199
d3070fbd 6200void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6201 assert(c);
6202
5b10116e 6203 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6204 free(c->log_extra_fields[l].iov_base);
6205 c->log_extra_fields = mfree(c->log_extra_fields);
6206 c->n_log_extra_fields = 0;
6207}
6208
6f765baf 6209void exec_context_revert_tty(ExecContext *c) {
254d1313 6210 _cleanup_close_ int fd = -EBADF;
0ba976e8
LP
6211 const char *path;
6212 struct stat st;
6f765baf
LP
6213 int r;
6214
6215 assert(c);
6216
6217 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6218 exec_context_tty_reset(c, NULL);
6219
6220 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6221 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6222 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6223 if (!exec_context_may_touch_tty(c))
6224 return;
6f765baf 6225
0ba976e8
LP
6226 path = exec_context_tty_path(c);
6227 if (!path)
6228 return;
6f765baf 6229
0ba976e8
LP
6230 fd = open(path, O_PATH|O_CLOEXEC);
6231 if (fd < 0)
6232 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6233 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6234 path);
6235
6236 if (fstat(fd, &st) < 0)
6237 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6238
6239 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6240 * if things are a character device, since a proper check either means we'd have to open the TTY and
6241 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6242 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6243 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6244 if (!S_ISCHR(st.st_mode))
6245 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6246
6247 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6248 if (r < 0)
6249 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6250}
6251
4c2f5842
LP
6252int exec_context_get_clean_directories(
6253 ExecContext *c,
6254 char **prefix,
6255 ExecCleanMask mask,
6256 char ***ret) {
6257
6258 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6259 int r;
6260
6261 assert(c);
6262 assert(prefix);
6263 assert(ret);
6264
5b10116e 6265 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6266 if (!FLAGS_SET(mask, 1U << t))
6267 continue;
6268
6269 if (!prefix[t])
6270 continue;
6271
211a3d87 6272 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6273 char *j;
6274
211a3d87 6275 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6276 if (!j)
6277 return -ENOMEM;
6278
6279 r = strv_consume(&l, j);
6280 if (r < 0)
6281 return r;
7f622a19
YW
6282
6283 /* Also remove private directories unconditionally. */
6284 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6285 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6286 if (!j)
6287 return -ENOMEM;
6288
6289 r = strv_consume(&l, j);
6290 if (r < 0)
6291 return r;
6292 }
6293
211a3d87
LB
6294 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6295 j = path_join(prefix[t], *symlink);
7f622a19
YW
6296 if (!j)
6297 return -ENOMEM;
6298
6299 r = strv_consume(&l, j);
6300 if (r < 0)
6301 return r;
6302 }
4c2f5842
LP
6303 }
6304 }
6305
6306 *ret = TAKE_PTR(l);
6307 return 0;
6308}
6309
6310int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6311 ExecCleanMask mask = 0;
6312
6313 assert(c);
6314 assert(ret);
6315
6316 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6317 if (c->directories[t].n_items > 0)
4c2f5842
LP
6318 mask |= 1U << t;
6319
6320 *ret = mask;
6321 return 0;
6322}
6323
b58b4116 6324void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6325 assert(s);
5cb5a6ff 6326
2ed26ed0
LP
6327 *s = (ExecStatus) {
6328 .pid = pid,
6329 };
6330
b58b4116
LP
6331 dual_timestamp_get(&s->start_timestamp);
6332}
6333
34cf6c43 6334void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6335 assert(s);
6336
d46b79bb 6337 if (s->pid != pid)
2ed26ed0
LP
6338 *s = (ExecStatus) {
6339 .pid = pid,
6340 };
b58b4116 6341
63983207 6342 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6343
034c6ed7
LP
6344 s->code = code;
6345 s->status = status;
169c1bda 6346
6f765baf
LP
6347 if (context && context->utmp_id)
6348 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6349}
6350
6a1d4d9f
LP
6351void exec_status_reset(ExecStatus *s) {
6352 assert(s);
6353
6354 *s = (ExecStatus) {};
6355}
6356
34cf6c43 6357void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6358 assert(s);
6359 assert(f);
6360
9fb86720
LP
6361 if (s->pid <= 0)
6362 return;
6363
4c940960
LP
6364 prefix = strempty(prefix);
6365
9fb86720 6366 fprintf(f,
ccd06097
ZJS
6367 "%sPID: "PID_FMT"\n",
6368 prefix, s->pid);
9fb86720 6369
af9d16e1 6370 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6371 fprintf(f,
6372 "%sStart Timestamp: %s\n",
04f5c018 6373 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6374
af9d16e1 6375 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6376 fprintf(f,
6377 "%sExit Timestamp: %s\n"
6378 "%sExit Code: %s\n"
6379 "%sExit Status: %i\n",
04f5c018 6380 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6381 prefix, sigchld_code_to_string(s->code),
6382 prefix, s->status);
5cb5a6ff 6383}
44d8db9e 6384
34cf6c43 6385static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6386 _cleanup_free_ char *cmd = NULL;
4c940960 6387 const char *prefix2;
44d8db9e
LP
6388
6389 assert(c);
6390 assert(f);
6391
4c940960 6392 prefix = strempty(prefix);
63c372cb 6393 prefix2 = strjoina(prefix, "\t");
44d8db9e 6394
4ef15008 6395 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
38553034 6396
44d8db9e
LP
6397 fprintf(f,
6398 "%sCommand Line: %s\n",
38553034 6399 prefix, strnull(cmd));
44d8db9e 6400
9fb86720 6401 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6402}
6403
6404void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6405 assert(f);
6406
4c940960 6407 prefix = strempty(prefix);
44d8db9e 6408
03677889
YW
6409 LIST_FOREACH(command, i, c)
6410 exec_command_dump(i, f, prefix);
44d8db9e 6411}
94f04347 6412
a6a80b4f
LP
6413void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6414 ExecCommand *end;
6415
6416 assert(l);
6417 assert(e);
6418
6419 if (*l) {
35b8ca3a 6420 /* It's kind of important, that we keep the order here */
cc232fa0 6421 end = LIST_FIND_TAIL(command, *l);
71fda00f 6422 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f 6423 } else
3ff67ec4 6424 *l = e;
a6a80b4f
LP
6425}
6426
26fd040d
LP
6427int exec_command_set(ExecCommand *c, const char *path, ...) {
6428 va_list ap;
6429 char **l, *p;
6430
6431 assert(c);
6432 assert(path);
6433
6434 va_start(ap, path);
6435 l = strv_new_ap(path, ap);
6436 va_end(ap);
6437
6438 if (!l)
6439 return -ENOMEM;
6440
250a918d
LP
6441 p = strdup(path);
6442 if (!p) {
26fd040d
LP
6443 strv_free(l);
6444 return -ENOMEM;
6445 }
6446
6897dfe8 6447 free_and_replace(c->path, p);
26fd040d 6448
130d3d22 6449 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6450}
6451
86b23b07 6452int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6453 _cleanup_strv_free_ char **l = NULL;
86b23b07 6454 va_list ap;
86b23b07
JS
6455 int r;
6456
6457 assert(c);
6458 assert(path);
6459
6460 va_start(ap, path);
6461 l = strv_new_ap(path, ap);
6462 va_end(ap);
6463
6464 if (!l)
6465 return -ENOMEM;
6466
e287086b 6467 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6468 if (r < 0)
86b23b07 6469 return r;
86b23b07
JS
6470
6471 return 0;
6472}
6473
437f3e35
LP
6474static char *destroy_tree(char *path) {
6475 if (!path)
6476 return NULL;
9c0c6701 6477
437f3e35
LP
6478 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6479 log_debug("Spawning process to nuke '%s'", path);
9c0c6701 6480
437f3e35
LP
6481 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6482 }
9c0c6701 6483
437f3e35 6484 return mfree(path);
9c0c6701
DDM
6485}
6486
e52a696a 6487static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
e8a565cb
YW
6488 if (!rt)
6489 return NULL;
6490
6491 if (rt->manager)
e76506b7 6492 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
e8a565cb 6493
e52a696a
DDM
6494 rt->id = mfree(rt->id);
6495 rt->tmp_dir = mfree(rt->tmp_dir);
6496 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6497 safe_close_pair(rt->netns_storage_socket);
6498 safe_close_pair(rt->ipcns_storage_socket);
6499 return mfree(rt);
6500}
6501
6502DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6503DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6504
6505ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
e52a696a
DDM
6506 if (!rt)
6507 return NULL;
6508
6509 assert(rt->n_ref > 0);
6510 rt->n_ref--;
6511
6512 if (rt->n_ref > 0)
6513 return NULL;
56a13a49 6514
437f3e35
LP
6515 rt->tmp_dir = destroy_tree(rt->tmp_dir);
6516 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
e8a565cb 6517
e52a696a 6518 return exec_shared_runtime_free(rt);
e8a565cb
YW
6519}
6520
e76506b7 6521static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
56a13a49 6522 _cleanup_free_ char *id_copy = NULL;
e76506b7 6523 ExecSharedRuntime *n;
613b411c 6524
8e8009dc 6525 assert(ret);
613b411c 6526
56a13a49
ZJS
6527 id_copy = strdup(id);
6528 if (!id_copy)
6529 return -ENOMEM;
6530
e76506b7 6531 n = new(ExecSharedRuntime, 1);
8e8009dc 6532 if (!n)
613b411c
LP
6533 return -ENOMEM;
6534
e76506b7 6535 *n = (ExecSharedRuntime) {
56a13a49 6536 .id = TAKE_PTR(id_copy),
19ee48a6
YW
6537 .netns_storage_socket = PIPE_EBADF,
6538 .ipcns_storage_socket = PIPE_EBADF,
8e8009dc
LP
6539 };
6540
6541 *ret = n;
613b411c
LP
6542 return 0;
6543}
6544
e76506b7 6545static int exec_shared_runtime_add(
e8a565cb
YW
6546 Manager *m,
6547 const char *id,
56a13a49
ZJS
6548 char **tmp_dir,
6549 char **var_tmp_dir,
6550 int netns_storage_socket[2],
a70581ff 6551 int ipcns_storage_socket[2],
e76506b7 6552 ExecSharedRuntime **ret) {
e8a565cb 6553
e76506b7 6554 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
613b411c
LP
6555 int r;
6556
e8a565cb 6557 assert(m);
613b411c
LP
6558 assert(id);
6559
a70581ff 6560 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6561
e76506b7 6562 r = exec_shared_runtime_allocate(&rt, id);
613b411c
LP
6563 if (r < 0)
6564 return r;
6565
e76506b7 6566 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6567 if (r < 0)
6568 return r;
e8a565cb 6569
56a13a49
ZJS
6570 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6571 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6572 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6573
6574 if (netns_storage_socket) {
56a13a49
ZJS
6575 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6576 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6577 }
6578
a70581ff
XR
6579 if (ipcns_storage_socket) {
6580 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6581 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6582 }
6583
e8a565cb
YW
6584 rt->manager = m;
6585
6586 if (ret)
6587 *ret = rt;
e76506b7 6588 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
56a13a49 6589 TAKE_PTR(rt);
e8a565cb
YW
6590 return 0;
6591}
6592
e76506b7 6593static int exec_shared_runtime_make(
74aaf59b
LP
6594 Manager *m,
6595 const ExecContext *c,
6596 const char *id,
e76506b7 6597 ExecSharedRuntime **ret) {
74aaf59b 6598
56a13a49 6599 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
19ee48a6 6600 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
e8a565cb
YW
6601 int r;
6602
6603 assert(m);
6604 assert(c);
6605 assert(id);
6606
e76506b7 6607 /* It is not necessary to create ExecSharedRuntime object. */
fde36d25 6608 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
74aaf59b 6609 *ret = NULL;
e8a565cb 6610 return 0;
74aaf59b 6611 }
e8a565cb 6612
efa2f3a1
TM
6613 if (c->private_tmp &&
6614 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6615 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6616 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6617 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6618 if (r < 0)
6619 return r;
6620 }
6621
fbbb9697 6622 if (exec_needs_network_namespace(c)) {
e8a565cb
YW
6623 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6624 return -errno;
6625 }
6626
fde36d25 6627 if (exec_needs_ipc_namespace(c)) {
a70581ff
XR
6628 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6629 return -errno;
6630 }
6631
e76506b7 6632 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6633 if (r < 0)
6634 return r;
6635
613b411c
LP
6636 return 1;
6637}
6638
e76506b7
DDM
6639int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6640 ExecSharedRuntime *rt;
e8a565cb 6641 int r;
613b411c 6642
e8a565cb
YW
6643 assert(m);
6644 assert(id);
6645 assert(ret);
6646
e76506b7 6647 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
e8a565cb 6648 if (rt)
e76506b7 6649 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6650 goto ref;
6651
74aaf59b
LP
6652 if (!create) {
6653 *ret = NULL;
e8a565cb 6654 return 0;
74aaf59b 6655 }
e8a565cb
YW
6656
6657 /* If not found, then create a new object. */
e76506b7 6658 r = exec_shared_runtime_make(m, c, id, &rt);
74aaf59b 6659 if (r < 0)
e8a565cb 6660 return r;
74aaf59b 6661 if (r == 0) {
e76506b7 6662 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
74aaf59b
LP
6663 *ret = NULL;
6664 return 0;
6665 }
613b411c 6666
e8a565cb
YW
6667ref:
6668 /* increment reference counter. */
6669 rt->n_ref++;
6670 *ret = rt;
6671 return 1;
6672}
613b411c 6673
e76506b7
DDM
6674int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6675 ExecSharedRuntime *rt;
e8a565cb
YW
6676
6677 assert(m);
613b411c
LP
6678 assert(f);
6679 assert(fds);
6680
e76506b7 6681 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
e8a565cb 6682 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6683
e8a565cb
YW
6684 if (rt->tmp_dir)
6685 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6686
e8a565cb
YW
6687 if (rt->var_tmp_dir)
6688 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6689
e8a565cb
YW
6690 if (rt->netns_storage_socket[0] >= 0) {
6691 int copy;
613b411c 6692
e8a565cb
YW
6693 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6694 if (copy < 0)
6695 return copy;
613b411c 6696
e8a565cb
YW
6697 fprintf(f, " netns-socket-0=%i", copy);
6698 }
613b411c 6699
e8a565cb
YW
6700 if (rt->netns_storage_socket[1] >= 0) {
6701 int copy;
613b411c 6702
e8a565cb
YW
6703 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6704 if (copy < 0)
6705 return copy;
613b411c 6706
e8a565cb
YW
6707 fprintf(f, " netns-socket-1=%i", copy);
6708 }
6709
a70581ff
XR
6710 if (rt->ipcns_storage_socket[0] >= 0) {
6711 int copy;
6712
6713 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6714 if (copy < 0)
6715 return copy;
6716
6717 fprintf(f, " ipcns-socket-0=%i", copy);
6718 }
6719
6720 if (rt->ipcns_storage_socket[1] >= 0) {
6721 int copy;
6722
6723 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6724 if (copy < 0)
6725 return copy;
6726
6727 fprintf(f, " ipcns-socket-1=%i", copy);
6728 }
6729
e8a565cb 6730 fputc('\n', f);
613b411c
LP
6731 }
6732
6733 return 0;
6734}
6735
e76506b7
DDM
6736int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6737 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
6738 ExecSharedRuntime *rt;
613b411c
LP
6739 int r;
6740
e8a565cb
YW
6741 /* This is for the migration from old (v237 or earlier) deserialization text.
6742 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
e76506b7 6743 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
e8a565cb
YW
6744 * so or not from the serialized text, then we always creates a new object owned by this. */
6745
6746 assert(u);
613b411c
LP
6747 assert(key);
6748 assert(value);
6749
e76506b7 6750 /* Manager manages ExecSharedRuntime objects by the unit id.
e8a565cb
YW
6751 * So, we omit the serialized text when the unit does not have id (yet?)... */
6752 if (isempty(u->id)) {
6753 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6754 return 0;
6755 }
613b411c 6756
e76506b7 6757 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
cbc165d1 6758 return log_oom();
e8a565cb 6759
e76506b7 6760 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
e8a565cb 6761 if (!rt) {
e76506b7 6762 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6763 return log_oom();
613b411c 6764
e8a565cb
YW
6765 rt = rt_create;
6766 }
6767
6768 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6769 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6770 return -ENOMEM;
613b411c
LP
6771
6772 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6773 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6774 return -ENOMEM;
613b411c
LP
6775
6776 } else if (streq(key, "netns-socket-0")) {
6777 int fd;
6778
e652663a 6779 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6780 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6781 return 0;
613b411c 6782 }
e8a565cb
YW
6783
6784 safe_close(rt->netns_storage_socket[0]);
6785 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6786
613b411c
LP
6787 } else if (streq(key, "netns-socket-1")) {
6788 int fd;
6789
e652663a 6790 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6791 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6792 return 0;
613b411c 6793 }
e8a565cb
YW
6794
6795 safe_close(rt->netns_storage_socket[1]);
6796 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6797
613b411c
LP
6798 } else
6799 return 0;
6800
e76506b7 6801 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
e8a565cb 6802 if (rt_create) {
e76506b7 6803 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
e8a565cb 6804 if (r < 0) {
3fe91079 6805 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6806 return 0;
6807 }
613b411c 6808
e8a565cb 6809 rt_create->manager = u->manager;
613b411c 6810
e8a565cb 6811 /* Avoid cleanup */
56a13a49 6812 TAKE_PTR(rt_create);
e8a565cb 6813 }
98b47d54 6814
e8a565cb
YW
6815 return 1;
6816}
613b411c 6817
e76506b7 6818int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
56a13a49
ZJS
6819 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6820 char *id = NULL;
a70581ff 6821 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 6822 const char *p, *v = ASSERT_PTR(value);
e8a565cb 6823 size_t n;
613b411c 6824
e8a565cb 6825 assert(m);
e8a565cb 6826 assert(fds);
98b47d54 6827
e8a565cb 6828 n = strcspn(v, " ");
2f82562b 6829 id = strndupa_safe(v, n);
e8a565cb
YW
6830 if (v[n] != ' ')
6831 goto finalize;
6832 p = v + n + 1;
6833
6834 v = startswith(p, "tmp-dir=");
6835 if (v) {
6836 n = strcspn(v, " ");
56a13a49
ZJS
6837 tmp_dir = strndup(v, n);
6838 if (!tmp_dir)
6839 return log_oom();
e8a565cb
YW
6840 if (v[n] != ' ')
6841 goto finalize;
6842 p = v + n + 1;
6843 }
6844
6845 v = startswith(p, "var-tmp-dir=");
6846 if (v) {
6847 n = strcspn(v, " ");
56a13a49
ZJS
6848 var_tmp_dir = strndup(v, n);
6849 if (!var_tmp_dir)
6850 return log_oom();
e8a565cb
YW
6851 if (v[n] != ' ')
6852 goto finalize;
6853 p = v + n + 1;
6854 }
6855
6856 v = startswith(p, "netns-socket-0=");
6857 if (v) {
6858 char *buf;
6859
6860 n = strcspn(v, " ");
2f82562b 6861 buf = strndupa_safe(v, n);
c413bb28 6862
e652663a
DT
6863 netns_fdpair[0] = parse_fd(buf);
6864 if (netns_fdpair[0] < 0)
6865 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6866 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6867 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6868 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6869 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6870 if (v[n] != ' ')
6871 goto finalize;
6872 p = v + n + 1;
613b411c
LP
6873 }
6874
e8a565cb
YW
6875 v = startswith(p, "netns-socket-1=");
6876 if (v) {
6877 char *buf;
98b47d54 6878
e8a565cb 6879 n = strcspn(v, " ");
2f82562b 6880 buf = strndupa_safe(v, n);
a70581ff 6881
e652663a
DT
6882 netns_fdpair[1] = parse_fd(buf);
6883 if (netns_fdpair[1] < 0)
6884 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6885 if (!fdset_contains(fds, netns_fdpair[1]))
6886 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6887 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6888 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6889 if (v[n] != ' ')
6890 goto finalize;
6891 p = v + n + 1;
6892 }
6893
6894 v = startswith(p, "ipcns-socket-0=");
6895 if (v) {
6896 char *buf;
6897
6898 n = strcspn(v, " ");
2f82562b 6899 buf = strndupa_safe(v, n);
a70581ff 6900
e652663a
DT
6901 ipcns_fdpair[0] = parse_fd(buf);
6902 if (ipcns_fdpair[0] < 0)
6903 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
a70581ff
XR
6904 if (!fdset_contains(fds, ipcns_fdpair[0]))
6905 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6906 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6907 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6908 if (v[n] != ' ')
6909 goto finalize;
6910 p = v + n + 1;
6911 }
6912
6913 v = startswith(p, "ipcns-socket-1=");
6914 if (v) {
6915 char *buf;
6916
6917 n = strcspn(v, " ");
2f82562b 6918 buf = strndupa_safe(v, n);
a70581ff 6919
e652663a
DT
6920 ipcns_fdpair[1] = parse_fd(buf);
6921 if (ipcns_fdpair[1] < 0)
6922 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
a70581ff 6923 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6924 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6925 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6926 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6927 }
98b47d54 6928
e8a565cb 6929finalize:
e76506b7 6930 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6931 if (r < 0)
56a13a49
ZJS
6932 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6933 return 0;
e8a565cb 6934}
613b411c 6935
e76506b7
DDM
6936void exec_shared_runtime_vacuum(Manager *m) {
6937 ExecSharedRuntime *rt;
e8a565cb
YW
6938
6939 assert(m);
6940
e76506b7 6941 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
e8a565cb 6942
e76506b7 6943 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
e8a565cb
YW
6944 if (rt->n_ref > 0)
6945 continue;
6946
e52a696a 6947 (void) exec_shared_runtime_free(rt);
e8a565cb 6948 }
613b411c
LP
6949}
6950
9c0c6701
DDM
6951int exec_runtime_make(
6952 const Unit *unit,
6953 const ExecContext *context,
6954 ExecSharedRuntime *shared,
6955 DynamicCreds *creds,
6956 ExecRuntime **ret) {
6957 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
6958 _cleanup_free_ char *ephemeral = NULL;
28135da3 6959 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
9c0c6701 6960 int r;
28135da3 6961
9c0c6701
DDM
6962 assert(unit);
6963 assert(context);
28135da3
DDM
6964 assert(ret);
6965
9c0c6701 6966 if (!shared && !creds && !exec_needs_ephemeral(context)) {
28135da3
DDM
6967 *ret = NULL;
6968 return 0;
6969 }
6970
9c0c6701
DDM
6971 if (exec_needs_ephemeral(context)) {
6972 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
6973 if (r < 0)
6974 return r;
6975
6976 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
6977 if (r < 0)
6978 return r;
6979
6980 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
6981 return -errno;
6982 }
6983
28135da3
DDM
6984 rt = new(ExecRuntime, 1);
6985 if (!rt)
6986 return -ENOMEM;
6987
6988 *rt = (ExecRuntime) {
6989 .shared = shared,
15220772 6990 .dynamic_creds = creds,
9c0c6701
DDM
6991 .ephemeral_copy = TAKE_PTR(ephemeral),
6992 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
6993 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
28135da3
DDM
6994 };
6995
6996 *ret = TAKE_PTR(rt);
6997 return 1;
6998}
6999
7000ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7001 if (!rt)
7002 return NULL;
7003
7004 exec_shared_runtime_unref(rt->shared);
15220772 7005 dynamic_creds_unref(rt->dynamic_creds);
9c0c6701 7006
437f3e35 7007 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
9c0c6701 7008
9c0c6701 7009 safe_close_pair(rt->ephemeral_storage_socket);
28135da3
DDM
7010 return mfree(rt);
7011}
7012
7013ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7014 if (!rt)
7015 return NULL;
7016
7017 rt->shared = exec_shared_runtime_destroy(rt->shared);
15220772 7018 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
28135da3
DDM
7019 return exec_runtime_free(rt);
7020}
7021
b9c04eaf
YW
7022void exec_params_clear(ExecParameters *p) {
7023 if (!p)
7024 return;
7025
c3f8a065
LP
7026 p->environment = strv_free(p->environment);
7027 p->fd_names = strv_free(p->fd_names);
7028 p->fds = mfree(p->fds);
7029 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
7030}
7031
211a3d87
LB
7032void exec_directory_done(ExecDirectory *d) {
7033 if (!d)
7034 return;
7035
7036 for (size_t i = 0; i < d->n_items; i++) {
7037 free(d->items[i].path);
7038 strv_free(d->items[i].symlinks);
7039 }
7040
7041 d->items = mfree(d->items);
7042 d->n_items = 0;
7043 d->mode = 0755;
7044}
7045
564e5c98
YW
7046static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7047 assert(d);
7048 assert(path);
7049
7050 for (size_t i = 0; i < d->n_items; i++)
7051 if (path_equal(d->items[i].path, path))
7052 return &d->items[i];
7053
7054 return NULL;
7055}
7056
7057int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
211a3d87
LB
7058 _cleanup_strv_free_ char **s = NULL;
7059 _cleanup_free_ char *p = NULL;
564e5c98
YW
7060 ExecDirectoryItem *existing;
7061 int r;
211a3d87
LB
7062
7063 assert(d);
211a3d87
LB
7064 assert(path);
7065
564e5c98
YW
7066 existing = exec_directory_find(d, path);
7067 if (existing) {
7068 r = strv_extend(&existing->symlinks, symlink);
7069 if (r < 0)
7070 return r;
7071
7072 return 0; /* existing item is updated */
7073 }
7074
211a3d87
LB
7075 p = strdup(path);
7076 if (!p)
7077 return -ENOMEM;
7078
564e5c98
YW
7079 if (symlink) {
7080 s = strv_new(symlink);
211a3d87
LB
7081 if (!s)
7082 return -ENOMEM;
7083 }
7084
564e5c98 7085 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
211a3d87
LB
7086 return -ENOMEM;
7087
564e5c98 7088 d->items[d->n_items++] = (ExecDirectoryItem) {
211a3d87
LB
7089 .path = TAKE_PTR(p),
7090 .symlinks = TAKE_PTR(s),
7091 };
7092
564e5c98 7093 return 1; /* new item is added */
211a3d87
LB
7094}
7095
a2ab603c
YW
7096static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7097 assert(a);
7098 assert(b);
7099
7100 return path_compare(a->path, b->path);
7101}
7102
7103void exec_directory_sort(ExecDirectory *d) {
7104 assert(d);
7105
7106 /* Sort the exec directories to make always parent directories processed at first in
7107 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7108 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7109 * list. See also comments in setup_exec_directory() and issue #24783. */
7110
7111 if (d->n_items <= 1)
7112 return;
7113
7114 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7115
7116 for (size_t i = 1; i < d->n_items; i++)
7117 for (size_t j = 0; j < i; j++)
7118 if (path_startswith(d->items[i].path, d->items[j].path)) {
7119 d->items[i].only_create = true;
7120 break;
7121 }
211a3d87
LB
7122}
7123
4fb8f1e8
LP
7124ExecCleanMask exec_clean_mask_from_string(const char *s) {
7125 ExecDirectoryType t;
7126
7127 assert(s);
7128
7129 if (streq(s, "all"))
7130 return EXEC_CLEAN_ALL;
7131 if (streq(s, "fdstore"))
7132 return EXEC_CLEAN_FDSTORE;
7133
7134 t = exec_resource_type_from_string(s);
7135 if (t < 0)
7136 return (ExecCleanMask) t;
7137
7138 return 1U << t;
7139}
7140
80876c20
LP
7141static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7142 [EXEC_INPUT_NULL] = "null",
7143 [EXEC_INPUT_TTY] = "tty",
7144 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7145 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7146 [EXEC_INPUT_SOCKET] = "socket",
7147 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7148 [EXEC_INPUT_DATA] = "data",
2038c3f5 7149 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7150};
7151
8a0867d6
LP
7152DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7153
94f04347 7154static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7155 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7156 [EXEC_OUTPUT_NULL] = "null",
80876c20 7157 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7158 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7159 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7160 [EXEC_OUTPUT_JOURNAL] = "journal",
7161 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7162 [EXEC_OUTPUT_SOCKET] = "socket",
7163 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7164 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7165 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7166 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7167};
7168
7169DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7170
7171static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7172 [EXEC_UTMP_INIT] = "init",
7173 [EXEC_UTMP_LOGIN] = "login",
7174 [EXEC_UTMP_USER] = "user",
7175};
7176
7177DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7178
7179static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7180 [EXEC_PRESERVE_NO] = "no",
7181 [EXEC_PRESERVE_YES] = "yes",
7182 [EXEC_PRESERVE_RESTART] = "restart",
7183};
7184
7185DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7186
6b7b2ed9 7187/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7188static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7189 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7190 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7191 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7192 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7193 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7194};
7195
7196DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7197
211a3d87
LB
7198/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7199static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7200 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7201 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7202 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7203 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7204 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7205};
7206
7207DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7208
6b7b2ed9
LP
7209/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7210 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7211 * directories, specifically .timer units with their timestamp touch file. */
7212static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7213 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7214 [EXEC_DIRECTORY_STATE] = "state",
7215 [EXEC_DIRECTORY_CACHE] = "cache",
7216 [EXEC_DIRECTORY_LOGS] = "logs",
7217 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7218};
7219
7220DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7221
7222/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7223 * the service payload in. */
fb2042dd
YW
7224static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7225 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7226 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7227 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7228 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7229 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7230};
7231
7232DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7233
b1edf445
LP
7234static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7235 [EXEC_KEYRING_INHERIT] = "inherit",
7236 [EXEC_KEYRING_PRIVATE] = "private",
7237 [EXEC_KEYRING_SHARED] = "shared",
7238};
7239
7240DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);