]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
core: ensure execute/spawn functions can work without manager object
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
ac8db36c 7#include <sys/file.h>
f5947a5e 8#include <sys/ioctl.h>
f3e43635 9#include <sys/mman.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
9c0c6701
DDM
18#include <linux/fs.h> /* Must be included after <sys/mount.h> */
19
349cc4a5 20#if HAVE_PAM
5b6319dc
LP
21#include <security/pam_appl.h>
22#endif
23
349cc4a5 24#if HAVE_SELINUX
7b52a628
MS
25#include <selinux/selinux.h>
26#endif
27
349cc4a5 28#if HAVE_APPARMOR
eef65bf3
MS
29#include <sys/apparmor.h>
30#endif
31
24882e06 32#include "sd-messages.h"
8dd4c05b
LP
33
34#include "af-list.h"
b5efdb8a 35#include "alloc-util.h"
349cc4a5 36#if HAVE_APPARMOR
3ffd4af2
LP
37#include "apparmor-util.h"
38#endif
ee617a4e 39#include "argv-util.h"
8dd4c05b
LP
40#include "async.h"
41#include "barrier.h"
b1994387 42#include "bpf-lsm.h"
9c0c6701 43#include "btrfs-util.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
9c0c6701 46#include "chattr-util.h"
fdb3deca 47#include "cgroup-setup.h"
f461a28d 48#include "chase.h"
bb0c0d6f 49#include "chown-recursive.h"
28db6fbf 50#include "constants.h"
da681e1b 51#include "cpu-set-util.h"
6a818c3c 52#include "data-fd-util.h"
686d13b9 53#include "env-file.h"
4d1a6904 54#include "env-util.h"
17df7223 55#include "errno-list.h"
8a62620e 56#include "escape.h"
43962c30 57#include "exec-credential.h"
3ffd4af2 58#include "execute.h"
8dd4c05b 59#include "exit-status.h"
3ffd4af2 60#include "fd-util.h"
154eb43f 61#include "fileio.h"
f97b34a6 62#include "format-util.h"
7d50b32a 63#include "glob-util.h"
0389f4fa 64#include "hexdecoct.h"
c004493c 65#include "io-util.h"
032b3afb 66#include "ioprio-util.h"
9c0c6701 67#include "lock-util.h"
8dd4c05b
LP
68#include "log.h"
69#include "macro.h"
e8a565cb 70#include "manager.h"
2a341bb9 71#include "manager-dump.h"
0a970718 72#include "memory-util.h"
f5947a5e 73#include "missing_fs.h"
5bead76e 74#include "missing_ioprio.h"
7a114ed4 75#include "missing_prctl.h"
35cd0ba5 76#include "mkdir-label.h"
8dd4c05b 77#include "namespace.h"
6bedfcbb 78#include "parse-util.h"
8dd4c05b 79#include "path-util.h"
4d62ee55 80#include "proc-cmdline.h"
0b452006 81#include "process-util.h"
6bb00842 82#include "psi-util.h"
78f22b97 83#include "rlimit-util.h"
8dd4c05b 84#include "rm-rf.h"
3ffd4af2 85#include "seccomp-util.h"
07d46372 86#include "securebits-util.h"
8dd4c05b 87#include "selinux-util.h"
24882e06 88#include "signal-util.h"
8dd4c05b 89#include "smack-util.h"
57b7a260 90#include "socket-util.h"
a2ab603c 91#include "sort-util.h"
fd63e712 92#include "special.h"
949befd3 93#include "stat-util.h"
8b43440b 94#include "string-table.h"
07630cea 95#include "string-util.h"
8dd4c05b 96#include "strv.h"
7ccbd1ae 97#include "syslog-util.h"
8dd4c05b 98#include "terminal-util.h"
bb0c0d6f 99#include "tmpfile-util.h"
566b7d23 100#include "umask-util.h"
2d3b784d 101#include "unit-serialize.h"
b1d4f8e1 102#include "user-util.h"
8dd4c05b 103#include "utmp-wtmp.h"
5cb5a6ff 104
e056b01d 105#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 106#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 107
531dca78
LP
108#define SNDBUF_SIZE (8*1024*1024)
109
/* Rearranges the passed file descriptors so that fds[k] ends up being fd number k+3. The array is
 * modified in place. Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        do {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        const int wanted = idx + 3;
                        int moved;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        moved = fcntl(fds[idx], F_DUPFD, wanted);
                        if (moved < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = moved;

                        /* F_DUPFD hands out the lowest free fd >= wanted. If the slot was still
                         * occupied we got something else; remember the first such index and make
                         * another pass starting there. */
                        if (retry_at < 0 && moved != wanted)
                                retry_at = idx;
                }

                first = retry_at;
        } while (first >= 0);

        return 0;
}
149
cd48e23f
RP
/* Adjusts the file flags of the passed fds: the first n_socket_fds entries get O_NONBLOCK set or
 * cleared according to 'nonblock', and FD_CLOEXEC is dropped from all n_fds entries. Returns 0 on
 * success, or the negative error of the first helper that fails. */
static int flags_fds(
                const int fds[],
                size_t n_socket_fds,
                size_t n_fds,
                bool nonblock) {

        if (n_fds == 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < n_fds; idx++) {
                int rc;

                /* O_NONBLOCK is only touched for the socket activation fds */
                if (idx < n_socket_fds) {
                        rc = fd_nonblock(fds[idx], nonblock);
                        if (rc < 0)
                                return rc;
                }

                /* FD_CLOEXEC is always turned off, since these fds are meant to be passed on to
                 * our children */
                rc = fd_cloexec(fds[idx], false);
                if (rc < 0)
                        return rc;
        }

        return 0;
}
185
1e22b5cd 186static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
187 assert(context);
188
1e22b5cd
LP
189 if (context->stdio_as_fds)
190 return NULL;
191
80876c20
LP
192 if (context->tty_path)
193 return context->tty_path;
194
195 return "/dev/console";
196}
197
4d62ee55 198static int exec_context_tty_size(const ExecContext *context, unsigned *ret_rows, unsigned *ret_cols) {
4d62ee55
DDM
199 unsigned rows, cols;
200 const char *tty;
4d62ee55
DDM
201
202 assert(context);
203 assert(ret_rows);
204 assert(ret_cols);
205
206 rows = context->tty_rows;
207 cols = context->tty_cols;
208
209 tty = exec_context_tty_path(context);
29f5a5ae
DDM
210 if (tty)
211 (void) proc_cmdline_tty_size(tty, rows == UINT_MAX ? &rows : NULL, cols == UINT_MAX ? &cols : NULL);
4d62ee55
DDM
212
213 *ret_rows = rows;
214 *ret_cols = cols;
215
216 return 0;
217}
218
1e22b5cd 219static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
a0043bfa
ZJS
220 _cleanup_close_ int fd = -EBADF;
221 const char *path = exec_context_tty_path(ASSERT_PTR(context));
1e22b5cd 222
a0043bfa
ZJS
223 /* Take a lock around the device for the duration of the setup that we do here.
224 * systemd-vconsole-setup.service also takes the lock to avoid being interrupted.
225 * We open a new fd that will be closed automatically, and operate on it for convenience.
226 */
6ea832a2 227
a0043bfa
ZJS
228 if (p && p->stdin_fd >= 0) {
229 fd = xopenat_lock(p->stdin_fd, NULL,
230 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY, 0, 0, LOCK_BSD, LOCK_EX);
231 if (fd < 0)
232 return;
233 } else if (path) {
234 fd = open_terminal(path, O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
235 if (fd < 0)
236 return;
6ea832a2 237
a0043bfa
ZJS
238 if (lock_generic(fd, LOCK_BSD, LOCK_EX) < 0)
239 return;
240 } else
241 return; /* nothing to do */
6ea832a2 242
a0043bfa
ZJS
243 if (context->tty_vhangup)
244 (void) terminal_vhangup_fd(fd);
245
246 if (context->tty_reset)
247 (void) reset_terminal_fd(fd, true);
1e22b5cd 248
4d62ee55
DDM
249 if (p && p->stdin_fd >= 0) {
250 unsigned rows = context->tty_rows, cols = context->tty_cols;
251
252 (void) exec_context_tty_size(context, &rows, &cols);
253 (void) terminal_set_size_fd(p->stdin_fd, path, rows, cols);
254 }
51462135 255
1e22b5cd
LP
256 if (context->tty_vt_disallocate && path)
257 (void) vt_disallocate(path);
6ea832a2
LP
258}
259
6af760f3
LP
260static bool is_terminal_input(ExecInput i) {
261 return IN_SET(i,
262 EXEC_INPUT_TTY,
263 EXEC_INPUT_TTY_FORCE,
264 EXEC_INPUT_TTY_FAIL);
265}
266
3a1286b6 267static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
268 return IN_SET(o,
269 EXEC_OUTPUT_TTY,
6af760f3
LP
270 EXEC_OUTPUT_KMSG_AND_CONSOLE,
271 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
272}
273
aac8c0c3
LP
274static bool is_kmsg_output(ExecOutput o) {
275 return IN_SET(o,
276 EXEC_OUTPUT_KMSG,
277 EXEC_OUTPUT_KMSG_AND_CONSOLE);
278}
279
6af760f3
LP
280static bool exec_context_needs_term(const ExecContext *c) {
281 assert(c);
282
283 /* Return true if the execution context suggests we should set $TERM to something useful. */
284
285 if (is_terminal_input(c->std_input))
286 return true;
287
288 if (is_terminal_output(c->std_output))
289 return true;
290
291 if (is_terminal_output(c->std_error))
292 return true;
293
294 return !!c->tty_path;
3a1286b6
MS
295}
296
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the resulting fd to 'nfd'.
 * Returns -errno if the open fails, otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
308
91dd5f7c
LP
309static int connect_journal_socket(
310 int fd,
311 const char *log_namespace,
312 uid_t uid,
313 gid_t gid) {
314
524daa8c
ZJS
315 uid_t olduid = UID_INVALID;
316 gid_t oldgid = GID_INVALID;
91dd5f7c 317 const char *j;
524daa8c
ZJS
318 int r;
319
91dd5f7c
LP
320 j = log_namespace ?
321 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
322 "/run/systemd/journal/stdout";
91dd5f7c 323
cad93f29 324 if (gid_is_valid(gid)) {
524daa8c
ZJS
325 oldgid = getgid();
326
92a17af9 327 if (setegid(gid) < 0)
524daa8c
ZJS
328 return -errno;
329 }
330
cad93f29 331 if (uid_is_valid(uid)) {
524daa8c
ZJS
332 olduid = getuid();
333
92a17af9 334 if (seteuid(uid) < 0) {
524daa8c
ZJS
335 r = -errno;
336 goto restore_gid;
337 }
338 }
339
1861986a 340 r = connect_unix_path(fd, AT_FDCWD, j);
524daa8c 341
1861986a
LP
342 /* If we fail to restore the uid or gid, things will likely fail later on. This should only happen if
343 an LSM interferes. */
524daa8c 344
cad93f29 345 if (uid_is_valid(uid))
524daa8c
ZJS
346 (void) seteuid(olduid);
347
348 restore_gid:
cad93f29 349 if (gid_is_valid(gid))
524daa8c
ZJS
350 (void) setegid(oldgid);
351
352 return r;
353}
354
fd1f9c89 355static int connect_logger_as(
34cf6c43 356 const Unit *unit,
fd1f9c89 357 const ExecContext *context,
af635cf3 358 const ExecParameters *params,
fd1f9c89
LP
359 ExecOutput output,
360 const char *ident,
fd1f9c89
LP
361 int nfd,
362 uid_t uid,
363 gid_t gid) {
364
254d1313 365 _cleanup_close_ int fd = -EBADF;
2ac1ff68 366 int r;
071830ff
LP
367
368 assert(context);
af635cf3 369 assert(params);
80876c20
LP
370 assert(output < _EXEC_OUTPUT_MAX);
371 assert(ident);
372 assert(nfd >= 0);
071830ff 373
54fe0cdb
LP
374 fd = socket(AF_UNIX, SOCK_STREAM, 0);
375 if (fd < 0)
80876c20 376 return -errno;
071830ff 377
91dd5f7c 378 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
379 if (r < 0)
380 return r;
071830ff 381
2ac1ff68 382 if (shutdown(fd, SHUT_RD) < 0)
80876c20 383 return -errno;
071830ff 384
fd1f9c89 385 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 386
2ac1ff68 387 if (dprintf(fd,
62bca2c6 388 "%s\n"
80876c20
LP
389 "%s\n"
390 "%i\n"
54fe0cdb
LP
391 "%i\n"
392 "%i\n"
393 "%i\n"
4f4a1dbf 394 "%i\n",
c867611e 395 context->syslog_identifier ?: ident,
af635cf3 396 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
397 context->syslog_priority,
398 !!context->syslog_level_prefix,
f3dc6af2 399 false,
aac8c0c3 400 is_kmsg_output(output),
2ac1ff68
EV
401 is_terminal_output(output)) < 0)
402 return -errno;
80876c20 403
2ac1ff68 404 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 405}
2ac1ff68 406
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves the resulting fd
 * to 'nfd'. Returns the error of open_terminal() on failure, otherwise whatever move_fd() returns. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 419
2038c3f5 420static int acquire_path(const char *path, int flags, mode_t mode) {
254d1313 421 _cleanup_close_ int fd = -EBADF;
86fca584 422 int r;
071830ff 423
80876c20 424 assert(path);
071830ff 425
2038c3f5
LP
426 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
427 flags |= O_CREAT;
428
429 fd = open(path, flags|O_NOCTTY, mode);
430 if (fd >= 0)
15a3e96f 431 return TAKE_FD(fd);
071830ff 432
2038c3f5
LP
433 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
434 return -errno;
2038c3f5
LP
435
436 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
437
438 fd = socket(AF_UNIX, SOCK_STREAM, 0);
439 if (fd < 0)
440 return -errno;
441
1861986a
LP
442 r = connect_unix_path(fd, AT_FDCWD, path);
443 if (IN_SET(r, -ENOTSOCK, -EINVAL))
444 /* Propagate initial error if we get ENOTSOCK or EINVAL, i.e. we have indication that this
445 * wasn't an AF_UNIX socket after all */
446 return -ENXIO;
447 if (r < 0)
448 return r;
071830ff 449
2038c3f5
LP
450 if ((flags & O_ACCMODE) == O_RDONLY)
451 r = shutdown(fd, SHUT_WR);
452 else if ((flags & O_ACCMODE) == O_WRONLY)
453 r = shutdown(fd, SHUT_RD);
454 else
86fca584 455 r = 0;
15a3e96f 456 if (r < 0)
2038c3f5 457 return -errno;
2038c3f5 458
15a3e96f 459 return TAKE_FD(fd);
80876c20 460}
071830ff 461
08f3be7a
LP
462static int fixup_input(
463 const ExecContext *context,
464 int socket_fd,
465 bool apply_tty_stdin) {
466
467 ExecInput std_input;
468
469 assert(context);
470
471 std_input = context->std_input;
1e3ad081
LP
472
473 if (is_terminal_input(std_input) && !apply_tty_stdin)
474 return EXEC_INPUT_NULL;
071830ff 475
03fd9c49 476 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
477 return EXEC_INPUT_NULL;
478
08f3be7a
LP
479 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
480 return EXEC_INPUT_NULL;
481
03fd9c49 482 return std_input;
4f2d528d
LP
483}
484
7966a916 485static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 486
7966a916 487 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
488 return EXEC_OUTPUT_INHERIT;
489
7966a916 490 return output;
4f2d528d
LP
491}
492
a34ceba6
LP
493static int setup_input(
494 const ExecContext *context,
495 const ExecParameters *params,
52c239d7 496 int socket_fd,
2caa38e9 497 const int named_iofds[static 3]) {
a34ceba6 498
4f2d528d 499 ExecInput i;
51462135 500 int r;
4f2d528d
LP
501
502 assert(context);
a34ceba6 503 assert(params);
2caa38e9 504 assert(named_iofds);
a34ceba6
LP
505
506 if (params->stdin_fd >= 0) {
507 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
508 return -errno;
509
510 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e 511 if (isatty(STDIN_FILENO)) {
4d62ee55
DDM
512 unsigned rows = context->tty_rows, cols = context->tty_cols;
513
514 (void) exec_context_tty_size(context, &rows, &cols);
1fb0682e
LP
515 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
516 (void) reset_terminal_fd(STDIN_FILENO, true);
4d62ee55 517 (void) terminal_set_size_fd(STDIN_FILENO, NULL, rows, cols);
1fb0682e 518 }
a34ceba6
LP
519
520 return STDIN_FILENO;
521 }
4f2d528d 522
08f3be7a 523 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
524
525 switch (i) {
071830ff 526
80876c20
LP
527 case EXEC_INPUT_NULL:
528 return open_null_as(O_RDONLY, STDIN_FILENO);
529
530 case EXEC_INPUT_TTY:
531 case EXEC_INPUT_TTY_FORCE:
532 case EXEC_INPUT_TTY_FAIL: {
4d62ee55 533 unsigned rows, cols;
046a82c1 534 int fd;
071830ff 535
1e22b5cd 536 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
537 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
538 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
539 ACQUIRE_TERMINAL_WAIT,
3a43da28 540 USEC_INFINITY);
970edce6 541 if (fd < 0)
80876c20
LP
542 return fd;
543
4d62ee55
DDM
544 r = exec_context_tty_size(context, &rows, &cols);
545 if (r < 0)
546 return r;
547
548 r = terminal_set_size_fd(fd, exec_context_tty_path(context), rows, cols);
51462135
DDM
549 if (r < 0)
550 return r;
551
046a82c1 552 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
553 }
554
4f2d528d 555 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
556 assert(socket_fd >= 0);
557
7c248223 558 return RET_NERRNO(dup2(socket_fd, STDIN_FILENO));
4f2d528d 559
52c239d7 560 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
561 assert(named_iofds[STDIN_FILENO] >= 0);
562
52c239d7 563 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
7c248223 564 return RET_NERRNO(dup2(named_iofds[STDIN_FILENO], STDIN_FILENO));
52c239d7 565
08f3be7a
LP
566 case EXEC_INPUT_DATA: {
567 int fd;
568
569 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
570 if (fd < 0)
571 return fd;
572
573 return move_fd(fd, STDIN_FILENO, false);
574 }
575
2038c3f5
LP
576 case EXEC_INPUT_FILE: {
577 bool rw;
578 int fd;
579
580 assert(context->stdio_file[STDIN_FILENO]);
581
582 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
583 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
584
585 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
586 if (fd < 0)
587 return fd;
588
589 return move_fd(fd, STDIN_FILENO, false);
590 }
591
80876c20 592 default:
04499a70 593 assert_not_reached();
80876c20
LP
594 }
595}
596
41fc585a
LP
597static bool can_inherit_stderr_from_stdout(
598 const ExecContext *context,
599 ExecOutput o,
600 ExecOutput e) {
601
602 assert(context);
603
604 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
605 * stderr fd */
606
607 if (e == EXEC_OUTPUT_INHERIT)
608 return true;
609 if (e != o)
610 return false;
611
612 if (e == EXEC_OUTPUT_NAMED_FD)
613 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
614
8d7dab1f 615 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
616 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
617
618 return true;
619}
620
a34ceba6 621static int setup_output(
34cf6c43 622 const Unit *unit,
a34ceba6
LP
623 const ExecContext *context,
624 const ExecParameters *params,
625 int fileno,
626 int socket_fd,
2caa38e9 627 const int named_iofds[static 3],
a34ceba6 628 const char *ident,
7bce046b
LP
629 uid_t uid,
630 gid_t gid,
631 dev_t *journal_stream_dev,
632 ino_t *journal_stream_ino) {
a34ceba6 633
4f2d528d
LP
634 ExecOutput o;
635 ExecInput i;
47c1d80d 636 int r;
4f2d528d 637
f2341e0a 638 assert(unit);
80876c20 639 assert(context);
a34ceba6 640 assert(params);
80876c20 641 assert(ident);
7bce046b
LP
642 assert(journal_stream_dev);
643 assert(journal_stream_ino);
80876c20 644
a34ceba6
LP
645 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
646
647 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
648 return -errno;
649
650 return STDOUT_FILENO;
651 }
652
653 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
654 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
655 return -errno;
656
657 return STDERR_FILENO;
658 }
659
08f3be7a 660 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 661 o = fixup_output(context->std_output, socket_fd);
4f2d528d 662
eb17e935
MS
663 if (fileno == STDERR_FILENO) {
664 ExecOutput e;
665 e = fixup_output(context->std_error, socket_fd);
80876c20 666
eb17e935
MS
667 /* This expects the input and output are already set up */
668
669 /* Don't change the stderr file descriptor if we inherit all
670 * the way and are not on a tty */
671 if (e == EXEC_OUTPUT_INHERIT &&
672 o == EXEC_OUTPUT_INHERIT &&
673 i == EXEC_INPUT_NULL &&
674 !is_terminal_input(context->std_input) &&
7966a916 675 getppid() != 1)
eb17e935
MS
676 return fileno;
677
678 /* Duplicate from stdout if possible */
41fc585a 679 if (can_inherit_stderr_from_stdout(context, o, e))
7c248223 680 return RET_NERRNO(dup2(STDOUT_FILENO, fileno));
071830ff 681
eb17e935 682 o = e;
80876c20 683
eb17e935 684 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
685 /* If input got downgraded, inherit the original value */
686 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 687 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 688
08f3be7a
LP
689 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
690 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
7c248223 691 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
071830ff 692
acb591e4
LP
693 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
694 if (getppid() != 1)
eb17e935 695 return fileno;
94f04347 696
eb17e935
MS
697 /* We need to open /dev/null here anew, to get the right access mode. */
698 return open_null_as(O_WRONLY, fileno);
071830ff 699 }
94f04347 700
eb17e935 701 switch (o) {
80876c20
LP
702
703 case EXEC_OUTPUT_NULL:
eb17e935 704 return open_null_as(O_WRONLY, fileno);
80876c20
LP
705
706 case EXEC_OUTPUT_TTY:
4f2d528d 707 if (is_terminal_input(i))
7c248223 708 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
80876c20
LP
709
710 /* We don't reset the terminal if this is just about output */
1e22b5cd 711 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 712
9a6bca7a 713 case EXEC_OUTPUT_KMSG:
28dbc1e8 714 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
715 case EXEC_OUTPUT_JOURNAL:
716 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 717 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 718 if (r < 0) {
7966a916
ZJS
719 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
720 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 721 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
722 } else {
723 struct stat st;
724
725 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
726 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
727 * services to detect whether they are connected to the journal or not.
728 *
729 * If both stdout and stderr are connected to a stream then let's make sure to store the data
730 * about STDERR as that's usually the best way to do logging. */
7bce046b 731
ab2116b1
LP
732 if (fstat(fileno, &st) >= 0 &&
733 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
734 *journal_stream_dev = st.st_dev;
735 *journal_stream_ino = st.st_ino;
736 }
47c1d80d
MS
737 }
738 return r;
4f2d528d
LP
739
740 case EXEC_OUTPUT_SOCKET:
741 assert(socket_fd >= 0);
e75a9ed1 742
7c248223 743 return RET_NERRNO(dup2(socket_fd, fileno));
94f04347 744
52c239d7 745 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
746 assert(named_iofds[fileno] >= 0);
747
52c239d7 748 (void) fd_nonblock(named_iofds[fileno], false);
7c248223 749 return RET_NERRNO(dup2(named_iofds[fileno], fileno));
52c239d7 750
566b7d23 751 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
752 case EXEC_OUTPUT_FILE_APPEND:
753 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 754 bool rw;
566b7d23 755 int fd, flags;
2038c3f5
LP
756
757 assert(context->stdio_file[fileno]);
758
759 rw = context->std_input == EXEC_INPUT_FILE &&
760 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
761
762 if (rw)
7c248223 763 return RET_NERRNO(dup2(STDIN_FILENO, fileno));
2038c3f5 764
566b7d23
ZD
765 flags = O_WRONLY;
766 if (o == EXEC_OUTPUT_FILE_APPEND)
767 flags |= O_APPEND;
8d7dab1f
LW
768 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
769 flags |= O_TRUNC;
566b7d23
ZD
770
771 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
772 if (fd < 0)
773 return fd;
774
566b7d23 775 return move_fd(fd, fileno, 0);
2038c3f5
LP
776 }
777
94f04347 778 default:
04499a70 779 assert_not_reached();
94f04347 780 }
071830ff
LP
781}
782
02a51aba 783static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 784 int r;
02a51aba
LP
785
786 assert(fd >= 0);
02a51aba 787
1ff74fb6 788 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
789 if (isatty(fd) < 1) {
790 if (IN_SET(errno, EINVAL, ENOTTY))
791 return 0; /* not a tty */
1ff74fb6 792
02a51aba 793 return -errno;
4b3b5bc7 794 }
02a51aba 795
4b3b5bc7 796 /* This might fail. What matters are the results. */
f2df231f 797 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
798 if (r < 0)
799 return r;
02a51aba 800
4b3b5bc7 801 return 1;
02a51aba
LP
802}
803
aedec452 804static int setup_confirm_stdio(
51462135 805 const ExecContext *context,
aedec452
LP
806 const char *vc,
807 int *ret_saved_stdin,
808 int *ret_saved_stdout) {
809
254d1313 810 _cleanup_close_ int fd = -EBADF, saved_stdin = -EBADF, saved_stdout = -EBADF;
4d62ee55 811 unsigned rows, cols;
3d18b167 812 int r;
80876c20 813
aedec452
LP
814 assert(ret_saved_stdin);
815 assert(ret_saved_stdout);
80876c20 816
af6da548
LP
817 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
818 if (saved_stdin < 0)
819 return -errno;
80876c20 820
af6da548 821 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
822 if (saved_stdout < 0)
823 return -errno;
80876c20 824
8854d795 825 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
826 if (fd < 0)
827 return fd;
80876c20 828
af6da548
LP
829 r = chown_terminal(fd, getuid());
830 if (r < 0)
3d18b167 831 return r;
02a51aba 832
3d18b167
LP
833 r = reset_terminal_fd(fd, true);
834 if (r < 0)
835 return r;
80876c20 836
4d62ee55
DDM
837 r = exec_context_tty_size(context, &rows, &cols);
838 if (r < 0)
839 return r;
840
841 r = terminal_set_size_fd(fd, vc, rows, cols);
51462135
DDM
842 if (r < 0)
843 return r;
844
aedec452
LP
845 r = rearrange_stdio(fd, fd, STDERR_FILENO); /* Invalidates 'fd' also on failure */
846 TAKE_FD(fd);
2b33ab09
LP
847 if (r < 0)
848 return r;
80876c20 849
aedec452
LP
850 *ret_saved_stdin = TAKE_FD(saved_stdin);
851 *ret_saved_stdout = TAKE_FD(saved_stdout);
3d18b167 852 return 0;
80876c20
LP
853}
854
63d77c92 855static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
856 assert(err < 0);
857
858 if (err == -ETIMEDOUT)
63d77c92 859 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
860 else {
861 errno = -err;
63d77c92 862 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
863 }
864}
865
63d77c92 866static void write_confirm_error(int err, const char *vc, const Unit *u) {
254d1313 867 _cleanup_close_ int fd = -EBADF;
80876c20 868
3b20f877 869 assert(vc);
80876c20 870
7d5ceb64 871 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 872 if (fd < 0)
3b20f877 873 return;
80876c20 874
63d77c92 875 write_confirm_error_fd(err, fd, u);
af6da548 876}
80876c20 877
/* Releases the terminal and puts the saved stdin/stdout fds back in place (if they are valid). Both
 * saved fds are closed and invalidated in any case. Returns 0 on success, or the negative errno of
 * the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
899
3b20f877
FB
/* Possible outcomes of asking the user for confirmation before spawning a command */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute, pretend the command failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute, pretend the command succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
905
51462135 906static int ask_for_confirmation(const ExecContext *context, const char *vc, Unit *u, const char *cmdline) {
af6da548 907 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 908 _cleanup_free_ char *e = NULL;
3b20f877 909 char c;
af6da548 910
3b20f877 911 /* For any internal errors, assume a positive response. */
51462135 912 r = setup_confirm_stdio(context, vc, &saved_stdin, &saved_stdout);
3b20f877 913 if (r < 0) {
63d77c92 914 write_confirm_error(r, vc, u);
3b20f877
FB
915 return CONFIRM_EXECUTE;
916 }
af6da548 917
b0eb2944
FB
918 /* confirm_spawn might have been disabled while we were sleeping. */
919 if (manager_is_confirm_spawn_disabled(u->manager)) {
920 r = 1;
921 goto restore_stdio;
922 }
af6da548 923
2bcd3c26
FB
924 e = ellipsize(cmdline, 60, 100);
925 if (!e) {
926 log_oom();
927 r = CONFIRM_EXECUTE;
928 goto restore_stdio;
929 }
af6da548 930
d172b175 931 for (;;) {
539622bd 932 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 933 if (r < 0) {
63d77c92 934 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
935 r = CONFIRM_EXECUTE;
936 goto restore_stdio;
937 }
af6da548 938
d172b175 939 switch (c) {
b0eb2944
FB
940 case 'c':
941 printf("Resuming normal execution.\n");
942 manager_disable_confirm_spawn();
943 r = 1;
944 break;
dd6f9ac0
FB
945 case 'D':
946 unit_dump(u, stdout, " ");
947 continue; /* ask again */
d172b175
FB
948 case 'f':
949 printf("Failing execution.\n");
950 r = CONFIRM_PRETEND_FAILURE;
951 break;
952 case 'h':
b0eb2944
FB
953 printf(" c - continue, proceed without asking anymore\n"
954 " D - dump, show the state of the unit\n"
dd6f9ac0 955 " f - fail, don't execute the command and pretend it failed\n"
d172b175 956 " h - help\n"
eedf223a 957 " i - info, show a short summary of the unit\n"
56fde33a 958 " j - jobs, show jobs that are in progress\n"
d172b175
FB
959 " s - skip, don't execute the command and pretend it succeeded\n"
960 " y - yes, execute the command\n");
dd6f9ac0 961 continue; /* ask again */
eedf223a
FB
962 case 'i':
963 printf(" Description: %s\n"
964 " Unit: %s\n"
965 " Command: %s\n",
966 u->id, u->description, cmdline);
967 continue; /* ask again */
56fde33a 968 case 'j':
d1d8786c 969 manager_dump_jobs(u->manager, stdout, /* patterns= */ NULL, " ");
56fde33a 970 continue; /* ask again */
539622bd
FB
971 case 'n':
972 /* 'n' was removed in favor of 'f'. */
973 printf("Didn't understand 'n', did you mean 'f'?\n");
974 continue; /* ask again */
d172b175
FB
975 case 's':
976 printf("Skipping execution.\n");
977 r = CONFIRM_PRETEND_SUCCESS;
978 break;
979 case 'y':
980 r = CONFIRM_EXECUTE;
981 break;
982 default:
04499a70 983 assert_not_reached();
d172b175 984 }
3b20f877 985 break;
3b20f877 986 }
af6da548 987
3b20f877 988restore_stdio:
af6da548 989 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 990 return r;
80876c20
LP
991}
992
1c943355
MY
993static int get_fixed_user(
994 const char *username,
995 const char **ret_user,
996 uid_t *ret_uid,
997 gid_t *ret_gid,
998 const char **ret_home,
999 const char **ret_shell) {
81a2b7ce 1000
1c943355 1001 int r;
81a2b7ce 1002
1c943355
MY
1003 assert(username);
1004 assert(ret_user);
23deef88 1005
4d885bd3
DH
1006 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
1007 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 1008
1c943355 1009 r = get_user_creds(&username, ret_uid, ret_gid, ret_home, ret_shell, USER_CREDS_CLEAN);
4d885bd3
DH
1010 if (r < 0)
1011 return r;
81a2b7ce 1012
1c943355 1013 *ret_user = username;
4d885bd3
DH
1014 return 0;
1015}
1016
1c943355
MY
/* Resolves 'groupname' via get_group_creds(), filling in *ret_gid. On success the group name as
 * updated by the lookup is stored in *ret_group and 0 is returned, otherwise the negative error of
 * the lookup. */
static int get_fixed_group(
                const char *groupname,
                const char **ret_group,
                gid_t *ret_gid) {

        const char *name = groupname;
        int r;

        assert(groupname);
        assert(ret_group);

        r = get_group_creds(&name, ret_gid, /* flags = */ 0);
        if (r >= 0) {
                *ret_group = name;
                r = 0;
        }

        return r;
}
1034
cdc5d5c5
DH
1035static int get_supplementary_groups(const ExecContext *c, const char *user,
1036 const char *group, gid_t gid,
1037 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
1038 int r, k = 0;
1039 int ngroups_max;
1040 bool keep_groups = false;
1041 gid_t *groups = NULL;
1042 _cleanup_free_ gid_t *l_gids = NULL;
1043
1044 assert(c);
1045
bbeea271
DH
1046 /*
1047 * If user is given, then lookup GID and supplementary groups list.
1048 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
1049 * here and as early as possible so we keep the list of supplementary
1050 * groups of the caller.
bbeea271
DH
1051 */
1052 if (user && gid_is_valid(gid) && gid != 0) {
1053 /* First step, initialize groups from /etc/groups */
1054 if (initgroups(user, gid) < 0)
1055 return -errno;
1056
1057 keep_groups = true;
1058 }
1059
ac6e8be6 1060 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1061 return 0;
1062
366ddd25
DH
1063 /*
1064 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1065 * be positive, otherwise fail.
1066 */
1067 errno = 0;
1068 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1069 if (ngroups_max <= 0)
1070 return errno_or_else(EOPNOTSUPP);
366ddd25 1071
4d885bd3
DH
1072 l_gids = new(gid_t, ngroups_max);
1073 if (!l_gids)
1074 return -ENOMEM;
81a2b7ce 1075
4d885bd3
DH
1076 if (keep_groups) {
1077 /*
1078 * Lookup the list of groups that the user belongs to, we
1079 * avoid NSS lookups here too for gid=0.
1080 */
1081 k = ngroups_max;
1082 if (getgrouplist(user, gid, l_gids, &k) < 0)
1083 return -EINVAL;
1084 } else
1085 k = 0;
81a2b7ce 1086
4d885bd3
DH
1087 STRV_FOREACH(i, c->supplementary_groups) {
1088 const char *g;
81a2b7ce 1089
4d885bd3
DH
1090 if (k >= ngroups_max)
1091 return -E2BIG;
81a2b7ce 1092
4d885bd3 1093 g = *i;
fafff8f1 1094 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1095 if (r < 0)
1096 return r;
81a2b7ce 1097
4d885bd3
DH
1098 k++;
1099 }
81a2b7ce 1100
4d885bd3
DH
1101 /*
1102 * Sets ngids to zero to drop all supplementary groups, happens
1103 * when we are under root and SupplementaryGroups= is empty.
1104 */
1105 if (k == 0) {
1106 *ngids = 0;
1107 return 0;
1108 }
81a2b7ce 1109
4d885bd3
DH
1110 /* Otherwise get the final list of supplementary groups */
1111 groups = memdup(l_gids, sizeof(gid_t) * k);
1112 if (!groups)
1113 return -ENOMEM;
1114
1115 *supplementary_gids = groups;
1116 *ngids = k;
1117
1118 groups = NULL;
1119
1120 return 0;
1121}
1122
/* Applies group credentials to the calling process.
 *
 * If ngids is positive, the supplementary group list is installed through maybe_setgroups(). If gid is
 * valid, the real, effective and saved GID are all set to it with setresgid().
 *
 * Returns 0 on success, the negative result of maybe_setgroups(), or -errno if setresgid() fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                int r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        /* Then set our gids, but only if a usable GID was passed in */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1141
a954b249
LP
/* Updates the securebits of the calling process.
 *
 * The new value is the current securebits with every bit in 'mask' cleared and every bit in 'bits'
 * set.
 *
 * Returns 0 if the securebits already had the requested value (nothing is written), 1 if they were
 * changed, or -errno if either prctl() call fails. */
static int set_securebits(unsigned bits, unsigned mask) {
        unsigned wanted;
        int now;

        now = prctl(PR_GET_SECUREBITS);
        if (now < 0)
                return -errno;

        /* Clear all securebits defined in mask and set bits */
        wanted = ((unsigned) now & ~mask) | bits;

        /* Only issue the second prctl() if something actually changes */
        if (wanted != (unsigned) now) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1160
638fd8cc
LP
1161static int enforce_user(
1162 const ExecContext *context,
1163 uid_t uid,
1164 uint64_t capability_ambient_set) {
81a2b7ce 1165 assert(context);
dbdc4098 1166 int r;
81a2b7ce 1167
4d885bd3
DH
1168 if (!uid_is_valid(uid))
1169 return 0;
1170
a954b249
LP
1171 /* Sets (but doesn't look up) the UIS and makes sure we keep the capabilities while doing so. For
1172 * setting secure bits the capability CAP_SETPCAP is required, so we also need keep-caps in this
1173 * case. */
81a2b7ce 1174
638fd8cc 1175 if ((capability_ambient_set != 0 || context->secure_bits != 0) && uid != 0) {
81a2b7ce 1176
a954b249
LP
1177 /* First step: If we need to keep capabilities but drop privileges we need to make sure we
1178 * keep our caps, while we drop privileges. Add KEEP_CAPS to the securebits */
1179 r = set_securebits(1U << SECURE_KEEP_CAPS, 0);
1180 if (r < 0)
1181 return r;
81a2b7ce
LP
1182 }
1183
479050b3 1184 /* Second step: actually set the uids */
81a2b7ce
LP
1185 if (setresuid(uid, uid, uid) < 0)
1186 return -errno;
1187
a954b249
LP
1188 /* At this point we should have all necessary capabilities but are otherwise a normal user. However,
1189 * the caps might got corrupted due to the setresuid() so we need clean them up later. This is done
1190 * outside of this call. */
81a2b7ce
LP
1191 return 0;
1192}
1193
349cc4a5 1194#if HAVE_PAM
5b6319dc
LP
1195
1196static int null_conv(
1197 int num_msg,
1198 const struct pam_message **msg,
1199 struct pam_response **resp,
1200 void *appdata_ptr) {
1201
1202 /* We don't support conversations */
1203
1204 return PAM_CONV_ERR;
1205}
1206
cefc33ae
LP
1207#endif
1208
5b6319dc
LP
/* Opens a PAM session for the service about to be executed and forks off a helper process
 * ("(sd-pam)") that closes the session again once the calling process goes away.
 *
 * Parameters:
 *   name    PAM service name, passed to pam_start().
 *   user    user name, passed to pam_start().
 *   uid/gid credentials the helper process drops to before waiting.
 *   tty     TTY path for PAM_TTY; if NULL, the TTY of stdin is used when stdin is a TTY.
 *   env     in: environment exported into PAM; out (on success): replaced by PAM's environment list.
 *   fds     file descriptors (n_fds entries) that are closed in the helper process.
 *
 * Returns the result of strv_free_and_replace() on success. On failure returns -EPERM if a PAM call
 * failed, or the negative errno-style error of barrier_create()/safe_fork(). When built without PAM
 * support this is a no-op returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env, /* updated on success */
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        _cleanup_strv_free_ char **e = NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the environment we have so far into the PAM handle */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child was created: undo the SIGTERM blocking from above, so that we don't return
                 * to the caller with a modified signal mask. */
                (void) sigprocmask(SIG_SETMASK, &old_ss, NULL);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session and this will make
                 * PR_SET_PDEATHSIG work in most cases. If this fails, ignore the error - but expect sd-pam
                 * threads to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if the above setresuid() succeeds,
                 * otherwise the kernel will not allow unprivileged parents kill their privileged children
                 * this way. We rely on the control groups kill logic to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially important regarding dropping
                 * privileges. Otherwise, unit setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                /* sigwait() reports failure by returning a positive error number; it
                                 * neither returns a negative value nor sets errno. Hence check the
                                 * return value itself. */
                                r = sigwait(&ss, &sig);
                                if (r == EINTR)
                                        continue;
                                if (r != 0)
                                        goto child_finish;

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                /* NB: pam_end() when called in child processes should set PAM_DATA_SILENT to let the module
                 * know about this. See pam_end(3) */
                (void) pam_end(handle, pam_code | flags | PAM_DATA_SILENT);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the cleanups, so forget about the handle
         * here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules might have opened it, but we don't want
         * this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for errors as we cannot
         * recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        /* Either a PAM call failed (pam_code is set), or a non-PAM step failed (r holds the error) */
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                (void) pam_end(handle, pam_code | flags);
        }

        closelog();
        return r;
#else
        return 0;
#endif
}
5b6319dc 1425
5d6b1584 1426static void rename_process_from_path(const char *path) {
a99626c1 1427 _cleanup_free_ char *buf = NULL;
5d6b1584 1428 const char *p;
5d6b1584 1429
a99626c1
LP
1430 assert(path);
1431
1432 /* This resulting string must fit in 10 chars (i.e. the length of "/sbin/init") to look pretty in
1433 * /bin/ps */
5d6b1584 1434
a99626c1 1435 if (path_extract_filename(path, &buf) < 0) {
5d6b1584
LP
1436 rename_process("(...)");
1437 return;
1438 }
1439
a99626c1 1440 size_t l = strlen(buf);
5d6b1584 1441 if (l > 8) {
a99626c1 1442 /* The end of the process name is usually more interesting, since the first bit might just be
5d6b1584 1443 * "systemd-" */
a99626c1 1444 p = buf + l - 8;
5d6b1584 1445 l = 8;
a99626c1
LP
1446 } else
1447 p = buf;
5d6b1584 1448
a99626c1 1449 char process_name[11];
5d6b1584
LP
1450 process_name[0] = '(';
1451 memcpy(process_name+1, p, l);
1452 process_name[1+l] = ')';
1453 process_name[1+l+1] = 0;
1454
1455 rename_process(process_name);
1456}
1457
469830d1
LP
1458static bool context_has_address_families(const ExecContext *c) {
1459 assert(c);
1460
6b000af4 1461 return c->address_families_allow_list ||
469830d1
LP
1462 !set_isempty(c->address_families);
1463}
1464
1465static bool context_has_syscall_filters(const ExecContext *c) {
1466 assert(c);
1467
6b000af4 1468 return c->syscall_allow_list ||
8cfa775f 1469 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1470}
1471
9df2cdd8
TM
1472static bool context_has_syscall_logs(const ExecContext *c) {
1473 assert(c);
1474
1475 return c->syscall_log_allow_list ||
1476 !hashmap_isempty(c->syscall_log);
1477}
1478
469830d1
LP
1479static bool context_has_no_new_privileges(const ExecContext *c) {
1480 assert(c);
1481
1482 if (c->no_new_privileges)
1483 return true;
1484
26c45a6c 1485 if (have_effective_cap(CAP_SYS_ADMIN) > 0) /* if we are privileged, we don't need NNP */
469830d1
LP
1486 return false;
1487
1488 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1489 return c->lock_personality ||
469830d1 1490 c->memory_deny_write_execute ||
0538d2a8 1491 c->private_devices ||
fc64760d 1492 c->protect_clock ||
0538d2a8 1493 c->protect_hostname ||
469830d1
LP
1494 c->protect_kernel_tunables ||
1495 c->protect_kernel_modules ||
84703040 1496 c->protect_kernel_logs ||
0538d2a8
YW
1497 context_has_address_families(c) ||
1498 exec_context_restrict_namespaces_set(c) ||
1499 c->restrict_realtime ||
1500 c->restrict_suid_sgid ||
78e864e5 1501 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1502 context_has_syscall_filters(c) ||
1503 context_has_syscall_logs(c);
469830d1
LP
1504}
1505
349cc4a5 1506#if HAVE_SECCOMP
17df7223 1507
83f12b27 1508static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1509
1510 if (is_seccomp_available())
1511 return false;
1512
f673b62d 1513 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1514 return true;
83f12b27
FS
1515}
1516
165a31c0 1517static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1518 uint32_t negative_action, default_action, action;
165a31c0 1519 int r;
8351ceae 1520
469830d1 1521 assert(u);
c0467cf3 1522 assert(c);
8351ceae 1523
469830d1 1524 if (!context_has_syscall_filters(c))
83f12b27
FS
1525 return 0;
1526
469830d1
LP
1527 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1528 return 0;
e9642be2 1529
005bfaf1 1530 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1531
6b000af4 1532 if (c->syscall_allow_list) {
469830d1
LP
1533 default_action = negative_action;
1534 action = SCMP_ACT_ALLOW;
7c66bae2 1535 } else {
469830d1
LP
1536 default_action = SCMP_ACT_ALLOW;
1537 action = negative_action;
57183d11 1538 }
8351ceae 1539
165a31c0 1540 if (needs_ambient_hack) {
6b000af4 1541 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1542 if (r < 0)
1543 return r;
1544 }
1545
b54f36c6 1546 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1547}
1548
9df2cdd8
TM
1549static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1550#ifdef SCMP_ACT_LOG
1551 uint32_t default_action, action;
1552#endif
1553
1554 assert(u);
1555 assert(c);
1556
1557 if (!context_has_syscall_logs(c))
1558 return 0;
1559
1560#ifdef SCMP_ACT_LOG
1561 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1562 return 0;
1563
1564 if (c->syscall_log_allow_list) {
1565 /* Log nothing but the ones listed */
1566 default_action = SCMP_ACT_ALLOW;
1567 action = SCMP_ACT_LOG;
1568 } else {
1569 /* Log everything but the ones listed */
1570 default_action = SCMP_ACT_LOG;
1571 action = SCMP_ACT_ALLOW;
1572 }
1573
1574 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1575#else
1576 /* old libseccomp */
1577 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1578 return 0;
1579#endif
1580}
1581
469830d1
LP
1582static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1583 assert(u);
4298d0b5
LP
1584 assert(c);
1585
469830d1 1586 if (set_isempty(c->syscall_archs))
83f12b27
FS
1587 return 0;
1588
469830d1
LP
1589 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1590 return 0;
4298d0b5 1591
469830d1
LP
1592 return seccomp_restrict_archs(c->syscall_archs);
1593}
4298d0b5 1594
469830d1
LP
1595static int apply_address_families(const Unit* u, const ExecContext *c) {
1596 assert(u);
1597 assert(c);
4298d0b5 1598
469830d1
LP
1599 if (!context_has_address_families(c))
1600 return 0;
4298d0b5 1601
469830d1
LP
1602 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1603 return 0;
4298d0b5 1604
6b000af4 1605 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1606}
4298d0b5 1607
83f12b27 1608static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
7a114ed4
TM
1609 int r;
1610
469830d1 1611 assert(u);
f3e43635
TM
1612 assert(c);
1613
469830d1 1614 if (!c->memory_deny_write_execute)
83f12b27
FS
1615 return 0;
1616
7a114ed4
TM
1617 /* use prctl() if kernel supports it (6.3) */
1618 r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
1619 if (r == 0) {
1620 log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
1621 return 0;
1622 }
1623 if (r < 0 && errno != EINVAL)
1624 return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
1625 /* else use seccomp */
1626 log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
1627
469830d1
LP
1628 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1629 return 0;
f3e43635 1630
469830d1 1631 return seccomp_memory_deny_write_execute();
f3e43635
TM
1632}
1633
83f12b27 1634static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1635 assert(u);
f4170c67
LP
1636 assert(c);
1637
469830d1 1638 if (!c->restrict_realtime)
83f12b27
FS
1639 return 0;
1640
469830d1
LP
1641 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1642 return 0;
f4170c67 1643
469830d1 1644 return seccomp_restrict_realtime();
f4170c67
LP
1645}
1646
f69567cb
LP
1647static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1648 assert(u);
1649 assert(c);
1650
1651 if (!c->restrict_suid_sgid)
1652 return 0;
1653
1654 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1655 return 0;
1656
1657 return seccomp_restrict_suid_sgid();
1658}
1659
59e856c7 1660static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1661 assert(u);
59eeb84b
LP
1662 assert(c);
1663
1664 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1665 * let's protect even those systems where this is left on in the kernel. */
1666
469830d1 1667 if (!c->protect_kernel_tunables)
59eeb84b
LP
1668 return 0;
1669
469830d1
LP
1670 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1671 return 0;
59eeb84b 1672
469830d1 1673 return seccomp_protect_sysctl();
59eeb84b
LP
1674}
1675
59e856c7 1676static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1677 assert(u);
502d704e
DH
1678 assert(c);
1679
25a8d8a0 1680 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1681
469830d1
LP
1682 if (!c->protect_kernel_modules)
1683 return 0;
1684
502d704e
DH
1685 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1686 return 0;
1687
b54f36c6 1688 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1689}
1690
84703040
KK
1691static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1692 assert(u);
1693 assert(c);
1694
1695 if (!c->protect_kernel_logs)
1696 return 0;
1697
1698 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1699 return 0;
1700
1701 return seccomp_protect_syslog();
1702}
1703
daf8f72b 1704static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1705 assert(u);
1706 assert(c);
1707
1708 if (!c->protect_clock)
1709 return 0;
1710
1711 if (skip_seccomp_unavailable(u, "ProtectClock="))
1712 return 0;
1713
1714 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1715}
1716
59e856c7 1717static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1718 assert(u);
ba128bb8
LP
1719 assert(c);
1720
8f81a5f6 1721 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1722
469830d1
LP
1723 if (!c->private_devices)
1724 return 0;
1725
ba128bb8
LP
1726 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1727 return 0;
1728
b54f36c6 1729 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1730}
1731
34cf6c43 1732static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1733 assert(u);
add00535
LP
1734 assert(c);
1735
1736 if (!exec_context_restrict_namespaces_set(c))
1737 return 0;
1738
1739 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1740 return 0;
1741
1742 return seccomp_restrict_namespaces(c->restrict_namespaces);
1743}
1744
78e864e5 1745static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1746 unsigned long personality;
1747 int r;
78e864e5
TM
1748
1749 assert(u);
1750 assert(c);
1751
1752 if (!c->lock_personality)
1753 return 0;
1754
1755 if (skip_seccomp_unavailable(u, "LockPersonality="))
1756 return 0;
1757
e8132d63
LP
1758 personality = c->personality;
1759
1760 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1761 if (personality == PERSONALITY_INVALID) {
1762
1763 r = opinionated_personality(&personality);
1764 if (r < 0)
1765 return r;
1766 }
78e864e5
TM
1767
1768 return seccomp_lock_personality(personality);
1769}
1770
c0467cf3 1771#endif
8351ceae 1772
7a8288f6 1773#if HAVE_LIBBPF
154eb43f 1774static int apply_restrict_filesystems(Unit *u, const ExecContext *c, const ExecParameters *p) {
7a8288f6
DM
1775 assert(u);
1776 assert(c);
154eb43f 1777 assert(p);
7a8288f6
DM
1778
1779 if (!exec_context_restrict_filesystems_set(c))
1780 return 0;
1781
154eb43f 1782 if (p->bpf_outer_map_fd < 0) {
46004616
ZJS
1783 /* LSM BPF is unsupported or lsm_bpf_setup failed */
1784 log_unit_debug(u, "LSM BPF not supported, skipping RestrictFileSystems=");
7a8288f6 1785 return 0;
46004616 1786 }
7a8288f6 1787
154eb43f 1788 return lsm_bpf_unit_restrict_filesystems(u, c->restrict_filesystems, p->bpf_outer_map_fd, c->restrict_filesystems_allow_list);
7a8288f6
DM
1789}
1790#endif
1791
daf8f72b 1792static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1793 assert(u);
1794 assert(c);
1795
1796 if (!c->protect_hostname)
1797 return 0;
1798
1799 if (ns_type_supported(NAMESPACE_UTS)) {
1800 if (unshare(CLONE_NEWUTS) < 0) {
1801 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1802 *ret_exit_status = EXIT_NAMESPACE;
1803 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1804 }
1805
1806 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1807 }
1808 } else
1809 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1810
1811#if HAVE_SECCOMP
8f3e342f
ZJS
1812 int r;
1813
daf8f72b
LP
1814 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1815 return 0;
1816
1817 r = seccomp_protect_hostname();
1818 if (r < 0) {
1819 *ret_exit_status = EXIT_SECCOMP;
1820 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1821 }
1822#endif
1823
1824 return 0;
1825}
1826
3042bbeb 1827static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1828 assert(idle_pipe);
1829
54eb2300
LP
1830 idle_pipe[1] = safe_close(idle_pipe[1]);
1831 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1832
1833 if (idle_pipe[0] >= 0) {
1834 int r;
1835
1836 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1837
1838 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1839 ssize_t n;
1840
31a7eb86 1841 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1842 n = write(idle_pipe[3], "x", 1);
1843 if (n > 0)
cd972d69 1844 /* Wait for systemd to react to the signal above. */
54756dce 1845 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1846 }
1847
54eb2300 1848 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1849
1850 }
1851
54eb2300 1852 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1853}
1854
fb2042dd
YW
1855static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1856
7cae38c4 1857static int build_environment(
34cf6c43 1858 const Unit *u,
9fa95f85 1859 const ExecContext *c,
1e22b5cd 1860 const ExecParameters *p,
6bb00842 1861 const CGroupContext *cgroup_context,
da6053d0 1862 size_t n_fds,
cd48e23f 1863 char **fdnames,
7cae38c4
LP
1864 const char *home,
1865 const char *username,
1866 const char *shell,
7bce046b
LP
1867 dev_t journal_stream_dev,
1868 ino_t journal_stream_ino,
6bb00842 1869 const char *memory_pressure_path,
7cae38c4
LP
1870 char ***ret) {
1871
1872 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1873 size_t n_env = 0;
7cae38c4 1874 char *x;
4d62ee55 1875 int r;
7cae38c4 1876
4b58153d 1877 assert(u);
7cae38c4 1878 assert(c);
7c1cb6f1 1879 assert(p);
7cae38c4
LP
1880 assert(ret);
1881
6bb00842 1882#define N_ENV_VARS 19
8d5bb13d 1883 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1884 if (!our_env)
1885 return -ENOMEM;
1886
1887 if (n_fds > 0) {
8dd4c05b
LP
1888 _cleanup_free_ char *joined = NULL;
1889
df0ff127 1890 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1891 return -ENOMEM;
1892 our_env[n_env++] = x;
1893
da6053d0 1894 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1895 return -ENOMEM;
1896 our_env[n_env++] = x;
8dd4c05b 1897
cd48e23f 1898 joined = strv_join(fdnames, ":");
8dd4c05b
LP
1899 if (!joined)
1900 return -ENOMEM;
1901
605405c6 1902 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1903 if (!x)
1904 return -ENOMEM;
1905 our_env[n_env++] = x;
7cae38c4
LP
1906 }
1907
b08af3b1 1908 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1909 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1910 return -ENOMEM;
1911 our_env[n_env++] = x;
1912
1e22b5cd 1913 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1914 return -ENOMEM;
1915 our_env[n_env++] = x;
1916 }
1917
de90700f
LP
1918 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use blocking
1919 * Varlink calls back to us for look up dynamic users in PID 1. Break the deadlock between D-Bus and
1920 * PID 1 by disabling use of PID1' NSS interface for looking up dynamic users. */
1921 if (p->flags & EXEC_NSS_DYNAMIC_BYPASS) {
1922 x = strdup("SYSTEMD_NSS_DYNAMIC_BYPASS=1");
fd63e712
LP
1923 if (!x)
1924 return -ENOMEM;
1925 our_env[n_env++] = x;
1926 }
1927
854eca4a
MY
1928 /* We query "root" if this is a system unit and User= is not specified. $USER is always set. $HOME
1929 * could cause problem for e.g. getty, since login doesn't override $HOME, and $LOGNAME and $SHELL don't
1930 * really make much sense since we're not logged in. Hence we conditionalize the three based on
1931 * SetLoginEnvironment= switch. */
1932 if (!c->user && !c->dynamic_user && p->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
1933 r = get_fixed_user("root", &username, NULL, NULL, &home, &shell);
1934 if (r < 0)
1935 return log_unit_error_errno(u, r, "Failed to determine user credentials for root: %m");
7cae38c4
LP
1936 }
1937
854eca4a
MY
1938 bool set_user_login_env = c->set_login_environment >= 0 ? c->set_login_environment : (c->user || c->dynamic_user);
1939
7cae38c4 1940 if (username) {
854eca4a 1941 x = strjoin("USER=", username);
7cae38c4
LP
1942 if (!x)
1943 return -ENOMEM;
1944 our_env[n_env++] = x;
1945
854eca4a
MY
1946 if (set_user_login_env) {
1947 x = strjoin("LOGNAME=", username);
1948 if (!x)
1949 return -ENOMEM;
1950 our_env[n_env++] = x;
1951 }
1952 }
1953
1954 if (home && set_user_login_env) {
1955 x = strjoin("HOME=", home);
7cae38c4
LP
1956 if (!x)
1957 return -ENOMEM;
854eca4a
MY
1958
1959 path_simplify(x + 5);
7cae38c4
LP
1960 our_env[n_env++] = x;
1961 }
1962
854eca4a 1963 if (shell && set_user_login_env) {
b910cc72 1964 x = strjoin("SHELL=", shell);
7cae38c4
LP
1965 if (!x)
1966 return -ENOMEM;
7bbead1d 1967
4ff361cc 1968 path_simplify(x + 6);
7cae38c4
LP
1969 our_env[n_env++] = x;
1970 }
1971
4b58153d
LP
1972 if (!sd_id128_is_null(u->invocation_id)) {
1973 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1974 return -ENOMEM;
1975
1976 our_env[n_env++] = x;
1977 }
1978
6af760f3 1979 if (exec_context_needs_term(c)) {
4d62ee55 1980 _cleanup_free_ char *cmdline = NULL;
6af760f3
LP
1981 const char *tty_path, *term = NULL;
1982
1983 tty_path = exec_context_tty_path(c);
1984
e8cf09b2
LP
1985 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1986 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1987 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1988
e8cf09b2 1989 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1990 term = getenv("TERM");
4d62ee55
DDM
1991 else if (tty_path && in_charset(skip_dev_prefix(tty_path), ALPHANUMERICAL)) {
1992 _cleanup_free_ char *key = NULL;
1993
1994 key = strjoin("systemd.tty.term.", skip_dev_prefix(tty_path));
1995 if (!key)
1996 return -ENOMEM;
1997
1998 r = proc_cmdline_get_key(key, 0, &cmdline);
1999 if (r < 0)
2000 log_debug_errno(r, "Failed to read %s from kernel cmdline, ignoring: %m", key);
2001 else if (r > 0)
2002 term = cmdline;
2003 }
e8cf09b2 2004
6af760f3
LP
2005 if (!term)
2006 term = default_term_for_tty(tty_path);
7cae38c4 2007
b910cc72 2008 x = strjoin("TERM=", term);
7cae38c4
LP
2009 if (!x)
2010 return -ENOMEM;
2011 our_env[n_env++] = x;
2012 }
2013
7bce046b
LP
2014 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
2015 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
2016 return -ENOMEM;
2017
2018 our_env[n_env++] = x;
2019 }
2020
91dd5f7c
LP
2021 if (c->log_namespace) {
2022 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
2023 if (!x)
2024 return -ENOMEM;
2025
2026 our_env[n_env++] = x;
2027 }
2028
5b10116e 2029 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
211a3d87 2030 _cleanup_free_ char *joined = NULL;
fb2042dd
YW
2031 const char *n;
2032
2033 if (!p->prefix[t])
2034 continue;
2035
211a3d87 2036 if (c->directories[t].n_items == 0)
fb2042dd
YW
2037 continue;
2038
2039 n = exec_directory_env_name_to_string(t);
2040 if (!n)
2041 continue;
2042
211a3d87
LB
2043 for (size_t i = 0; i < c->directories[t].n_items; i++) {
2044 _cleanup_free_ char *prefixed = NULL;
fb2042dd 2045
211a3d87
LB
2046 prefixed = path_join(p->prefix[t], c->directories[t].items[i].path);
2047 if (!prefixed)
2048 return -ENOMEM;
2049
2050 if (!strextend_with_separator(&joined, ":", prefixed))
2051 return -ENOMEM;
2052 }
fb2042dd
YW
2053
2054 x = strjoin(n, "=", joined);
2055 if (!x)
2056 return -ENOMEM;
2057
2058 our_env[n_env++] = x;
2059 }
2060
133e4de2
YW
2061 _cleanup_free_ char *creds_dir = NULL;
2062 r = exec_context_get_credential_directory(c, p, u->id, &creds_dir);
2063 if (r < 0)
2064 return r;
2065 if (r > 0) {
2066 x = strjoin("CREDENTIALS_DIRECTORY=", creds_dir);
bb0c0d6f
LP
2067 if (!x)
2068 return -ENOMEM;
2069
2070 our_env[n_env++] = x;
2071 }
2072
dc4e2940
YW
2073 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
2074 return -ENOMEM;
2075
2076 our_env[n_env++] = x;
2077
6bb00842
LP
2078 if (memory_pressure_path) {
2079 x = strjoin("MEMORY_PRESSURE_WATCH=", memory_pressure_path);
2080 if (!x)
2081 return -ENOMEM;
2082
2083 our_env[n_env++] = x;
2084
2085 if (cgroup_context && !path_equal(memory_pressure_path, "/dev/null")) {
2086 _cleanup_free_ char *b = NULL, *e = NULL;
2087
2088 if (asprintf(&b, "%s " USEC_FMT " " USEC_FMT,
2089 MEMORY_PRESSURE_DEFAULT_TYPE,
2090 cgroup_context->memory_pressure_threshold_usec == USEC_INFINITY ? MEMORY_PRESSURE_DEFAULT_THRESHOLD_USEC :
2091 CLAMP(cgroup_context->memory_pressure_threshold_usec, 1U, MEMORY_PRESSURE_DEFAULT_WINDOW_USEC),
2092 MEMORY_PRESSURE_DEFAULT_WINDOW_USEC) < 0)
2093 return -ENOMEM;
2094
2095 if (base64mem(b, strlen(b) + 1, &e) < 0)
2096 return -ENOMEM;
2097
2098 x = strjoin("MEMORY_PRESSURE_WRITE=", e);
2099 if (!x)
2100 return -ENOMEM;
2101
2102 our_env[n_env++] = x;
2103 }
2104 }
2105
2106 assert(n_env < N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
8d5bb13d 2107#undef N_ENV_VARS
7cae38c4 2108
ae2a15bc 2109 *ret = TAKE_PTR(our_env);
7cae38c4
LP
2110
2111 return 0;
2112}
2113
b4c14404
FB
2114static int build_pass_environment(const ExecContext *c, char ***ret) {
2115 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 2116 size_t n_env = 0;
b4c14404
FB
2117
2118 STRV_FOREACH(i, c->pass_environment) {
2119 _cleanup_free_ char *x = NULL;
2120 char *v;
2121
2122 v = getenv(*i);
2123 if (!v)
2124 continue;
605405c6 2125 x = strjoin(*i, "=", v);
b4c14404
FB
2126 if (!x)
2127 return -ENOMEM;
00819cc1 2128
319a4f4b 2129 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 2130 return -ENOMEM;
00819cc1 2131
1cc6c93a 2132 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 2133 pass_env[n_env] = NULL;
b4c14404
FB
2134 }
2135
ae2a15bc 2136 *ret = TAKE_PTR(pass_env);
b4c14404
FB
2137
2138 return 0;
2139}
2140
fbbb9697
YW
2141bool exec_needs_network_namespace(const ExecContext *context) {
2142 assert(context);
2143
2144 return context->private_network || context->network_namespace_path;
2145}
2146
9c0c6701
DDM
2147static bool exec_needs_ephemeral(const ExecContext *context) {
2148 return (context->root_image || context->root_directory) && context->root_ephemeral;
2149}
2150
fde36d25
YW
2151static bool exec_needs_ipc_namespace(const ExecContext *context) {
2152 assert(context);
2153
2154 return context->private_ipc || context->ipc_namespace_path;
2155}
2156
5e8deb94 2157bool exec_needs_mount_namespace(
8b44a3d2
LP
2158 const ExecContext *context,
2159 const ExecParameters *params,
28135da3 2160 const ExecRuntime *runtime) {
8b44a3d2
LP
2161
2162 assert(context);
8b44a3d2 2163
915e6d16
LP
2164 if (context->root_image)
2165 return true;
2166
2a624c36
AP
2167 if (!strv_isempty(context->read_write_paths) ||
2168 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2169 !strv_isempty(context->inaccessible_paths) ||
2170 !strv_isempty(context->exec_paths) ||
2171 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2172 return true;
2173
42b1d8e0 2174 if (context->n_bind_mounts > 0)
d2d6c096
LP
2175 return true;
2176
2abd4e38
YW
2177 if (context->n_temporary_filesystems > 0)
2178 return true;
2179
b3d13314
LB
2180 if (context->n_mount_images > 0)
2181 return true;
2182
93f59701
LB
2183 if (context->n_extension_images > 0)
2184 return true;
2185
a07b9926
LB
2186 if (!strv_isempty(context->extension_directories))
2187 return true;
2188
874cdcbc 2189 if (!IN_SET(context->mount_propagation_flag, 0, MS_SHARED))
8b44a3d2
LP
2190 return true;
2191
28135da3 2192 if (context->private_tmp && runtime && runtime->shared && (runtime->shared->tmp_dir || runtime->shared->var_tmp_dir))
8b44a3d2
LP
2193 return true;
2194
8b44a3d2 2195 if (context->private_devices ||
24002121 2196 context->private_mounts > 0 ||
c2da3bf2 2197 (context->private_mounts < 0 && exec_needs_network_namespace(context)) ||
8b44a3d2 2198 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2199 context->protect_home != PROTECT_HOME_NO ||
2200 context->protect_kernel_tunables ||
c575770b 2201 context->protect_kernel_modules ||
94a7b275 2202 context->protect_kernel_logs ||
4e399953
LP
2203 context->protect_control_groups ||
2204 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44 2205 context->proc_subset != PROC_SUBSET_ALL ||
fde36d25 2206 exec_needs_ipc_namespace(context))
8b44a3d2
LP
2207 return true;
2208
37c56f89 2209 if (context->root_directory) {
5e98086d 2210 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2211 return true;
2212
5b10116e 2213 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2214 if (params && !params->prefix[t])
37c56f89
YW
2215 continue;
2216
211a3d87 2217 if (context->directories[t].n_items > 0)
37c56f89
YW
2218 return true;
2219 }
2220 }
5d997827 2221
42b1d8e0 2222 if (context->dynamic_user &&
211a3d87
LB
2223 (context->directories[EXEC_DIRECTORY_STATE].n_items > 0 ||
2224 context->directories[EXEC_DIRECTORY_CACHE].n_items > 0 ||
2225 context->directories[EXEC_DIRECTORY_LOGS].n_items > 0))
42b1d8e0
YW
2226 return true;
2227
91dd5f7c
LP
2228 if (context->log_namespace)
2229 return true;
2230
8b44a3d2
LP
2231 return false;
2232}
2233
5749f855 2234static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d 2235 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
19ee48a6 2236 _cleanup_close_pair_ int errno_pipe[2] = PIPE_EBADF;
254d1313 2237 _cleanup_close_ int unshare_ready_fd = -EBADF;
d251207d
LP
2238 _cleanup_(sigkill_waitp) pid_t pid = 0;
2239 uint64_t c = 1;
d251207d
LP
2240 ssize_t n;
2241 int r;
2242
5749f855
AZ
2243 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2244 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2245 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2246 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2247 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2248 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2249 * continues execution normally.
2250 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2251 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2252
5749f855 2253 /* Can only set up multiple mappings with CAP_SETUID. */
26c45a6c 2254 if (have_effective_cap(CAP_SETUID) > 0 && uid != ouid && uid_is_valid(uid))
587ab01b 2255 r = asprintf(&uid_map,
5749f855 2256 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2257 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2258 ouid, ouid, uid, uid);
2259 else
2260 r = asprintf(&uid_map,
2261 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2262 ouid, ouid);
d251207d 2263
5749f855
AZ
2264 if (r < 0)
2265 return -ENOMEM;
2266
2267 /* Can only set up multiple mappings with CAP_SETGID. */
26c45a6c 2268 if (have_effective_cap(CAP_SETGID) > 0 && gid != ogid && gid_is_valid(gid))
587ab01b 2269 r = asprintf(&gid_map,
5749f855 2270 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2271 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2272 ogid, ogid, gid, gid);
2273 else
2274 r = asprintf(&gid_map,
2275 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2276 ogid, ogid);
2277
2278 if (r < 0)
2279 return -ENOMEM;
d251207d
LP
2280
2281 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2282 * namespace. */
2283 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2284 if (unshare_ready_fd < 0)
2285 return -errno;
2286
2287 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2288 * failed. */
2289 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2290 return -errno;
2291
4c253ed1
LP
2292 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2293 if (r < 0)
2294 return r;
2295 if (r == 0) {
254d1313 2296 _cleanup_close_ int fd = -EBADF;
d251207d
LP
2297 const char *a;
2298 pid_t ppid;
2299
2300 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2301 * here, after the parent opened its own user namespace. */
2302
2303 ppid = getppid();
2304 errno_pipe[0] = safe_close(errno_pipe[0]);
2305
2306 /* Wait until the parent unshared the user namespace */
2307 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2308 r = -errno;
2309 goto child_fail;
2310 }
2311
2312 /* Disable the setgroups() system call in the child user namespace, for good. */
2313 a = procfs_file_alloca(ppid, "setgroups");
2314 fd = open(a, O_WRONLY|O_CLOEXEC);
2315 if (fd < 0) {
2316 if (errno != ENOENT) {
2317 r = -errno;
2318 goto child_fail;
2319 }
2320
2321 /* If the file is missing the kernel is too old, let's continue anyway. */
2322 } else {
2323 if (write(fd, "deny\n", 5) < 0) {
2324 r = -errno;
2325 goto child_fail;
2326 }
2327
2328 fd = safe_close(fd);
2329 }
2330
2331 /* First write the GID map */
2332 a = procfs_file_alloca(ppid, "gid_map");
2333 fd = open(a, O_WRONLY|O_CLOEXEC);
2334 if (fd < 0) {
2335 r = -errno;
2336 goto child_fail;
2337 }
2338 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2339 r = -errno;
2340 goto child_fail;
2341 }
2342 fd = safe_close(fd);
2343
2344 /* The write the UID map */
2345 a = procfs_file_alloca(ppid, "uid_map");
2346 fd = open(a, O_WRONLY|O_CLOEXEC);
2347 if (fd < 0) {
2348 r = -errno;
2349 goto child_fail;
2350 }
2351 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2352 r = -errno;
2353 goto child_fail;
2354 }
2355
2356 _exit(EXIT_SUCCESS);
2357
2358 child_fail:
2359 (void) write(errno_pipe[1], &r, sizeof(r));
2360 _exit(EXIT_FAILURE);
2361 }
2362
2363 errno_pipe[1] = safe_close(errno_pipe[1]);
2364
2365 if (unshare(CLONE_NEWUSER) < 0)
2366 return -errno;
2367
2368 /* Let the child know that the namespace is ready now */
2369 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2370 return -errno;
2371
2372 /* Try to read an error code from the child */
2373 n = read(errno_pipe[0], &r, sizeof(r));
2374 if (n < 0)
2375 return -errno;
2376 if (n == sizeof(r)) { /* an error code was sent to us */
2377 if (r < 0)
2378 return r;
2379 return -EIO;
2380 }
2381 if (n != 0) /* on success we should have read 0 bytes */
2382 return -EIO;
2383
8f03de53 2384 r = wait_for_terminate_and_check("(sd-userns)", TAKE_PID(pid), 0);
d251207d
LP
2385 if (r < 0)
2386 return r;
2e87a1fd 2387 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2388 return -EIO;
2389
2390 return 0;
2391}
2392
494d0247 2393static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
e43911a7
LP
2394 assert(context);
2395
494d0247
YW
2396 if (!context->dynamic_user)
2397 return false;
2398
2399 if (type == EXEC_DIRECTORY_CONFIGURATION)
2400 return false;
2401
2402 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2403 return false;
2404
2405 return true;
2406}
2407
211a3d87
LB
2408static int create_many_symlinks(const char *root, const char *source, char **symlinks) {
2409 _cleanup_free_ char *src_abs = NULL;
211a3d87
LB
2410 int r;
2411
2412 assert(source);
2413
2414 src_abs = path_join(root, source);
2415 if (!src_abs)
2416 return -ENOMEM;
2417
2418 STRV_FOREACH(dst, symlinks) {
2419 _cleanup_free_ char *dst_abs = NULL;
2420
2421 dst_abs = path_join(root, *dst);
2422 if (!dst_abs)
2423 return -ENOMEM;
2424
2425 r = mkdir_parents_label(dst_abs, 0755);
2426 if (r < 0)
2427 return r;
2428
2429 r = symlink_idempotent(src_abs, dst_abs, true);
2430 if (r < 0)
2431 return r;
2432 }
2433
2434 return 0;
2435}
2436
3536f49e 2437static int setup_exec_directory(
59dd2bbb 2438 Unit *u,
07689d5d
LP
2439 const ExecContext *context,
2440 const ExecParameters *params,
2441 uid_t uid,
3536f49e 2442 gid_t gid,
3536f49e 2443 ExecDirectoryType type,
211a3d87 2444 bool needs_mount_namespace,
3536f49e 2445 int *exit_status) {
07689d5d 2446
72fd1768 2447 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2448 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2449 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2450 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2451 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2452 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2453 };
07689d5d
LP
2454 int r;
2455
2456 assert(context);
2457 assert(params);
72fd1768 2458 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2459 assert(exit_status);
07689d5d 2460
3536f49e
YW
2461 if (!params->prefix[type])
2462 return 0;
2463
8679efde 2464 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2465 if (!uid_is_valid(uid))
2466 uid = 0;
2467 if (!gid_is_valid(gid))
2468 gid = 0;
2469 }
2470
211a3d87 2471 for (size_t i = 0; i < context->directories[type].n_items; i++) {
6c47cd7d 2472 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2473
211a3d87 2474 p = path_join(params->prefix[type], context->directories[type].items[i].path);
3536f49e
YW
2475 if (!p) {
2476 r = -ENOMEM;
2477 goto fail;
2478 }
07689d5d 2479
23a7448e
YW
2480 r = mkdir_parents_label(p, 0755);
2481 if (r < 0)
3536f49e 2482 goto fail;
23a7448e 2483
f9c91932
LP
2484 if (IN_SET(type, EXEC_DIRECTORY_STATE, EXEC_DIRECTORY_LOGS) && params->runtime_scope == RUNTIME_SCOPE_USER) {
2485
2486 /* If we are in user mode, and a configuration directory exists but a state directory
2487 * doesn't exist, then we likely are upgrading from an older systemd version that
2488 * didn't know the more recent addition to the xdg-basedir spec: the $XDG_STATE_HOME
2489 * directory. In older systemd versions EXEC_DIRECTORY_STATE was aliased to
2490 * EXEC_DIRECTORY_CONFIGURATION, with the advent of $XDG_STATE_HOME is is now
627cdcc7 2491 * separated. If a service has both dirs configured but only the configuration dir
f9c91932
LP
2492 * exists and the state dir does not, we assume we are looking at an update
2493 * situation. Hence, create a compatibility symlink, so that all expectations are
2494 * met.
2495 *
2496 * (We also do something similar with the log directory, which still doesn't exist in
2497 * the xdg basedir spec. We'll make it a subdir of the state dir.) */
2498
2499 /* this assumes the state dir is always created before the configuration dir */
2500 assert_cc(EXEC_DIRECTORY_STATE < EXEC_DIRECTORY_LOGS);
2501 assert_cc(EXEC_DIRECTORY_LOGS < EXEC_DIRECTORY_CONFIGURATION);
2502
2503 r = laccess(p, F_OK);
2504 if (r == -ENOENT) {
2505 _cleanup_free_ char *q = NULL;
2506
2507 /* OK, we know that the state dir does not exist. Let's see if the dir exists
2508 * under the configuration hierarchy. */
2509
2510 if (type == EXEC_DIRECTORY_STATE)
2511 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], context->directories[type].items[i].path);
2512 else if (type == EXEC_DIRECTORY_LOGS)
2513 q = path_join(params->prefix[EXEC_DIRECTORY_CONFIGURATION], "log", context->directories[type].items[i].path);
2514 else
2515 assert_not_reached();
2516 if (!q) {
2517 r = -ENOMEM;
2518 goto fail;
2519 }
2520
2521 r = laccess(q, F_OK);
2522 if (r >= 0) {
2523 /* It does exist! This hence looks like an update. Symlink the
2524 * configuration directory into the state directory. */
2525
2526 r = symlink_idempotent(q, p, /* make_relative= */ true);
2527 if (r < 0)
2528 goto fail;
2529
59dd2bbb 2530 log_unit_notice(u, "Unit state directory %s missing but matching configuration directory %s exists, assuming update from systemd 253 or older, creating compatibility symlink.", p, q);
f9c91932
LP
2531 continue;
2532 } else if (r != -ENOENT)
59dd2bbb 2533 log_unit_warning_errno(u, r, "Unable to detect whether unit configuration directory '%s' exists, assuming not: %m", q);
f9c91932
LP
2534
2535 } else if (r < 0)
59dd2bbb 2536 log_unit_warning_errno(u, r, "Unable to detect whether unit state directory '%s' is missing, assuming it is: %m", p);
f9c91932
LP
2537 }
2538
494d0247 2539 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2540 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2541 * case we want to avoid leaving a directory around fully accessible that is owned by
2542 * a dynamic user whose UID is later on reused. To lock this down we use the same
2543 * trick used by container managers to prohibit host users to get access to files of
2544 * the same UID in containers: we place everything inside a directory that has an
2545 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2546 * for unprivileged host code. We then use fs namespacing to make this directory
2547 * permeable for the service itself.
6c47cd7d 2548 *
3f5b1508
LP
2549 * Specifically: for a service which wants a special directory "foo/" we first create
2550 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2551 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2552 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2553 * unprivileged host users can't look into it. Inside of the namespace of the unit
2554 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2555 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2556 * for the service and making sure it only gets access to the dirs it needs but no
2557 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2558 *
3f5b1508
LP
2559 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2560 * to be owned by the service itself.
2561 *
2562 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2563 * for sharing files or sockets with other services. */
6c47cd7d 2564
4ede9802
LP
2565 pp = path_join(params->prefix[type], "private");
2566 if (!pp) {
6c47cd7d
LP
2567 r = -ENOMEM;
2568 goto fail;
2569 }
2570
2571 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2572 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2573 if (r < 0)
2574 goto fail;
2575
211a3d87 2576 if (!path_extend(&pp, context->directories[type].items[i].path)) {
6c47cd7d
LP
2577 r = -ENOMEM;
2578 goto fail;
2579 }
2580
2581 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2582 r = mkdir_parents_label(pp, 0755);
2583 if (r < 0)
2584 goto fail;
2585
949befd3 2586 if (is_dir(p, false) > 0 &&
b93d24e0 2587 (laccess(pp, F_OK) == -ENOENT)) {
949befd3
LP
2588
2589 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2590 * it over. Most likely the service has been upgraded from one that didn't use
2591 * DynamicUser=1, to one that does. */
2592
59dd2bbb
LP
2593 log_unit_info(u, "Found pre-existing public %s= directory %s, migrating to %s.\n"
2594 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2595 exec_directory_type_to_string(type), p, pp);
cf52c45d 2596
db58f5de
LP
2597 r = RET_NERRNO(rename(p, pp));
2598 if (r < 0)
949befd3 2599 goto fail;
949befd3
LP
2600 } else {
2601 /* Otherwise, create the actual directory for the service */
2602
2603 r = mkdir_label(pp, context->directories[type].mode);
2604 if (r < 0 && r != -EEXIST)
2605 goto fail;
2606 }
6c47cd7d 2607
a2ab603c
YW
2608 if (!context->directories[type].items[i].only_create) {
2609 /* And link it up from the original place.
2610 * Notes
2611 * 1) If a mount namespace is going to be used, then this symlink remains on
2612 * the host, and a new one for the child namespace will be created later.
2613 * 2) It is not necessary to create this symlink when one of its parent
2614 * directories is specified and already created. E.g.
2615 * StateDirectory=foo foo/bar
2616 * In that case, the inode points to pp and p for "foo/bar" are the same:
2617 * pp = "/var/lib/private/foo/bar"
2618 * p = "/var/lib/foo/bar"
2619 * and, /var/lib/foo is a symlink to /var/lib/private/foo. So, not only
2620 * we do not need to create the symlink, but we cannot create the symlink.
2621 * See issue #24783. */
2622 r = symlink_idempotent(pp, p, true);
2623 if (r < 0)
2624 goto fail;
2625 }
6c47cd7d 2626
6c47cd7d 2627 } else {
5c6d40d1
LP
2628 _cleanup_free_ char *target = NULL;
2629
2630 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2631 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2632 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2633
2634 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2635 * by DynamicUser=1 (see above)?
2636 *
2637 * We do this for all directory types except for ConfigurationDirectory=,
2638 * since they all support the private/ symlink logic at least in some
2639 * configurations, see above. */
5c6d40d1 2640
f461a28d 2641 r = chase(target, NULL, 0, &target_resolved, NULL);
578dc69f
YW
2642 if (r < 0)
2643 goto fail;
2644
211a3d87 2645 q = path_join(params->prefix[type], "private", context->directories[type].items[i].path);
5c6d40d1
LP
2646 if (!q) {
2647 r = -ENOMEM;
2648 goto fail;
2649 }
2650
578dc69f 2651 /* /var/lib or friends may be symlinks. So, let's chase them also. */
f461a28d 2652 r = chase(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
578dc69f
YW
2653 if (r < 0)
2654 goto fail;
2655
2656 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2657
2658 /* Hmm, apparently DynamicUser= was once turned on for this service,
2659 * but is no longer. Let's move the directory back up. */
2660
59dd2bbb
LP
2661 log_unit_info(u, "Found pre-existing private %s= directory %s, migrating to %s.\n"
2662 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2663 exec_directory_type_to_string(type), q, p);
cf52c45d 2664
db58f5de
LP
2665 r = RET_NERRNO(unlink(p));
2666 if (r < 0)
5c6d40d1 2667 goto fail;
5c6d40d1 2668
db58f5de
LP
2669 r = RET_NERRNO(rename(q, p));
2670 if (r < 0)
5c6d40d1 2671 goto fail;
5c6d40d1
LP
2672 }
2673 }
2674
6c47cd7d 2675 r = mkdir_label(p, context->directories[type].mode);
d484580c 2676 if (r < 0) {
d484580c
LP
2677 if (r != -EEXIST)
2678 goto fail;
2679
206e9864
LP
2680 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2681 struct stat st;
2682
2683 /* Don't change the owner/access mode of the configuration directory,
2684 * as in the common case it is not written to by a service, and shall
2685 * not be writable. */
2686
db58f5de
LP
2687 r = RET_NERRNO(stat(p, &st));
2688 if (r < 0)
206e9864 2689 goto fail;
206e9864
LP
2690
2691 /* Still complain if the access mode doesn't match */
2692 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
59dd2bbb
LP
2693 log_unit_warning(u, "%s \'%s\' already exists but the mode is different. "
2694 "(File system: %o %sMode: %o)",
2695 exec_directory_type_to_string(type), context->directories[type].items[i].path,
2696 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
206e9864 2697
6cff72eb 2698 continue;
206e9864 2699 }
6cff72eb 2700 }
a1164ae3 2701 }
07689d5d 2702
206e9864 2703 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2704 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2705 * current UID/GID ownership.) */
2706 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2707 if (r < 0)
2708 goto fail;
c71b2eb7 2709
f5bb36dc
LP
2710 /* Skip the rest (which deals with ownership) in user mode, since ownership changes are not
2711 * available to user code anyway */
2712 if (params->runtime_scope != RUNTIME_SCOPE_SYSTEM)
2713 continue;
2714
607b358e
LP
2715 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2716 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2717 * assignments to exist. */
d5602c16 2718 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777, AT_SYMLINK_FOLLOW);
07689d5d 2719 if (r < 0)
3536f49e 2720 goto fail;
07689d5d
LP
2721 }
2722
211a3d87
LB
2723 /* If we are not going to run in a namespace, set up the symlinks - otherwise
2724 * they are set up later, to allow configuring empty var/run/etc. */
2725 if (!needs_mount_namespace)
2726 for (size_t i = 0; i < context->directories[type].n_items; i++) {
2727 r = create_many_symlinks(params->prefix[type],
2728 context->directories[type].items[i].path,
2729 context->directories[type].items[i].symlinks);
2730 if (r < 0)
2731 goto fail;
2732 }
2733
07689d5d 2734 return 0;
3536f49e
YW
2735
2736fail:
2737 *exit_status = exit_status_table[type];
3536f49e 2738 return r;
07689d5d
LP
2739}
2740
92b423b9 2741#if ENABLE_SMACK
cefc33ae 2742static int setup_smack(
154eb43f 2743 const ExecParameters *params,
cefc33ae 2744 const ExecContext *context,
b83d5050 2745 int executable_fd) {
cefc33ae
LP
2746 int r;
2747
154eb43f 2748 assert(params);
b83d5050 2749 assert(executable_fd >= 0);
cefc33ae 2750
cefc33ae
LP
2751 if (context->smack_process_label) {
2752 r = mac_smack_apply_pid(0, context->smack_process_label);
2753 if (r < 0)
2754 return r;
154eb43f 2755 } else if (params->fallback_smack_process_label) {
cefc33ae
LP
2756 _cleanup_free_ char *exec_label = NULL;
2757
b83d5050 2758 r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &exec_label);
00675c36 2759 if (r < 0 && !ERRNO_IS_XATTR_ABSENT(r))
cefc33ae
LP
2760 return r;
2761
154eb43f 2762 r = mac_smack_apply_pid(0, exec_label ?: params->fallback_smack_process_label);
cefc33ae
LP
2763 if (r < 0)
2764 return r;
2765 }
cefc33ae
LP
2766
2767 return 0;
2768}
92b423b9 2769#endif
cefc33ae 2770
6c47cd7d
LP
2771static int compile_bind_mounts(
2772 const ExecContext *context,
2773 const ExecParameters *params,
2774 BindMount **ret_bind_mounts,
da6053d0 2775 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2776 char ***ret_empty_directories) {
2777
2778 _cleanup_strv_free_ char **empty_directories = NULL;
ed8267c7 2779 BindMount *bind_mounts = NULL;
5b10116e 2780 size_t n, h = 0;
6c47cd7d
LP
2781 int r;
2782
2783 assert(context);
2784 assert(params);
2785 assert(ret_bind_mounts);
2786 assert(ret_n_bind_mounts);
2787 assert(ret_empty_directories);
2788
ed8267c7
DT
2789 CLEANUP_ARRAY(bind_mounts, h, bind_mount_free_many);
2790
6c47cd7d 2791 n = context->n_bind_mounts;
5b10116e 2792 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2793 if (!params->prefix[t])
2794 continue;
2795
a2ab603c
YW
2796 for (size_t i = 0; i < context->directories[t].n_items; i++)
2797 n += !context->directories[t].items[i].only_create;
6c47cd7d
LP
2798 }
2799
2800 if (n <= 0) {
2801 *ret_bind_mounts = NULL;
2802 *ret_n_bind_mounts = 0;
2803 *ret_empty_directories = NULL;
2804 return 0;
2805 }
2806
2807 bind_mounts = new(BindMount, n);
2808 if (!bind_mounts)
2809 return -ENOMEM;
2810
5b10116e 2811 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d 2812 BindMount *item = context->bind_mounts + i;
93404d34 2813 _cleanup_free_ char *s = NULL, *d = NULL;
6c47cd7d
LP
2814
2815 s = strdup(item->source);
ed8267c7
DT
2816 if (!s)
2817 return -ENOMEM;
6c47cd7d
LP
2818
2819 d = strdup(item->destination);
93404d34 2820 if (!d)
ed8267c7 2821 return -ENOMEM;
6c47cd7d
LP
2822
2823 bind_mounts[h++] = (BindMount) {
93404d34
DT
2824 .source = TAKE_PTR(s),
2825 .destination = TAKE_PTR(d),
6c47cd7d
LP
2826 .read_only = item->read_only,
2827 .recursive = item->recursive,
2828 .ignore_enoent = item->ignore_enoent,
2829 };
2830 }
2831
5b10116e 2832 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2833 if (!params->prefix[t])
2834 continue;
2835
211a3d87 2836 if (context->directories[t].n_items == 0)
6c47cd7d
LP
2837 continue;
2838
494d0247 2839 if (exec_directory_is_private(context, t) &&
74e12520 2840 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
2841 char *private_root;
2842
2843 /* So this is for a dynamic user, and we need to make sure the process can access its own
2844 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2845 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2846
657ee2d8 2847 private_root = path_join(params->prefix[t], "private");
ed8267c7
DT
2848 if (!private_root)
2849 return -ENOMEM;
6c47cd7d
LP
2850
2851 r = strv_consume(&empty_directories, private_root);
a635a7ae 2852 if (r < 0)
ed8267c7 2853 return r;
6c47cd7d
LP
2854 }
2855
211a3d87 2856 for (size_t i = 0; i < context->directories[t].n_items; i++) {
93404d34 2857 _cleanup_free_ char *s = NULL, *d = NULL;
6c47cd7d 2858
a2ab603c
YW
2859 /* When one of the parent directories is in the list, we cannot create the symlink
2860 * for the child directory. See also the comments in setup_exec_directory(). */
2861 if (context->directories[t].items[i].only_create)
2862 continue;
2863
494d0247 2864 if (exec_directory_is_private(context, t))
211a3d87 2865 s = path_join(params->prefix[t], "private", context->directories[t].items[i].path);
6c47cd7d 2866 else
211a3d87 2867 s = path_join(params->prefix[t], context->directories[t].items[i].path);
ed8267c7
DT
2868 if (!s)
2869 return -ENOMEM;
6c47cd7d 2870
494d0247 2871 if (exec_directory_is_private(context, t) &&
74e12520 2872 exec_context_with_rootfs(context))
5609f688
YW
2873 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2874 * directory is not created on the root directory. So, let's bind-mount the directory
2875 * on the 'non-private' place. */
211a3d87 2876 d = path_join(params->prefix[t], context->directories[t].items[i].path);
5609f688
YW
2877 else
2878 d = strdup(s);
93404d34 2879 if (!d)
ed8267c7 2880 return -ENOMEM;
6c47cd7d
LP
2881
2882 bind_mounts[h++] = (BindMount) {
93404d34
DT
2883 .source = TAKE_PTR(s),
2884 .destination = TAKE_PTR(d),
6c47cd7d 2885 .read_only = false,
9ce4e4b0 2886 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
2887 .recursive = true,
2888 .ignore_enoent = false,
2889 };
2890 }
2891 }
2892
2893 assert(h == n);
2894
ed8267c7 2895 *ret_bind_mounts = TAKE_PTR(bind_mounts);
6c47cd7d 2896 *ret_n_bind_mounts = n;
ae2a15bc 2897 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
2898
2899 return (int) n;
6c47cd7d
LP
2900}
2901
df61e79a
LB
2902/* ret_symlinks will contain a list of pairs src:dest that describes
2903 * the symlinks to create later on. For example, the symlinks needed
2904 * to safely give private directories to DynamicUser=1 users. */
2905static int compile_symlinks(
2906 const ExecContext *context,
2907 const ExecParameters *params,
663e2756 2908 bool setup_os_release_symlink,
df61e79a
LB
2909 char ***ret_symlinks) {
2910
2911 _cleanup_strv_free_ char **symlinks = NULL;
2912 int r;
2913
2914 assert(context);
2915 assert(params);
2916 assert(ret_symlinks);
2917
2918 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
211a3d87
LB
2919 for (size_t i = 0; i < context->directories[dt].n_items; i++) {
2920 _cleanup_free_ char *private_path = NULL, *path = NULL;
df61e79a 2921
211a3d87
LB
2922 STRV_FOREACH(symlink, context->directories[dt].items[i].symlinks) {
2923 _cleanup_free_ char *src_abs = NULL, *dst_abs = NULL;
df61e79a 2924
211a3d87
LB
2925 src_abs = path_join(params->prefix[dt], context->directories[dt].items[i].path);
2926 dst_abs = path_join(params->prefix[dt], *symlink);
2927 if (!src_abs || !dst_abs)
2928 return -ENOMEM;
df61e79a 2929
211a3d87
LB
2930 r = strv_consume_pair(&symlinks, TAKE_PTR(src_abs), TAKE_PTR(dst_abs));
2931 if (r < 0)
2932 return r;
2933 }
2934
a2ab603c
YW
2935 if (!exec_directory_is_private(context, dt) ||
2936 exec_context_with_rootfs(context) ||
2937 context->directories[dt].items[i].only_create)
211a3d87
LB
2938 continue;
2939
2940 private_path = path_join(params->prefix[dt], "private", context->directories[dt].items[i].path);
df61e79a
LB
2941 if (!private_path)
2942 return -ENOMEM;
2943
211a3d87 2944 path = path_join(params->prefix[dt], context->directories[dt].items[i].path);
df61e79a
LB
2945 if (!path)
2946 return -ENOMEM;
2947
2948 r = strv_consume_pair(&symlinks, TAKE_PTR(private_path), TAKE_PTR(path));
2949 if (r < 0)
2950 return r;
2951 }
2952 }
2953
663e2756
LB
2954 /* We make the host's os-release available via a symlink, so that we can copy it atomically
2955 * and readers will never get a half-written version. Note that, while the paths specified here are
2956 * absolute, when they are processed in namespace.c they will be made relative automatically, i.e.:
2957 * 'os-release -> .os-release-stage/os-release' is what will be created. */
2958 if (setup_os_release_symlink) {
2959 r = strv_extend(&symlinks, "/run/host/.os-release-stage/os-release");
2960 if (r < 0)
2961 return r;
2962
2963 r = strv_extend(&symlinks, "/run/host/os-release");
2964 if (r < 0)
2965 return r;
2966 }
2967
df61e79a
LB
2968 *ret_symlinks = TAKE_PTR(symlinks);
2969
2970 return 0;
2971}
2972
4e677599
LP
2973static bool insist_on_sandboxing(
2974 const ExecContext *context,
2975 const char *root_dir,
2976 const char *root_image,
2977 const BindMount *bind_mounts,
2978 size_t n_bind_mounts) {
2979
4e677599
LP
2980 assert(context);
2981 assert(n_bind_mounts == 0 || bind_mounts);
2982
2983 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 2984 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
2985 * rearrange stuff in a way we cannot ignore gracefully. */
2986
2987 if (context->n_temporary_filesystems > 0)
2988 return true;
2989
2990 if (root_dir || root_image)
2991 return true;
2992
b3d13314
LB
2993 if (context->n_mount_images > 0)
2994 return true;
2995
4e677599
LP
2996 if (context->dynamic_user)
2997 return true;
2998
4355c04f
LB
2999 if (context->n_extension_images > 0 || !strv_isempty(context->extension_directories))
3000 return true;
3001
4e677599
LP
3002 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3003 * essential. */
5b10116e 3004 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3005 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3006 return true;
3007
91dd5f7c
LP
3008 if (context->log_namespace)
3009 return true;
3010
4e677599
LP
3011 return false;
3012}
3013
9c0c6701
DDM
3014static int setup_ephemeral(const ExecContext *context, ExecRuntime *runtime) {
3015 _cleanup_close_ int fd = -EBADF;
3016 int r;
3017
3018 if (!runtime || !runtime->ephemeral_copy)
3019 return 0;
3020
3021 r = posix_lock(runtime->ephemeral_storage_socket[0], LOCK_EX);
3022 if (r < 0)
3023 return log_debug_errno(r, "Failed to lock ephemeral storage socket: %m");
3024
3025 CLEANUP_POSIX_UNLOCK(runtime->ephemeral_storage_socket[0]);
3026
3027 fd = receive_one_fd(runtime->ephemeral_storage_socket[0], MSG_PEEK|MSG_DONTWAIT);
3028 if (fd >= 0)
3029 /* We got an fd! That means ephemeral has already been set up, so nothing to do here. */
3030 return 0;
3031
3032 if (fd != -EAGAIN)
3033 return log_debug_errno(fd, "Failed to receive file descriptor queued on ephemeral storage socket: %m");
3034
3035 log_debug("Making ephemeral snapshot of %s to %s",
3036 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3037
3038 if (context->root_image)
3039 fd = copy_file(context->root_image, runtime->ephemeral_copy, O_EXCL, 0600,
3040 COPY_LOCK_BSD|COPY_REFLINK|COPY_CRTIME);
3041 else
3042 fd = btrfs_subvol_snapshot_at(AT_FDCWD, context->root_directory,
3043 AT_FDCWD, runtime->ephemeral_copy,
3044 BTRFS_SNAPSHOT_FALLBACK_COPY |
3045 BTRFS_SNAPSHOT_FALLBACK_DIRECTORY |
3046 BTRFS_SNAPSHOT_RECURSIVE |
3047 BTRFS_SNAPSHOT_LOCK_BSD);
3048 if (fd < 0)
3049 return log_debug_errno(fd, "Failed to snapshot %s to %s: %m",
3050 context->root_image ?: context->root_directory, runtime->ephemeral_copy);
3051
3052 if (context->root_image) {
3053 /* A root image might be subject to lots of random writes so let's try to disable COW on it
3054 * which tends to not perform well in combination with lots of random writes.
3055 *
3056 * Note: btrfs actually isn't impressed by us setting the flag after making the reflink'ed
3057 * copy, but we at least want to make the intention clear.
3058 */
3059 r = chattr_fd(fd, FS_NOCOW_FL, FS_NOCOW_FL, NULL);
3060 if (r < 0)
3061 log_debug_errno(fd, "Failed to disable copy-on-write for %s, ignoring: %m", runtime->ephemeral_copy);
3062 }
3063
3064 r = send_one_fd(runtime->ephemeral_storage_socket[1], fd, MSG_DONTWAIT);
3065 if (r < 0)
3066 return log_debug_errno(r, "Failed to queue file descriptor on ephemeral storage socket: %m");
3067
3068 return 1;
3069}
3070
66130f0a
DDM
3071static int verity_settings_prepare(
3072 VeritySettings *verity,
3073 const char *root_image,
3074 const void *root_hash,
3075 size_t root_hash_size,
3076 const char *root_hash_path,
3077 const void *root_hash_sig,
3078 size_t root_hash_sig_size,
3079 const char *root_hash_sig_path,
3080 const char *verity_data_path) {
3081
3082 int r;
3083
3084 assert(verity);
3085
3086 if (root_hash) {
3087 void *d;
3088
3089 d = memdup(root_hash, root_hash_size);
3090 if (!d)
3091 return -ENOMEM;
3092
3093 free_and_replace(verity->root_hash, d);
3094 verity->root_hash_size = root_hash_size;
3095 verity->designator = PARTITION_ROOT;
3096 }
3097
3098 if (root_hash_sig) {
3099 void *d;
3100
3101 d = memdup(root_hash_sig, root_hash_sig_size);
3102 if (!d)
3103 return -ENOMEM;
3104
3105 free_and_replace(verity->root_hash_sig, d);
3106 verity->root_hash_sig_size = root_hash_sig_size;
3107 verity->designator = PARTITION_ROOT;
3108 }
3109
3110 if (verity_data_path) {
3111 r = free_and_strdup(&verity->data_path, verity_data_path);
3112 if (r < 0)
3113 return r;
3114 }
3115
3116 r = verity_settings_load(
3117 verity,
3118 root_image,
3119 root_hash_path,
3120 root_hash_sig_path);
3121 if (r < 0)
3122 return log_debug_errno(r, "Failed to load root hash: %m");
3123
3124 return 0;
3125}
3126
6818c54c 3127static int apply_mount_namespace(
34cf6c43 3128 const Unit *u,
9f71ba8d 3129 ExecCommandFlags command_flags,
6818c54c
LP
3130 const ExecContext *context,
3131 const ExecParameters *params,
9c0c6701 3132 ExecRuntime *runtime,
d4b6ec98 3133 const char *memory_pressure_path,
7cc5ef5f 3134 char **error_path) {
6818c54c 3135
66130f0a 3136 _cleanup_(verity_settings_done) VeritySettings verity = VERITY_SETTINGS_DEFAULT;
d4b6ec98
LB
3137 _cleanup_strv_free_ char **empty_directories = NULL, **symlinks = NULL,
3138 **read_write_paths_cleanup = NULL;
73ff4d48 3139 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL,
663e2756 3140 *extension_dir = NULL, *host_os_release_stage = NULL;
66130f0a 3141 const char *root_dir = NULL, *root_image = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
d4b6ec98 3142 char **read_write_paths;
663e2756 3143 bool needs_sandboxing, setup_os_release_symlink;
6c47cd7d 3144 BindMount *bind_mounts = NULL;
da6053d0 3145 size_t n_bind_mounts = 0;
6818c54c 3146 int r;
93c6bb51 3147
2b3c1b9e
DH
3148 assert(context);
3149
29933daf
DT
3150 CLEANUP_ARRAY(bind_mounts, n_bind_mounts, bind_mount_free_many);
3151
915e6d16 3152 if (params->flags & EXEC_APPLY_CHROOT) {
9c0c6701
DDM
3153 r = setup_ephemeral(context, runtime);
3154 if (r < 0)
3155 return r;
915e6d16 3156
9c0c6701
DDM
3157 if (context->root_image)
3158 root_image = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_image;
3159 else
3160 root_dir = (runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory;
915e6d16 3161 }
93c6bb51 3162
6c47cd7d
LP
3163 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3164 if (r < 0)
3165 return r;
3166
d4b6ec98
LB
3167 /* We need to make the pressure path writable even if /sys/fs/cgroups is made read-only, as the
3168 * service will need to write to it in order to start the notifications. */
3169 if (context->protect_control_groups && memory_pressure_path && !streq(memory_pressure_path, "/dev/null")) {
3170 read_write_paths_cleanup = strv_copy(context->read_write_paths);
29933daf
DT
3171 if (!read_write_paths_cleanup)
3172 return -ENOMEM;
d4b6ec98
LB
3173
3174 r = strv_extend(&read_write_paths_cleanup, memory_pressure_path);
3175 if (r < 0)
29933daf 3176 return r;
d4b6ec98
LB
3177
3178 read_write_paths = read_write_paths_cleanup;
3179 } else
3180 read_write_paths = context->read_write_paths;
3181
9f71ba8d 3182 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91 3183 if (needs_sandboxing) {
79d956db
LP
3184 /* The runtime struct only contains the parent of the private /tmp, which is non-accessible
3185 * to world users. Inside of it there's a /tmp that is sticky, and that's the one we want to
3186 * use here. This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91 3187
28135da3
DDM
3188 if (context->private_tmp && runtime && runtime->shared) {
3189 if (streq_ptr(runtime->shared->tmp_dir, RUN_SYSTEMD_EMPTY))
3190 tmp_dir = runtime->shared->tmp_dir;
3191 else if (runtime->shared->tmp_dir)
3192 tmp_dir = strjoina(runtime->shared->tmp_dir, "/tmp");
56a13a49 3193
28135da3
DDM
3194 if (streq_ptr(runtime->shared->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3195 var_tmp_dir = runtime->shared->var_tmp_dir;
3196 else if (runtime->shared->var_tmp_dir)
3197 var_tmp_dir = strjoina(runtime->shared->var_tmp_dir, "/tmp");
ecf63c91 3198 }
79d956db 3199 }
b5a33299 3200
663e2756 3201 /* Symlinks (exec dirs, os-release) are set up after other mounts, before they are made read-only. */
79d956db 3202 setup_os_release_symlink = needs_sandboxing && exec_context_get_effective_mount_apivfs(context) && (root_dir || root_image);
663e2756
LB
3203 r = compile_symlinks(context, params, setup_os_release_symlink, &symlinks);
3204 if (r < 0)
3205 return r;
3206
874cdcbc 3207 if (context->mount_propagation_flag == MS_SHARED)
37ed15d7
FB
3208 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3209
133e4de2
YW
3210 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
3211 r = exec_context_get_credential_directory(context, params, u->id, &creds_path);
3212 if (r < 0)
3213 return r;
73ff4d48
YW
3214 }
3215
170d978b 3216 if (params->runtime_scope == RUNTIME_SCOPE_SYSTEM) {
5e8deb94 3217 propagate_dir = path_join("/run/systemd/propagate/", u->id);
29933daf
DT
3218 if (!propagate_dir)
3219 return -ENOMEM;
f2550b98 3220
5e8deb94 3221 incoming_dir = strdup("/run/systemd/incoming");
29933daf
DT
3222 if (!incoming_dir)
3223 return -ENOMEM;
24759d8f
LB
3224
3225 extension_dir = strdup("/run/systemd/unit-extensions");
29933daf
DT
3226 if (!extension_dir)
3227 return -ENOMEM;
3f37a825
LB
3228
3229 /* If running under a different root filesystem, propagate the host's os-release. We make a
3230 * copy rather than just bind mounting it, so that it can be updated on soft-reboot. */
663e2756
LB
3231 if (setup_os_release_symlink) {
3232 host_os_release_stage = strdup("/run/systemd/propagate/.os-release-stage");
3233 if (!host_os_release_stage)
3f37a825
LB
3234 return -ENOMEM;
3235 }
170d978b
LP
3236 } else {
3237 assert(params->runtime_scope == RUNTIME_SCOPE_USER);
3238
29933daf
DT
3239 if (asprintf(&extension_dir, "/run/user/" UID_FMT "/systemd/unit-extensions", geteuid()) < 0)
3240 return -ENOMEM;
3f37a825 3241
663e2756
LB
3242 if (setup_os_release_symlink) {
3243 if (asprintf(&host_os_release_stage,
3244 "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage",
3245 geteuid()) < 0)
3f37a825
LB
3246 return -ENOMEM;
3247 }
170d978b 3248 }
5e8deb94 3249
66130f0a
DDM
3250 if (root_image) {
3251 r = verity_settings_prepare(
3252 &verity,
3253 root_image,
3254 context->root_hash, context->root_hash_size, context->root_hash_path,
3255 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3256 context->root_verity);
3257 if (r < 0)
3258 return r;
3259 }
3260
79d956db
LP
3261 NamespaceParameters parameters = {
3262 .runtime_scope = params->runtime_scope,
3263
3264 .root_directory = root_dir,
3265 .root_image = root_image,
3266 .root_image_options = context->root_image_options,
3267 .root_image_policy = context->root_image_policy ?: &image_policy_service,
3268
3269 .read_write_paths = read_write_paths,
3270 .read_only_paths = needs_sandboxing ? context->read_only_paths : NULL,
3271 .inaccessible_paths = needs_sandboxing ? context->inaccessible_paths : NULL,
3272
3273 .exec_paths = needs_sandboxing ? context->exec_paths : NULL,
3274 .no_exec_paths = needs_sandboxing ? context->no_exec_paths : NULL,
3275
3276 .empty_directories = empty_directories,
3277 .symlinks = symlinks,
3278
3279 .bind_mounts = bind_mounts,
3280 .n_bind_mounts = n_bind_mounts,
3281
3282 .temporary_filesystems = context->temporary_filesystems,
3283 .n_temporary_filesystems = context->n_temporary_filesystems,
3284
3285 .mount_images = context->mount_images,
3286 .n_mount_images = context->n_mount_images,
3287 .mount_image_policy = context->mount_image_policy ?: &image_policy_service,
3288
3289 .tmp_dir = tmp_dir,
3290 .var_tmp_dir = var_tmp_dir,
3291
3292 .creds_path = creds_path,
3293 .log_namespace = context->log_namespace,
3294 .mount_propagation_flag = context->mount_propagation_flag,
3295
3296 .verity = &verity,
3297
3298 .extension_images = context->extension_images,
3299 .n_extension_images = context->n_extension_images,
3300 .extension_image_policy = context->extension_image_policy ?: &image_policy_sysext,
3301 .extension_directories = context->extension_directories,
3302
3303 .propagate_dir = propagate_dir,
3304 .incoming_dir = incoming_dir,
3305 .extension_dir = extension_dir,
3306 .notify_socket = root_dir || root_image ? params->notify_socket : NULL,
3307 .host_os_release_stage = host_os_release_stage,
3308
3309 /* If DynamicUser=no and RootDirectory= is set then lets pass a relaxed sandbox info,
3310 * otherwise enforce it, don't ignore protected paths and fail if we are enable to apply the
3311 * sandbox inside the mount namespace. */
3312 .ignore_protect_paths = !needs_sandboxing && !context->dynamic_user && root_dir,
3313
3314 .protect_control_groups = needs_sandboxing && context->protect_control_groups,
3315 .protect_kernel_tunables = needs_sandboxing && context->protect_kernel_tunables,
3316 .protect_kernel_modules = needs_sandboxing && context->protect_kernel_modules,
3317 .protect_kernel_logs = needs_sandboxing && context->protect_kernel_logs,
3318 .protect_hostname = needs_sandboxing && context->protect_hostname,
3319
3320 .private_dev = needs_sandboxing && context->private_devices,
3321 .private_network = needs_sandboxing && exec_needs_network_namespace(context),
3322 .private_ipc = needs_sandboxing && exec_needs_ipc_namespace(context),
3323
3324 .mount_apivfs = needs_sandboxing && exec_context_get_effective_mount_apivfs(context),
3325
3326 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
3327 .mount_nosuid = needs_sandboxing && context->no_new_privileges && !mac_selinux_use(),
3328
3329 .protect_home = needs_sandboxing && context->protect_home,
3330 .protect_system = needs_sandboxing && context->protect_system,
3331 .protect_proc = needs_sandboxing && context->protect_proc,
3332 .proc_subset = needs_sandboxing && context->proc_subset,
3333 };
93c6bb51 3334
79d956db 3335 r = setup_namespace(&parameters, error_path);
1beab8b0 3336 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3337 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3338 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3339 * completely different execution environment. */
aca835ed 3340 if (r == -ENOANO) {
4e677599
LP
3341 if (insist_on_sandboxing(
3342 context,
3343 root_dir, root_image,
3344 bind_mounts,
29933daf
DT
3345 n_bind_mounts))
3346 return log_unit_debug_errno(u,
3347 SYNTHETIC_ERRNO(EOPNOTSUPP),
3348 "Failed to set up namespace, and refusing to continue since "
3349 "the selected namespacing options alter mount environment non-trivially.\n"
3350 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3351 n_bind_mounts,
3352 context->n_temporary_filesystems,
3353 yes_no(root_dir),
3354 yes_no(root_image),
3355 yes_no(context->dynamic_user));
3356
3357 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
3358 return 0;
93c6bb51
DH
3359 }
3360
3361 return r;
3362}
3363
915e6d16
LP
3364static int apply_working_directory(
3365 const ExecContext *context,
3366 const ExecParameters *params,
9c0c6701 3367 ExecRuntime *runtime,
915e6d16 3368 const char *home,
376fecf6 3369 int *exit_status) {
915e6d16 3370
6732edab 3371 const char *d, *wd;
2b3c1b9e
DH
3372
3373 assert(context);
376fecf6 3374 assert(exit_status);
2b3c1b9e 3375
6732edab
LP
3376 if (context->working_directory_home) {
3377
376fecf6
LP
3378 if (!home) {
3379 *exit_status = EXIT_CHDIR;
6732edab 3380 return -ENXIO;
376fecf6 3381 }
6732edab 3382
2b3c1b9e 3383 wd = home;
6732edab 3384
14eb3285
LP
3385 } else
3386 wd = empty_to_root(context->working_directory);
e7f1e7c6 3387
fa97f630 3388 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3389 d = wd;
fa97f630 3390 else
9c0c6701 3391 d = prefix_roota((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory, wd);
e7f1e7c6 3392
376fecf6
LP
3393 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3394 *exit_status = EXIT_CHDIR;
2b3c1b9e 3395 return -errno;
376fecf6 3396 }
e7f1e7c6
DH
3397
3398 return 0;
3399}
3400
fa97f630
JB
3401static int apply_root_directory(
3402 const ExecContext *context,
3403 const ExecParameters *params,
9c0c6701 3404 ExecRuntime *runtime,
fa97f630
JB
3405 const bool needs_mount_ns,
3406 int *exit_status) {
3407
3408 assert(context);
3409 assert(exit_status);
3410
5b10116e 3411 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630 3412 if (!needs_mount_ns && context->root_directory)
9c0c6701 3413 if (chroot((runtime ? runtime->ephemeral_copy : NULL) ?: context->root_directory) < 0) {
fa97f630
JB
3414 *exit_status = EXIT_CHROOT;
3415 return -errno;
3416 }
fa97f630
JB
3417
3418 return 0;
3419}
3420
b1edf445 3421static int setup_keyring(
34cf6c43 3422 const Unit *u,
b1edf445
LP
3423 const ExecContext *context,
3424 const ExecParameters *p,
3425 uid_t uid, gid_t gid) {
3426
74dd6b51 3427 key_serial_t keyring;
e64c2d0b
DJL
3428 int r = 0;
3429 uid_t saved_uid;
3430 gid_t saved_gid;
74dd6b51
LP
3431
3432 assert(u);
b1edf445 3433 assert(context);
74dd6b51
LP
3434 assert(p);
3435
3436 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3437 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3438 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3439 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3440 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3441 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3442
b1edf445
LP
3443 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3444 return 0;
3445
e64c2d0b
DJL
3446 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3447 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3448 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3449 * & group is just as nasty as acquiring a reference to the user keyring. */
3450
3451 saved_uid = getuid();
3452 saved_gid = getgid();
3453
3454 if (gid_is_valid(gid) && gid != saved_gid) {
3455 if (setregid(gid, -1) < 0)
3456 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3457 }
3458
3459 if (uid_is_valid(uid) && uid != saved_uid) {
3460 if (setreuid(uid, -1) < 0) {
3461 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3462 goto out;
3463 }
3464 }
3465
74dd6b51
LP
3466 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3467 if (keyring == -1) {
3468 if (errno == ENOSYS)
8002fb97 3469 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3470 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3471 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3472 else if (errno == EDQUOT)
8002fb97 3473 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3474 else
e64c2d0b 3475 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3476
e64c2d0b 3477 goto out;
74dd6b51
LP
3478 }
3479
e64c2d0b
DJL
3480 /* When requested link the user keyring into the session keyring. */
3481 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3482
3483 if (keyctl(KEYCTL_LINK,
3484 KEY_SPEC_USER_KEYRING,
3485 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3486 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3487 goto out;
3488 }
3489 }
3490
3491 /* Restore uid/gid back */
3492 if (uid_is_valid(uid) && uid != saved_uid) {
3493 if (setreuid(saved_uid, -1) < 0) {
3494 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3495 goto out;
3496 }
3497 }
3498
3499 if (gid_is_valid(gid) && gid != saved_gid) {
3500 if (setregid(saved_gid, -1) < 0)
3501 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3502 }
3503
3504 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3505 if (!sd_id128_is_null(u->invocation_id)) {
3506 key_serial_t key;
3507
3508 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3509 if (key == -1)
8002fb97 3510 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3511 else {
3512 if (keyctl(KEYCTL_SETPERM, key,
3513 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3514 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3515 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3516 }
3517 }
3518
e64c2d0b 3519out:
37b22b3b 3520 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3521 /* no extra logging, as only the first already reported error matters */
3522 if (getuid() != saved_uid)
3523 (void) setreuid(saved_uid, -1);
b1edf445 3524
e64c2d0b
DJL
3525 if (getgid() != saved_gid)
3526 (void) setregid(saved_gid, -1);
b1edf445 3527
e64c2d0b 3528 return r;
74dd6b51
LP
3529}
3530
/* Appends those file descriptors of the pair that are valid (i.e. non-negative) to the array, and
 * advances the element counter *n accordingly. The caller has to make sure the array is large enough. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t k = 0; k < 2; k++) {
                if (pair[k] < 0)
                        continue;

                array[*n] = pair[k];
                (*n)++;
        }
}
3541
a34ceba6
LP
3542static int close_remaining_fds(
3543 const ExecParameters *params,
28135da3 3544 const ExecRuntime *runtime,
a34ceba6 3545 int socket_fd,
5b8d1f6b 3546 const int *fds, size_t n_fds) {
a34ceba6 3547
da6053d0 3548 size_t n_dont_close = 0;
9c0c6701 3549 int dont_close[n_fds + 14];
a34ceba6
LP
3550
3551 assert(params);
3552
3553 if (params->stdin_fd >= 0)
3554 dont_close[n_dont_close++] = params->stdin_fd;
3555 if (params->stdout_fd >= 0)
3556 dont_close[n_dont_close++] = params->stdout_fd;
3557 if (params->stderr_fd >= 0)
3558 dont_close[n_dont_close++] = params->stderr_fd;
3559
3560 if (socket_fd >= 0)
3561 dont_close[n_dont_close++] = socket_fd;
3562 if (n_fds > 0) {
3563 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3564 n_dont_close += n_fds;
3565 }
3566
9c0c6701
DDM
3567 if (runtime)
3568 append_socket_pair(dont_close, &n_dont_close, runtime->ephemeral_storage_socket);
3569
28135da3
DDM
3570 if (runtime && runtime->shared) {
3571 append_socket_pair(dont_close, &n_dont_close, runtime->shared->netns_storage_socket);
3572 append_socket_pair(dont_close, &n_dont_close, runtime->shared->ipcns_storage_socket);
a70581ff 3573 }
29206d46 3574
15220772
DDM
3575 if (runtime && runtime->dynamic_creds) {
3576 if (runtime->dynamic_creds->user)
3577 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->user->storage_socket);
3578 if (runtime->dynamic_creds->group)
3579 append_socket_pair(dont_close, &n_dont_close, runtime->dynamic_creds->group->storage_socket);
a34ceba6
LP
3580 }
3581
154eb43f
LB
3582 if (params->user_lookup_fd >= 0)
3583 dont_close[n_dont_close++] = params->user_lookup_fd;
00d9ef85 3584
a34ceba6
LP
3585 return close_all_fds(dont_close, n_dont_close);
3586}
3587
00d9ef85
LP
3588static int send_user_lookup(
3589 Unit *unit,
3590 int user_lookup_fd,
3591 uid_t uid,
3592 gid_t gid) {
3593
3594 assert(unit);
3595
3596 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3597 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3598 * specified. */
3599
3600 if (user_lookup_fd < 0)
3601 return 0;
3602
3603 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3604 return 0;
3605
3606 if (writev(user_lookup_fd,
3607 (struct iovec[]) {
ce16d177
YW
3608 IOVEC_MAKE(&uid, sizeof(uid)),
3609 IOVEC_MAKE(&gid, sizeof(gid)),
3610 IOVEC_MAKE_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3611 return -errno;
3612
3613 return 0;
3614}
3615
6732edab
LP
3616static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3617 int r;
3618
3619 assert(c);
3620 assert(home);
3621 assert(buf);
3622
3623 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3624
3625 if (*home)
3626 return 0;
3627
3628 if (!c->working_directory_home)
3629 return 0;
3630
6732edab
LP
3631 r = get_home_dir(buf);
3632 if (r < 0)
3633 return r;
3634
3635 *home = *buf;
3636 return 1;
3637}
3638
da50b85a
LP
3639static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3640 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3641 int r;
3642
3643 assert(c);
3644 assert(p);
3645 assert(ret);
3646
3647 assert(c->dynamic_user);
3648
3649 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3650 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3651 * directories. */
3652
5b10116e 3653 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3654 if (t == EXEC_DIRECTORY_CONFIGURATION)
3655 continue;
3656
3657 if (!p->prefix[t])
3658 continue;
3659
211a3d87 3660 for (size_t i = 0; i < c->directories[t].n_items; i++) {
da50b85a
LP
3661 char *e;
3662
494d0247 3663 if (exec_directory_is_private(c, t))
211a3d87 3664 e = path_join(p->prefix[t], "private", c->directories[t].items[i].path);
494d0247 3665 else
211a3d87 3666 e = path_join(p->prefix[t], c->directories[t].items[i].path);
da50b85a
LP
3667 if (!e)
3668 return -ENOMEM;
3669
3670 r = strv_consume(&list, e);
3671 if (r < 0)
3672 return r;
3673 }
3674 }
3675
ae2a15bc 3676 *ret = TAKE_PTR(list);
da50b85a
LP
3677
3678 return 0;
3679}
3680
a8b993dc
LP
3681static int exec_parameters_get_cgroup_path(
3682 const ExecParameters *params,
3683 const CGroupContext *c,
3684 char **ret) {
3685
3686 const char *subgroup = NULL;
78f93209
LP
3687 char *p;
3688
3689 assert(params);
3690 assert(ret);
3691
3692 if (!params->cgroup_path)
3693 return -EINVAL;
3694
3695 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3696 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3697 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3698 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3699 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3700 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3701 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3702 * flag, which is only passed for the former statements, not for the latter. */
3703
a8b993dc
LP
3704 if (FLAGS_SET(params->flags, EXEC_CGROUP_DELEGATE) && (FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP) || c->delegate_subgroup)) {
3705 if (FLAGS_SET(params->flags, EXEC_IS_CONTROL))
3706 subgroup = ".control";
3707 else
3708 subgroup = c->delegate_subgroup;
3709 }
3710
3711 if (subgroup)
3712 p = path_join(params->cgroup_path, subgroup);
78f93209
LP
3713 else
3714 p = strdup(params->cgroup_path);
3715 if (!p)
3716 return -ENOMEM;
3717
3718 *ret = p;
a8b993dc 3719 return !!subgroup;
78f93209
LP
3720}
3721
e2b2fb7f
MS
3722static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3723 _cleanup_(cpu_set_reset) CPUSet s = {};
3724 int r;
3725
3726 assert(c);
3727 assert(ret);
3728
3729 if (!c->numa_policy.nodes.set) {
3730 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3731 return 0;
3732 }
3733
3734 r = numa_to_cpu_set(&c->numa_policy, &s);
3735 if (r < 0)
3736 return r;
3737
3738 cpu_set_reset(ret);
3739
3740 return cpu_set_add_all(ret, &s);
3741}
3742
3743bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3744 assert(c);
3745
3746 return c->cpu_affinity_from_numa;
3747}
3748
1da37e58
ZJS
3749static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3750 int r;
3751
3752 assert(fds);
3753 assert(n_fds);
3754 assert(*n_fds < fds_size);
3755 assert(ret_fd);
3756
3757 if (fd < 0) {
254d1313 3758 *ret_fd = -EBADF;
1da37e58
ZJS
3759 return 0;
3760 }
3761
3762 if (fd < 3 + (int) *n_fds) {
3763 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3764 * the fds we pass to the process (or which are closed only during execve). */
3765
3766 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3767 if (r < 0)
3768 return -errno;
3769
ee3455cf 3770 close_and_replace(fd, r);
1da37e58
ZJS
3771 }
3772
3773 *ret_fd = fds[*n_fds] = fd;
3774 (*n_fds) ++;
3775 return 1;
3776}
3777
cd48e23f
RP
3778static int connect_unix_harder(Unit *u, const OpenFile *of, int ofd) {
3779 union sockaddr_union addr = {
3780 .un.sun_family = AF_UNIX,
3781 };
3782 socklen_t sa_len;
3783 static const int socket_types[] = { SOCK_DGRAM, SOCK_STREAM, SOCK_SEQPACKET };
3784 int r;
3785
3786 assert(u);
3787 assert(of);
3788 assert(ofd >= 0);
3789
3790 r = sockaddr_un_set_path(&addr.un, FORMAT_PROC_FD_PATH(ofd));
3791 if (r < 0)
3792 return log_unit_error_errno(u, r, "Failed to set sockaddr for %s: %m", of->path);
3793
3794 sa_len = r;
3795
3796 for (size_t i = 0; i < ELEMENTSOF(socket_types); i++) {
3797 _cleanup_close_ int fd = -EBADF;
3798
3799 fd = socket(AF_UNIX, socket_types[i] | SOCK_CLOEXEC, 0);
3800 if (fd < 0)
3801 return log_unit_error_errno(u, errno, "Failed to create socket for %s: %m", of->path);
3802
3803 r = RET_NERRNO(connect(fd, &addr.sa, sa_len));
3804 if (r == -EPROTOTYPE)
3805 continue;
3806 if (r < 0)
3807 return log_unit_error_errno(u, r, "Failed to connect socket for %s: %m", of->path);
3808
3809 return TAKE_FD(fd);
3810 }
3811
3812 return log_unit_error_errno(u, SYNTHETIC_ERRNO(EPROTOTYPE), "Failed to connect socket for \"%s\".", of->path);
3813}
3814
3815static int get_open_file_fd(Unit *u, const OpenFile *of) {
3816 struct stat st;
3817 _cleanup_close_ int fd = -EBADF, ofd = -EBADF;
3818
3819 assert(u);
3820 assert(of);
3821
3822 ofd = open(of->path, O_PATH | O_CLOEXEC);
3823 if (ofd < 0)
dcebb015
DDM
3824 return log_unit_error_errno(u, errno, "Could not open \"%s\": %m", of->path);
3825
cd48e23f 3826 if (fstat(ofd, &st) < 0)
dcebb015 3827 return log_unit_error_errno(u, errno, "Failed to stat %s: %m", of->path);
cd48e23f
RP
3828
3829 if (S_ISSOCK(st.st_mode)) {
3830 fd = connect_unix_harder(u, of, ofd);
3831 if (fd < 0)
3832 return fd;
3833
3834 if (FLAGS_SET(of->flags, OPENFILE_READ_ONLY) && shutdown(fd, SHUT_WR) < 0)
dcebb015
DDM
3835 return log_unit_error_errno(u, errno, "Failed to shutdown send for socket %s: %m",
3836 of->path);
cd48e23f
RP
3837
3838 log_unit_debug(u, "socket %s opened (fd=%d)", of->path, fd);
3839 } else {
3840 int flags = FLAGS_SET(of->flags, OPENFILE_READ_ONLY) ? O_RDONLY : O_RDWR;
3841 if (FLAGS_SET(of->flags, OPENFILE_APPEND))
3842 flags |= O_APPEND;
3843 else if (FLAGS_SET(of->flags, OPENFILE_TRUNCATE))
3844 flags |= O_TRUNC;
3845
3846 fd = fd_reopen(ofd, flags | O_CLOEXEC);
3847 if (fd < 0)
3848 return log_unit_error_errno(u, fd, "Failed to open file %s: %m", of->path);
3849
3850 log_unit_debug(u, "file %s opened (fd=%d)", of->path, fd);
3851 }
3852
3853 return TAKE_FD(fd);
3854}
3855
3856static int collect_open_file_fds(
3857 Unit *u,
3858 OpenFile* open_files,
3859 int **fds,
3860 char ***fdnames,
3861 size_t *n_fds) {
3862 int r;
3863
3864 assert(u);
3865 assert(fds);
3866 assert(fdnames);
3867 assert(n_fds);
3868
3869 LIST_FOREACH(open_files, of, open_files) {
3870 _cleanup_close_ int fd = -EBADF;
3871
3872 fd = get_open_file_fd(u, of);
3873 if (fd < 0) {
3874 if (FLAGS_SET(of->flags, OPENFILE_GRACEFUL)) {
3875 log_unit_debug_errno(u, fd, "Failed to get OpenFile= file descriptor for %s, ignoring: %m", of->path);
3876 continue;
3877 }
3878
3879 return fd;
3880 }
3881
3882 if (!GREEDY_REALLOC(*fds, *n_fds + 1))
3883 return -ENOMEM;
3884
3885 r = strv_extend(fdnames, of->fdname);
3886 if (r < 0)
3887 return r;
3888
3889 (*fds)[*n_fds] = TAKE_FD(fd);
3890
3891 (*n_fds)++;
3892 }
3893
3894 return 0;
3895}
3896
3ff67ec4
ZJS
3897static void log_command_line(Unit *unit, const char *msg, const char *executable, char **argv) {
3898 assert(unit);
3899 assert(msg);
3900 assert(executable);
3901
3902 if (!DEBUG_LOGGING)
3903 return;
3904
3905 _cleanup_free_ char *cmdline = quote_command_line(argv, SHELL_ESCAPE_EMPTY);
3906
3907 log_unit_struct(unit, LOG_DEBUG,
3908 "EXECUTABLE=%s", executable,
3909 LOG_UNIT_MESSAGE(unit, "%s: %s", msg, strnull(cmdline)),
3910 LOG_UNIT_INVOCATION_ID(unit));
3911}
3912
170d978b
LP
3913static bool exec_context_need_unprivileged_private_users(
3914 const ExecContext *context,
3915 const ExecParameters *params) {
3916
6ef721cb 3917 assert(context);
170d978b 3918 assert(params);
6ef721cb
LB
3919
3920 /* These options require PrivateUsers= when used in user units, as we need to be in a user namespace
3921 * to have permission to enable them when not running as root. If we have effective CAP_SYS_ADMIN
3922 * (system manager) then we have privileges and don't need this. */
170d978b 3923 if (params->runtime_scope != RUNTIME_SCOPE_USER)
6ef721cb
LB
3924 return false;
3925
3926 return context->private_users ||
3927 context->private_tmp ||
3928 context->private_devices ||
3929 context->private_network ||
3930 context->network_namespace_path ||
3931 context->private_ipc ||
3932 context->ipc_namespace_path ||
adeff822 3933 context->private_mounts > 0 ||
6ef721cb
LB
3934 context->mount_apivfs ||
3935 context->n_bind_mounts > 0 ||
3936 context->n_temporary_filesystems > 0 ||
3937 context->root_directory ||
3938 !strv_isempty(context->extension_directories) ||
3939 context->protect_system != PROTECT_SYSTEM_NO ||
3940 context->protect_home != PROTECT_HOME_NO ||
3941 context->protect_kernel_tunables ||
3942 context->protect_kernel_modules ||
3943 context->protect_kernel_logs ||
3944 context->protect_control_groups ||
3945 context->protect_clock ||
3946 context->protect_hostname ||
3947 !strv_isempty(context->read_write_paths) ||
3948 !strv_isempty(context->read_only_paths) ||
3949 !strv_isempty(context->inaccessible_paths) ||
3950 !strv_isempty(context->exec_paths) ||
3951 !strv_isempty(context->no_exec_paths);
3952}
3953
154eb43f
LB
3954static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3955static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
3956
ff0af2a1 3957static int exec_child(
f2341e0a 3958 Unit *unit,
34cf6c43 3959 const ExecCommand *command,
ff0af2a1 3960 const ExecContext *context,
154eb43f 3961 ExecParameters *params,
28135da3 3962 ExecRuntime *runtime,
6bb00842 3963 const CGroupContext *cgroup_context,
12145637 3964 int *exit_status) {
d35fbf6b 3965
8c35c10d 3966 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **joined_exec_search_path = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3967 int r, ngids = 0, exec_fd;
4d885bd3
DH
3968 _cleanup_free_ gid_t *supplementary_gids = NULL;
3969 const char *username = NULL, *groupname = NULL;
73ff4d48 3970 _cleanup_free_ char *home_buffer = NULL, *memory_pressure_path = NULL;
2b3c1b9e 3971 const char *home = NULL, *shell = NULL;
7ca69792 3972 char **final_argv = NULL;
7bce046b
LP
3973 dev_t journal_stream_dev = 0;
3974 ino_t journal_stream_ino = 0;
5749f855 3975 bool userns_set_up = false;
165a31c0
LP
3976 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3977 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3978 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3979 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3980#if HAVE_SELINUX
7f59dd35 3981 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3982 bool use_selinux = false;
ecfbc84f 3983#endif
f9fa32f0 3984#if ENABLE_SMACK
43b1f709 3985 bool use_smack = false;
ecfbc84f 3986#endif
349cc4a5 3987#if HAVE_APPARMOR
43b1f709 3988 bool use_apparmor = false;
ecfbc84f 3989#endif
5749f855
AZ
3990 uid_t saved_uid = getuid();
3991 gid_t saved_gid = getgid();
fed1e721
LP
3992 uid_t uid = UID_INVALID;
3993 gid_t gid = GID_INVALID;
154eb43f 3994 size_t n_fds, /* fds to pass to the child */
1da37e58 3995 n_keep_fds; /* total number of fds not to close */
165a31c0 3996 int secure_bits;
afb11bf1
DG
3997 _cleanup_free_ gid_t *gids_after_pam = NULL;
3998 int ngids_after_pam = 0;
cd48e23f
RP
3999 _cleanup_free_ int *fds = NULL;
4000 _cleanup_strv_free_ char **fdnames = NULL;
154eb43f
LB
4001 int socket_fd = -EBADF, named_iofds[3] = { -EBADF, -EBADF, -EBADF }, *params_fds = NULL;
4002 size_t n_storage_fds = 0, n_socket_fds = 0;
034c6ed7 4003
f2341e0a 4004 assert(unit);
5cb5a6ff
LP
4005 assert(command);
4006 assert(context);
d35fbf6b 4007 assert(params);
ff0af2a1 4008 assert(exit_status);
d35fbf6b 4009
69339ae9
LP
4010 /* Explicitly test for CVE-2021-4034 inspired invocations */
4011 assert(command->path);
4012 assert(!strv_isempty(command->argv));
4013
154eb43f
LB
4014 if (context->std_input == EXEC_INPUT_SOCKET ||
4015 context->std_output == EXEC_OUTPUT_SOCKET ||
4016 context->std_error == EXEC_OUTPUT_SOCKET) {
4017
4018 if (params->n_socket_fds > 1)
4019 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
4020
4021 if (params->n_socket_fds == 0)
4022 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
4023
4024 socket_fd = params->fds[0];
4025 } else {
4026 params_fds = params->fds;
4027 n_socket_fds = params->n_socket_fds;
4028 n_storage_fds = params->n_storage_fds;
4029 }
4030 n_fds = n_socket_fds + n_storage_fds;
4031
4032 r = exec_context_named_iofds(context, params, named_iofds);
4033 if (r < 0)
4034 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4035
d35fbf6b
DM
4036 rename_process_from_path(command->path);
4037
9c274488
LP
4038 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
4039 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
4040 * both of which will be demoted to SIG_DFL. */
ce30c8dc 4041 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 4042 SIGNALS_IGNORE);
d35fbf6b
DM
4043
4044 if (context->ignore_sigpipe)
9c274488 4045 (void) ignore_signals(SIGPIPE);
d35fbf6b 4046
ff0af2a1
LP
4047 r = reset_signal_mask();
4048 if (r < 0) {
4049 *exit_status = EXIT_SIGNAL_MASK;
12145637 4050 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 4051 }
034c6ed7 4052
d35fbf6b
DM
4053 if (params->idle_pipe)
4054 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 4055
2c027c62
LP
4056 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
4057 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
4058 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
4059 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 4060
d35fbf6b 4061 log_forget_fds();
2c027c62 4062 log_set_open_when_needed(true);
a3b00f91 4063 log_settle_target();
3bb424c8
YW
4064 if (context->log_level_max >= 0)
4065 log_set_max_level(context->log_level_max);
4f2d528d 4066
40a80078
LP
4067 /* In case anything used libc syslog(), close this here, too */
4068 closelog();
4069
cd48e23f
RP
4070 fds = newdup(int, params_fds, n_fds);
4071 if (!fds) {
4072 *exit_status = EXIT_MEMORY;
4073 return log_oom();
4074 }
4075
4076 fdnames = strv_copy((char**) params->fd_names);
4077 if (!fdnames) {
4078 *exit_status = EXIT_MEMORY;
4079 return log_oom();
4080 }
4081
4082 r = collect_open_file_fds(unit, params->open_files, &fds, &fdnames, &n_fds);
4083 if (r < 0) {
4084 *exit_status = EXIT_FDS;
4085 return log_unit_error_errno(unit, r, "Failed to get OpenFile= file descriptors: %m");
4086 }
4087
b1994387 4088 int keep_fds[n_fds + 3];
1da37e58
ZJS
4089 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
4090 n_keep_fds = n_fds;
4091
4092 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
4093 if (r < 0) {
4094 *exit_status = EXIT_FDS;
4095 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4096 }
4097
b1994387 4098#if HAVE_LIBBPF
154eb43f
LB
4099 if (params->bpf_outer_map_fd >= 0) {
4100 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->bpf_outer_map_fd, (int *)&params->bpf_outer_map_fd);
b1994387
ILG
4101 if (r < 0) {
4102 *exit_status = EXIT_FDS;
4103 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4104 }
4105 }
4106#endif
4107
154eb43f 4108 r = close_remaining_fds(params, runtime, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
4109 if (r < 0) {
4110 *exit_status = EXIT_FDS;
12145637 4111 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
4112 }
4113
0af07108
ZJS
4114 if (!context->same_pgrp &&
4115 setsid() < 0) {
4116 *exit_status = EXIT_SETSID;
4117 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
4118 }
9e2f7c11 4119
1e22b5cd 4120 exec_context_tty_reset(context, params);
d35fbf6b 4121
154eb43f 4122 if (params->shall_confirm_spawn && unit_shall_confirm_spawn(unit)) {
3b20f877
FB
4123 _cleanup_free_ char *cmdline = NULL;
4124
4ef15008 4125 cmdline = quote_command_line(command->argv, SHELL_ESCAPE_EMPTY);
3b20f877 4126 if (!cmdline) {
0460aa5c 4127 *exit_status = EXIT_MEMORY;
12145637 4128 return log_oom();
3b20f877 4129 }
d35fbf6b 4130
4ef15008 4131 r = ask_for_confirmation(context, params->confirm_spawn, unit, cmdline);
3b20f877
FB
4132 if (r != CONFIRM_EXECUTE) {
4133 if (r == CONFIRM_PRETEND_SUCCESS) {
4134 *exit_status = EXIT_SUCCESS;
4135 return 0;
4136 }
5fa01ac0 4137
ff0af2a1 4138 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
4139 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
4140 "Execution cancelled by the user");
d35fbf6b
DM
4141 }
4142 }
1a63a750 4143
d521916d
LP
4144 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
4145 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
4146 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
4147 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
4148 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
4149 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
170d978b 4150 setenv("SYSTEMD_ACTIVATION_SCOPE", runtime_scope_to_string(params->runtime_scope), true) != 0) {
d521916d
LP
4151 *exit_status = EXIT_MEMORY;
4152 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
4153 }
4154
15220772 4155 if (context->dynamic_user && runtime && runtime->dynamic_creds) {
da50b85a 4156 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 4157
d521916d 4158 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 4159 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
4160 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
4161 *exit_status = EXIT_USER;
12145637 4162 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
4163 }
4164
da50b85a
LP
4165 r = compile_suggested_paths(context, params, &suggested_paths);
4166 if (r < 0) {
4167 *exit_status = EXIT_MEMORY;
4168 return log_oom();
4169 }
4170
15220772 4171 r = dynamic_creds_realize(runtime->dynamic_creds, suggested_paths, &uid, &gid);
ff0af2a1
LP
4172 if (r < 0) {
4173 *exit_status = EXIT_USER;
d85ff944
YW
4174 if (r == -EILSEQ)
4175 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4176 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 4177 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 4178 }
524daa8c 4179
70dd455c 4180 if (!uid_is_valid(uid)) {
29206d46 4181 *exit_status = EXIT_USER;
d85ff944 4182 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
4183 }
4184
4185 if (!gid_is_valid(gid)) {
4186 *exit_status = EXIT_USER;
d85ff944 4187 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 4188 }
5bc7452b 4189
15220772
DDM
4190 if (runtime->dynamic_creds->user)
4191 username = runtime->dynamic_creds->user->name;
29206d46
LP
4192
4193 } else {
1c943355
MY
4194 if (context->user) {
4195 r = get_fixed_user(context->user, &username, &uid, &gid, &home, &shell);
4196 if (r < 0) {
4197 *exit_status = EXIT_USER;
4198 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
4199 }
5bc7452b 4200 }
5bc7452b 4201
1c943355
MY
4202 if (context->group) {
4203 r = get_fixed_group(context->group, &groupname, &gid);
4204 if (r < 0) {
4205 *exit_status = EXIT_GROUP;
4206 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4207 }
4d885bd3 4208 }
cdc5d5c5 4209 }
29206d46 4210
cdc5d5c5
DH
4211 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
4212 r = get_supplementary_groups(context, username, groupname, gid,
4213 &supplementary_gids, &ngids);
4214 if (r < 0) {
4215 *exit_status = EXIT_GROUP;
12145637 4216 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 4217 }
5bc7452b 4218
154eb43f 4219 r = send_user_lookup(unit, params->user_lookup_fd, uid, gid);
00d9ef85
LP
4220 if (r < 0) {
4221 *exit_status = EXIT_USER;
12145637 4222 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
4223 }
4224
154eb43f 4225 params->user_lookup_fd = safe_close(params->user_lookup_fd);
00d9ef85 4226
6732edab
LP
4227 r = acquire_home(context, uid, &home, &home_buffer);
4228 if (r < 0) {
4229 *exit_status = EXIT_CHDIR;
12145637 4230 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
4231 }
4232
4a055e5a 4233 /* If a socket is connected to STDIN/STDOUT/STDERR, we must drop O_NONBLOCK */
d35fbf6b 4234 if (socket_fd >= 0)
a34ceba6 4235 (void) fd_nonblock(socket_fd, false);
acbb0225 4236
4c70a4a7
MS
4237 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
4238 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
4239 if (params->cgroup_path) {
4240 _cleanup_free_ char *p = NULL;
4241
a8b993dc 4242 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4c70a4a7
MS
4243 if (r < 0) {
4244 *exit_status = EXIT_CGROUP;
4245 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4246 }
4247
4248 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
702cf08f
YW
4249 if (r == -EUCLEAN) {
4250 *exit_status = EXIT_CGROUP;
4251 return log_unit_error_errno(unit, r, "Failed to attach process to cgroup %s "
4252 "because the cgroup or one of its parents or "
4253 "siblings is in the threaded mode: %m", p);
4254 }
4c70a4a7
MS
4255 if (r < 0) {
4256 *exit_status = EXIT_CGROUP;
4257 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
4258 }
4259 }
4260
28135da3
DDM
4261 if (context->network_namespace_path && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
4262 r = open_shareable_ns_path(runtime->shared->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
4263 if (r < 0) {
4264 *exit_status = EXIT_NETWORK;
4265 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
4266 }
4267 }
4268
28135da3
DDM
4269 if (context->ipc_namespace_path && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
4270 r = open_shareable_ns_path(runtime->shared->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
a70581ff
XR
4271 if (r < 0) {
4272 *exit_status = EXIT_NAMESPACE;
4273 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
4274 }
4275 }
4276
52c239d7 4277 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
4278 if (r < 0) {
4279 *exit_status = EXIT_STDIN;
12145637 4280 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 4281 }
034c6ed7 4282
52c239d7 4283 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4284 if (r < 0) {
4285 *exit_status = EXIT_STDOUT;
12145637 4286 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
4287 }
4288
52c239d7 4289 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4290 if (r < 0) {
4291 *exit_status = EXIT_STDERR;
12145637 4292 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4293 }
4294
d35fbf6b 4295 if (context->oom_score_adjust_set) {
bb44fd07
ZJS
4296 /* When we can't make this change due to EPERM, then let's silently skip over it. User
4297 * namespaces prohibit write access to this file, and we shouldn't trip up over that. */
9f8168eb 4298 r = set_oom_score_adjust(context->oom_score_adjust);
bb44fd07
ZJS
4299 if (ERRNO_IS_NEG_PRIVILEGE(r))
4300 log_unit_debug_errno(unit, r,
4301 "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
4302 else if (r < 0) {
4303 *exit_status = EXIT_OOM_ADJUST;
4304 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4305 }
d35fbf6b
DM
4306 }
4307
ad21e542
ZJS
4308 if (context->coredump_filter_set) {
4309 r = set_coredump_filter(context->coredump_filter);
bb44fd07
ZJS
4310 if (ERRNO_IS_NEG_PRIVILEGE(r))
4311 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
5fa01ac0
ZJS
4312 else if (r < 0) {
4313 *exit_status = EXIT_LIMITS;
bb44fd07 4314 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
5cf4c468 4315 }
ad21e542
ZJS
4316 }
4317
39090201
DJL
4318 if (context->nice_set) {
4319 r = setpriority_closest(context->nice);
5fa01ac0
ZJS
4320 if (r < 0) {
4321 *exit_status = EXIT_NICE;
39090201 4322 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
5fa01ac0 4323 }
39090201 4324 }
613b411c 4325
d35fbf6b
DM
4326 if (context->cpu_sched_set) {
4327 struct sched_param param = {
4328 .sched_priority = context->cpu_sched_priority,
4329 };
4330
ff0af2a1
LP
4331 r = sched_setscheduler(0,
4332 context->cpu_sched_policy |
4333 (context->cpu_sched_reset_on_fork ?
4334 SCHED_RESET_ON_FORK : 0),
4335 &param);
4336 if (r < 0) {
4337 *exit_status = EXIT_SETSCHEDULER;
12145637 4338 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4339 }
d35fbf6b 4340 }
fc9b2a84 4341
e2b2fb7f
MS
4342 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4343 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4344 const CPUSet *cpu_set;
4345
4346 if (context->cpu_affinity_from_numa) {
4347 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4348 if (r < 0) {
4349 *exit_status = EXIT_CPUAFFINITY;
4350 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4351 }
4352
4353 cpu_set = &converted_cpu_set;
4354 } else
4355 cpu_set = &context->cpu_set;
4356
4357 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4358 *exit_status = EXIT_CPUAFFINITY;
12145637 4359 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4360 }
e2b2fb7f 4361 }
034c6ed7 4362
b070c7c0
MS
4363 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4364 r = apply_numa_policy(&context->numa_policy);
bb44fd07
ZJS
4365 if (ERRNO_IS_NEG_NOT_SUPPORTED(r))
4366 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
4367 else if (r < 0) {
4368 *exit_status = EXIT_NUMA_POLICY;
4369 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
b070c7c0
MS
4370 }
4371 }
4372
d35fbf6b
DM
4373 if (context->ioprio_set)
4374 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4375 *exit_status = EXIT_IOPRIO;
12145637 4376 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4377 }
da726a4d 4378
d35fbf6b
DM
4379 if (context->timer_slack_nsec != NSEC_INFINITY)
4380 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4381 *exit_status = EXIT_TIMERSLACK;
12145637 4382 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4383 }
9eba9da4 4384
21022b9d
LP
4385 if (context->personality != PERSONALITY_INVALID) {
4386 r = safe_personality(context->personality);
4387 if (r < 0) {
ff0af2a1 4388 *exit_status = EXIT_PERSONALITY;
12145637 4389 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4390 }
21022b9d 4391 }
94f04347 4392
33331d11
VB
4393 if (context->utmp_id) {
4394 const char *line = context->tty_path ?
4395 (path_startswith(context->tty_path, "/dev/") ?: context->tty_path) :
4396 NULL;
df0ff127 4397 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
33331d11 4398 line,
023a4f67
LP
4399 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4400 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4401 USER_PROCESS,
6a93917d 4402 username);
33331d11 4403 }
d35fbf6b 4404
08f67696 4405 if (uid_is_valid(uid)) {
ff0af2a1
LP
4406 r = chown_terminal(STDIN_FILENO, uid);
4407 if (r < 0) {
4408 *exit_status = EXIT_STDIN;
12145637 4409 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4410 }
d35fbf6b 4411 }
8e274523 4412
6bb00842
LP
4413 if (params->cgroup_path) {
4414 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
4415 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4416 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
4417 * touch a single hierarchy too. */
4418
4419 if (params->flags & EXEC_CGROUP_DELEGATE) {
a8b993dc
LP
4420 _cleanup_free_ char *p = NULL;
4421
6bb00842
LP
4422 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
4423 if (r < 0) {
4424 *exit_status = EXIT_CGROUP;
4425 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
4426 }
a8b993dc
LP
4427
4428 r = exec_parameters_get_cgroup_path(params, cgroup_context, &p);
4429 if (r < 0) {
4430 *exit_status = EXIT_CGROUP;
4431 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
4432 }
4433 if (r > 0) {
bcd9b981 4434 r = cg_set_access_recursive(SYSTEMD_CGROUP_CONTROLLER, p, uid, gid);
a8b993dc
LP
4435 if (r < 0) {
4436 *exit_status = EXIT_CGROUP;
4437 return log_unit_error_errno(unit, r, "Failed to adjust control subgroup access: %m");
4438 }
4439 }
6bb00842
LP
4440 }
4441
4442 if (cgroup_context && cg_unified() > 0 && is_pressure_supported() > 0) {
4443 if (cgroup_context_want_memory_pressure(cgroup_context)) {
4444 r = cg_get_path("memory", params->cgroup_path, "memory.pressure", &memory_pressure_path);
4445 if (r < 0) {
4446 *exit_status = EXIT_MEMORY;
4447 return log_oom();
4448 }
4449
4450 r = chmod_and_chown(memory_pressure_path, 0644, uid, gid);
4451 if (r < 0) {
4452 log_unit_full_errno(unit, r == -ENOENT || ERRNO_IS_PRIVILEGE(r) ? LOG_DEBUG : LOG_WARNING, r,
4453 "Failed to adjust ownership of '%s', ignoring: %m", memory_pressure_path);
4454 memory_pressure_path = mfree(memory_pressure_path);
4455 }
4456 } else if (cgroup_context->memory_pressure_watch == CGROUP_PRESSURE_WATCH_OFF) {
4457 memory_pressure_path = strdup("/dev/null"); /* /dev/null is explicit indicator for turning of memory pressure watch */
4458 if (!memory_pressure_path) {
4459 *exit_status = EXIT_MEMORY;
4460 return log_oom();
4461 }
4462 }
034c6ed7 4463 }
d35fbf6b 4464 }
034c6ed7 4465
211a3d87
LB
4466 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
4467
5b10116e 4468 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
59dd2bbb 4469 r = setup_exec_directory(unit, context, params, uid, gid, dt, needs_mount_namespace, exit_status);
12145637
LP
4470 if (r < 0)
4471 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4472 }
94f04347 4473
bb0c0d6f 4474 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
43962c30 4475 r = exec_setup_credentials(context, params, unit->id, uid, gid);
bb0c0d6f
LP
4476 if (r < 0) {
4477 *exit_status = EXIT_CREDENTIALS;
4478 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4479 }
4480 }
4481
7bce046b 4482 r = build_environment(
fd63e712 4483 unit,
7bce046b
LP
4484 context,
4485 params,
6bb00842 4486 cgroup_context,
7bce046b 4487 n_fds,
cd48e23f 4488 fdnames,
7bce046b
LP
4489 home,
4490 username,
4491 shell,
4492 journal_stream_dev,
4493 journal_stream_ino,
6bb00842 4494 memory_pressure_path,
7bce046b 4495 &our_env);
2065ca69
JW
4496 if (r < 0) {
4497 *exit_status = EXIT_MEMORY;
12145637 4498 return log_oom();
2065ca69
JW
4499 }
4500
4501 r = build_pass_environment(context, &pass_env);
4502 if (r < 0) {
4503 *exit_status = EXIT_MEMORY;
12145637 4504 return log_oom();
2065ca69
JW
4505 }
4506
adf769b0
ZJS
4507 /* The $PATH variable is set to the default path in params->environment. However, this is overridden
4508 * if user-specified fields have $PATH set. The intention is to also override $PATH if the unit does
4509 * not specify PATH but the unit has ExecSearchPath. */
8c35c10d 4510 if (!strv_isempty(context->exec_search_path)) {
4511 _cleanup_free_ char *joined = NULL;
4512
4513 joined = strv_join(context->exec_search_path, ":");
4514 if (!joined) {
4515 *exit_status = EXIT_MEMORY;
4516 return log_oom();
4517 }
4518
4519 r = strv_env_assign(&joined_exec_search_path, "PATH", joined);
4520 if (r < 0) {
4521 *exit_status = EXIT_MEMORY;
4522 return log_oom();
4523 }
4524 }
4525
4ab3d29f 4526 accum_env = strv_env_merge(params->environment,
2065ca69 4527 our_env,
8c35c10d 4528 joined_exec_search_path,
2065ca69
JW
4529 pass_env,
4530 context->environment,
154eb43f 4531 params->files_env);
2065ca69
JW
4532 if (!accum_env) {
4533 *exit_status = EXIT_MEMORY;
12145637 4534 return log_oom();
2065ca69 4535 }
1280503b 4536 accum_env = strv_env_clean(accum_env);
2065ca69 4537
096424d1 4538 (void) umask(context->umask);
b213e1c1 4539
b1edf445 4540 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4541 if (r < 0) {
4542 *exit_status = EXIT_KEYRING;
12145637 4543 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4544 }
4545
adf769b0
ZJS
4546 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted
4547 * from it. */
1703fa41 4548 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4549
adf769b0
ZJS
4550 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked
4551 * for it, and the kernel doesn't actually support ambient caps. */
165a31c0 4552 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4553
adf769b0
ZJS
4554 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly
4555 * excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not
4556 * desired. */
165a31c0
LP
4557 if (needs_ambient_hack)
4558 needs_setuid = false;
4559 else
4560 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4561
638fd8cc
LP
4562 uint64_t capability_ambient_set = context->capability_ambient_set;
4563
165a31c0 4564 if (needs_sandboxing) {
adf769b0
ZJS
4565 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on
4566 * /sys being present. The actual MAC context application will happen later, as late as
4567 * possible, to avoid impacting our own code paths. */
7f18ef0a 4568
349cc4a5 4569#if HAVE_SELINUX
43b1f709 4570 use_selinux = mac_selinux_use();
7f18ef0a 4571#endif
f9fa32f0 4572#if ENABLE_SMACK
43b1f709 4573 use_smack = mac_smack_use();
7f18ef0a 4574#endif
349cc4a5 4575#if HAVE_APPARMOR
43b1f709 4576 use_apparmor = mac_apparmor_use();
7f18ef0a 4577#endif
165a31c0 4578 }
7f18ef0a 4579
ce932d2d
LP
4580 if (needs_sandboxing) {
4581 int which_failed;
4582
4583 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4584 * is set here. (See below.) */
4585
4586 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4587 if (r < 0) {
4588 *exit_status = EXIT_LIMITS;
4589 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4590 }
4591 }
4592
0af07108 4593 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4594 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4595 * wins here. (See above.) */
4596
1da37e58 4597 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4598 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4599 if (r < 0) {
4600 *exit_status = EXIT_PAM;
4601 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4602 }
ac45f971 4603
638fd8cc
LP
4604 if (ambient_capabilities_supported()) {
4605 uint64_t ambient_after_pam;
4606
4607 /* PAM modules might have set some ambient caps. Query them here and merge them into
4608 * the caps we want to set in the end, so that we don't end up unsetting them. */
4609 r = capability_get_ambient(&ambient_after_pam);
4610 if (r < 0) {
4611 *exit_status = EXIT_CAPABILITIES;
4612 return log_unit_error_errno(unit, r, "Failed to query ambient caps: %m");
4613 }
4614
4615 capability_ambient_set |= ambient_after_pam;
4616 }
4617
0af07108
ZJS
4618 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4619 if (ngids_after_pam < 0) {
4620 *exit_status = EXIT_MEMORY;
4621 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4622 }
b213e1c1 4623 }
5749f855 4624
170d978b 4625 if (needs_sandboxing && exec_context_need_unprivileged_private_users(context, params)) {
5749f855
AZ
4626 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4627 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4628 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108 4629
0af07108 4630 r = setup_private_users(saved_uid, saved_gid, uid, gid);
6ef721cb
LB
4631 /* If it was requested explicitly and we can't set it up, fail early. Otherwise, continue and let
4632 * the actual requested operations fail (or silently continue). */
4633 if (r < 0 && context->private_users) {
0af07108
ZJS
4634 *exit_status = EXIT_USER;
4635 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855 4636 }
6ef721cb
LB
4637 if (r < 0)
4638 log_unit_info_errno(unit, r, "Failed to set up user namespacing for unprivileged user, ignoring: %m");
4639 else
4640 userns_set_up = true;
5749f855
AZ
4641 }
4642
28135da3 4643 if (exec_needs_network_namespace(context) && runtime && runtime->shared && runtime->shared->netns_storage_socket[0] >= 0) {
a8d08f39 4644
5a3627e5
LP
4645 /* Try to enable network namespacing if network namespacing is available and we have
4646 * CAP_NET_ADMIN. We need CAP_NET_ADMIN to be able to configure the loopback device in the
4647 * new network namespace. And if we don't have that, then we could only create a network
4648 * namespace without the ability to set up "lo". Hence gracefully skip things then. */
4649 if (ns_type_supported(NAMESPACE_NET) && have_effective_cap(CAP_NET_ADMIN) > 0) {
28135da3 4650 r = setup_shareable_ns(runtime->shared->netns_storage_socket, CLONE_NEWNET);
bb44fd07
ZJS
4651 if (ERRNO_IS_NEG_PRIVILEGE(r))
4652 log_unit_notice_errno(unit, r,
4653 "PrivateNetwork=yes is configured, but network namespace setup not permitted, proceeding without: %m");
4654 else if (r < 0) {
4655 *exit_status = EXIT_NETWORK;
4656 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
6e2d7c4f 4657 }
a8d08f39
LP
4658 } else if (context->network_namespace_path) {
4659 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4660 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4661 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f 4662 } else
5a3627e5 4663 log_unit_notice(unit, "PrivateNetwork=yes is configured, but the kernel does not support or we lack privileges for network namespace, proceeding without.");
d35fbf6b 4664 }
169c1bda 4665
28135da3 4666 if (exec_needs_ipc_namespace(context) && runtime && runtime->shared && runtime->shared->ipcns_storage_socket[0] >= 0) {
a70581ff
XR
4667
4668 if (ns_type_supported(NAMESPACE_IPC)) {
28135da3 4669 r = setup_shareable_ns(runtime->shared->ipcns_storage_socket, CLONE_NEWIPC);
a70581ff
XR
4670 if (r == -EPERM)
4671 log_unit_warning_errno(unit, r,
4672 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4673 else if (r < 0) {
4674 *exit_status = EXIT_NAMESPACE;
4675 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4676 }
4677 } else if (context->ipc_namespace_path) {
4678 *exit_status = EXIT_NAMESPACE;
4679 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4680 "IPCNamespacePath= is not supported, refusing.");
4681 } else
4682 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4683 }
4684
ee818b89 4685 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4686 _cleanup_free_ char *error_path = NULL;
4687
73ff4d48 4688 r = apply_mount_namespace(unit, command->flags, context, params, runtime, memory_pressure_path, &error_path);
3fbe8dbe
LP
4689 if (r < 0) {
4690 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4691 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4692 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4693 }
d35fbf6b 4694 }
81a2b7ce 4695
daf8f72b
LP
4696 if (needs_sandboxing) {
4697 r = apply_protect_hostname(unit, context, exit_status);
4698 if (r < 0)
4699 return r;
aecd5ac6
TM
4700 }
4701
85614c6e
SR
4702 if (context->memory_ksm >= 0)
4703 if (prctl(PR_SET_MEMORY_MERGE, context->memory_ksm) < 0) {
4704 if (ERRNO_IS_NOT_SUPPORTED(errno))
4705 log_unit_debug_errno(unit, errno, "KSM support not available, ignoring.");
4706 else {
4707 *exit_status = EXIT_KSM;
4708 return log_unit_error_errno(unit, errno, "Failed to set KSM: %m");
4709 }
4710 }
4711
5749f855
AZ
4712 /* Drop groups as early as possible.
4713 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4714 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4715 if (needs_setuid) {
afb11bf1
DG
4716 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4717 int ngids_to_enforce = 0;
4718
4719 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4720 ngids,
4721 gids_after_pam,
4722 ngids_after_pam,
4723 &gids_to_enforce);
4724 if (ngids_to_enforce < 0) {
4725 *exit_status = EXIT_MEMORY;
4726 return log_unit_error_errno(unit,
4727 ngids_to_enforce,
4728 "Failed to merge group lists. Group membership might be incorrect: %m");
4729 }
4730
4731 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4732 if (r < 0) {
4733 *exit_status = EXIT_GROUP;
12145637 4734 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4735 }
165a31c0 4736 }
096424d1 4737
5749f855
AZ
4738 /* If the user namespace was not set up above, try to do it now.
4739 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
d09df6b9 4740 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
5749f855
AZ
4741 * case of mount namespaces being less privileged when the mount point list is copied from a
4742 * different user namespace). */
9008e1ac 4743
5749f855
AZ
4744 if (needs_sandboxing && context->private_users && !userns_set_up) {
4745 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4746 if (r < 0) {
4747 *exit_status = EXIT_USER;
4748 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4749 }
4750 }
4751
9f71ba8d
ZJS
4752 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4753 * shall execute. */
4754
4755 _cleanup_free_ char *executable = NULL;
254d1313 4756 _cleanup_close_ int executable_fd = -EBADF;
8c35c10d 4757 r = find_executable_full(command->path, /* root= */ NULL, context->exec_search_path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4758 if (r < 0) {
4759 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4760 log_unit_struct_errno(unit, LOG_INFO, r,
4761 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4762 LOG_UNIT_INVOCATION_ID(unit),
4763 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4764 command->path),
4765 "EXECUTABLE=%s", command->path);
5fa01ac0 4766 *exit_status = EXIT_SUCCESS;
9f71ba8d
ZJS
4767 return 0;
4768 }
4769
4770 *exit_status = EXIT_EXEC;
c2503e35
RH
4771 return log_unit_struct_errno(unit, LOG_INFO, r,
4772 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4773 LOG_UNIT_INVOCATION_ID(unit),
4774 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4775 command->path),
4776 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4777 }
4778
b83d5050
ZJS
4779 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4780 if (r < 0) {
4781 *exit_status = EXIT_FDS;
4782 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4783 }
4784
9f71ba8d 4785#if HAVE_SELINUX
49590d67 4786 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
254d1313 4787 int fd = -EBADF;
49590d67
MS
4788
4789 if (socket_fd >= 0)
4790 fd = socket_fd;
4791 else if (params->n_socket_fds == 1)
4792 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4793 * use context from that fd to compute the label. */
4794 fd = params->fds[0];
4795
4796 if (fd >= 0) {
4797 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
006d1864
TM
4798 if (r < 0) {
4799 if (!context->selinux_context_ignore) {
4800 *exit_status = EXIT_SELINUX_CONTEXT;
4801 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4802 }
4803 log_unit_debug_errno(unit, r, "Failed to determine SELinux context, ignoring: %m");
49590d67 4804 }
9f71ba8d
ZJS
4805 }
4806 }
4807#endif
4808
4a055e5a
ZJS
4809 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that
4810 * we are more aggressive this time, since we don't need socket_fd and the netns and ipcns fds any
4811 * more. We do keep exec_fd however, if we have it, since we need to keep it open until the final
4812 * execve(). */
5686391b 4813
1da37e58 4814 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4815 if (r >= 0)
4816 r = shift_fds(fds, n_fds);
4817 if (r >= 0)
cd48e23f 4818 r = flags_fds(fds, n_socket_fds, n_fds, context->non_blocking);
ff0af2a1
LP
4819 if (r < 0) {
4820 *exit_status = EXIT_FDS;
12145637 4821 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4822 }
e66cf1a3 4823
5686391b
LP
4824 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4825 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4826 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4827 * came this far. */
4828
165a31c0 4829 secure_bits = context->secure_bits;
e66cf1a3 4830
165a31c0
LP
4831 if (needs_sandboxing) {
4832 uint64_t bset;
e66cf1a3 4833
4a055e5a
ZJS
4834 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested.
4835 * (Note this is placed after the general resource limit initialization, see above, in order
4836 * to take precedence.) */
f4170c67
LP
4837 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4838 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4839 *exit_status = EXIT_LIMITS;
12145637 4840 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4841 }
4842 }
4843
37ac2744
JB
4844#if ENABLE_SMACK
4845 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4846 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4847 if (use_smack) {
154eb43f 4848 r = setup_smack(params, context, executable_fd);
29ff6247 4849 if (r < 0 && !context->smack_process_label_ignore) {
37ac2744
JB
4850 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4851 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4852 }
4853 }
4854#endif
4855
165a31c0
LP
4856 bset = context->capability_bounding_set;
4857 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4858 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4859 * instead of us doing that */
4860 if (needs_ambient_hack)
4861 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4862 (UINT64_C(1) << CAP_SETUID) |
4863 (UINT64_C(1) << CAP_SETGID);
4864
4865 if (!cap_test_all(bset)) {
638fd8cc 4866 r = capability_bounding_set_drop(bset, /* right_now= */ false);
ff0af2a1
LP
4867 if (r < 0) {
4868 *exit_status = EXIT_CAPABILITIES;
12145637 4869 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4870 }
4c2630eb 4871 }
3b8bddde 4872
16fcb191
TK
4873 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4874 * keep-caps set.
a954b249
LP
4875 *
4876 * To be able to raise the ambient capabilities after setresuid() they have to be added to
4877 * the inherited set and keep caps has to be set (done in enforce_user()). After setresuid()
4878 * the ambient capabilities can be raised as they are present in the permitted and
4879 * inheritable set. However it is possible that someone wants to set ambient capabilities
4880 * without changing the user, so we also set the ambient capabilities here.
4881 *
4882 * The requested ambient capabilities are raised in the inheritable set if the second
4883 * argument is true. */
943800f4 4884 if (!needs_ambient_hack) {
638fd8cc 4885 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ true);
755d4b67
IP
4886 if (r < 0) {
4887 *exit_status = EXIT_CAPABILITIES;
12145637 4888 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4889 }
755d4b67 4890 }
165a31c0 4891 }
755d4b67 4892
fa97f630 4893 /* chroot to root directory first, before we lose the ability to chroot */
9c0c6701 4894 r = apply_root_directory(context, params, runtime, needs_mount_namespace, exit_status);
fa97f630
JB
4895 if (r < 0)
4896 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4897
165a31c0 4898 if (needs_setuid) {
08f67696 4899 if (uid_is_valid(uid)) {
638fd8cc 4900 r = enforce_user(context, uid, capability_ambient_set);
ff0af2a1
LP
4901 if (r < 0) {
4902 *exit_status = EXIT_USER;
12145637 4903 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4904 }
165a31c0 4905
638fd8cc 4906 if (!needs_ambient_hack && capability_ambient_set != 0) {
755d4b67 4907
16fcb191 4908 /* Raise the ambient capabilities after user change. */
638fd8cc 4909 r = capability_ambient_set_apply(capability_ambient_set, /* also_inherit= */ false);
755d4b67
IP
4910 if (r < 0) {
4911 *exit_status = EXIT_CAPABILITIES;
12145637 4912 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4913 }
755d4b67 4914 }
5b6319dc 4915 }
165a31c0 4916 }
d35fbf6b 4917
56ef8db9
JB
4918 /* Apply working directory here, because the working directory might be on NFS and only the user running
4919 * this service might have the correct privilege to change to the working directory */
9c0c6701 4920 r = apply_working_directory(context, params, runtime, home, exit_status);
56ef8db9
JB
4921 if (r < 0)
4922 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4923
165a31c0 4924 if (needs_sandboxing) {
37ac2744 4925 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4926 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4927 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4928 * are restricted. */
4929
349cc4a5 4930#if HAVE_SELINUX
43b1f709 4931 if (use_selinux) {
5cd9cd35
LP
4932 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4933
4934 if (exec_context) {
4935 r = setexeccon(exec_context);
006d1864
TM
4936 if (r < 0) {
4937 if (!context->selinux_context_ignore) {
4938 *exit_status = EXIT_SELINUX_CONTEXT;
4939 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
4940 }
4941 log_unit_debug_errno(unit, r, "Failed to change SELinux context to %s, ignoring: %m", exec_context);
5cd9cd35
LP
4942 }
4943 }
4944 }
4945#endif
4946
349cc4a5 4947#if HAVE_APPARMOR
43b1f709 4948 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4949 r = aa_change_onexec(context->apparmor_profile);
4950 if (r < 0 && !context->apparmor_profile_ignore) {
4951 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4952 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4953 }
4954 }
4955#endif
4956
a954b249
LP
4957 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential
4958 * EPERMs we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits
4959 * requires CAP_SETPCAP. */
dbdc4098 4960 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4961 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098 4962 * effective set here.
a954b249
LP
4963 *
4964 * The effective set is overwritten during execve() with the following values:
4965 *
dbdc4098 4966 * - ambient set (for non-root processes)
a954b249 4967 *
dbdc4098
TK
4968 * - (inheritable | bounding) set (for root processes)
4969 *
4970 * Hence there is no security impact to raise it in the effective set before execve
4971 */
a954b249 4972 r = capability_gain_cap_setpcap(/* return_caps= */ NULL);
dbdc4098
TK
4973 if (r < 0) {
4974 *exit_status = EXIT_CAPABILITIES;
4975 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4976 }
755d4b67 4977 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4978 *exit_status = EXIT_SECUREBITS;
12145637 4979 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4980 }
dbdc4098 4981 }
5b6319dc 4982
59eeb84b 4983 if (context_has_no_new_privileges(context))
d35fbf6b 4984 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4985 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4986 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4987 }
4988
349cc4a5 4989#if HAVE_SECCOMP
469830d1
LP
4990 r = apply_address_families(unit, context);
4991 if (r < 0) {
4992 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4993 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4994 }
04aa0cb9 4995
469830d1
LP
4996 r = apply_memory_deny_write_execute(unit, context);
4997 if (r < 0) {
4998 *exit_status = EXIT_SECCOMP;
12145637 4999 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 5000 }
f4170c67 5001
469830d1
LP
5002 r = apply_restrict_realtime(unit, context);
5003 if (r < 0) {
5004 *exit_status = EXIT_SECCOMP;
12145637 5005 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
5006 }
5007
f69567cb
LP
5008 r = apply_restrict_suid_sgid(unit, context);
5009 if (r < 0) {
5010 *exit_status = EXIT_SECCOMP;
5011 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
5012 }
5013
add00535
LP
5014 r = apply_restrict_namespaces(unit, context);
5015 if (r < 0) {
5016 *exit_status = EXIT_SECCOMP;
12145637 5017 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
5018 }
5019
469830d1
LP
5020 r = apply_protect_sysctl(unit, context);
5021 if (r < 0) {
5022 *exit_status = EXIT_SECCOMP;
12145637 5023 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
5024 }
5025
469830d1
LP
5026 r = apply_protect_kernel_modules(unit, context);
5027 if (r < 0) {
5028 *exit_status = EXIT_SECCOMP;
12145637 5029 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
5030 }
5031
84703040
KK
5032 r = apply_protect_kernel_logs(unit, context);
5033 if (r < 0) {
5034 *exit_status = EXIT_SECCOMP;
5035 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
5036 }
5037
fc64760d
KK
5038 r = apply_protect_clock(unit, context);
5039 if (r < 0) {
5040 *exit_status = EXIT_SECCOMP;
5041 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
5042 }
5043
469830d1
LP
5044 r = apply_private_devices(unit, context);
5045 if (r < 0) {
5046 *exit_status = EXIT_SECCOMP;
12145637 5047 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
5048 }
5049
5050 r = apply_syscall_archs(unit, context);
5051 if (r < 0) {
5052 *exit_status = EXIT_SECCOMP;
12145637 5053 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
5054 }
5055
78e864e5
TM
5056 r = apply_lock_personality(unit, context);
5057 if (r < 0) {
5058 *exit_status = EXIT_SECCOMP;
12145637 5059 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
5060 }
5061
9df2cdd8
TM
5062 r = apply_syscall_log(unit, context);
5063 if (r < 0) {
5064 *exit_status = EXIT_SECCOMP;
5065 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
5066 }
5067
5cd9cd35
LP
5068 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
5069 * by the filter as little as possible. */
165a31c0 5070 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
5071 if (r < 0) {
5072 *exit_status = EXIT_SECCOMP;
12145637 5073 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
5074 }
5075#endif
b1994387
ILG
5076
5077#if HAVE_LIBBPF
154eb43f 5078 r = apply_restrict_filesystems(unit, context, params);
b1994387
ILG
5079 if (r < 0) {
5080 *exit_status = EXIT_BPF;
5081 return log_unit_error_errno(unit, r, "Failed to restrict filesystems: %m");
5082 }
5083#endif
5084
d35fbf6b 5085 }
034c6ed7 5086
00819cc1
LP
5087 if (!strv_isempty(context->unset_environment)) {
5088 char **ee = NULL;
5089
5090 ee = strv_env_delete(accum_env, 1, context->unset_environment);
5091 if (!ee) {
5092 *exit_status = EXIT_MEMORY;
12145637 5093 return log_oom();
00819cc1
LP
5094 }
5095
130d3d22 5096 strv_free_and_replace(accum_env, ee);
00819cc1
LP
5097 }
5098
7ca69792 5099 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
f331434d
LP
5100 _cleanup_strv_free_ char **unset_variables = NULL, **bad_variables = NULL;
5101
5102 r = replace_env_argv(command->argv, accum_env, &replaced_argv, &unset_variables, &bad_variables);
5103 if (r < 0) {
7ca69792 5104 *exit_status = EXIT_MEMORY;
f331434d 5105 return log_unit_error_errno(unit, r, "Failed to replace environment variables: %m");
7ca69792
AZ
5106 }
5107 final_argv = replaced_argv;
f331434d
LP
5108
5109 if (!strv_isempty(unset_variables)) {
5110 _cleanup_free_ char *ju = strv_join(unset_variables, ", ");
5111 log_unit_warning(unit, "Referenced but unset environment variable evaluates to an empty string: %s", strna(ju));
5112 }
5113
5114 if (!strv_isempty(bad_variables)) {
5115 _cleanup_free_ char *jb = strv_join(bad_variables, ", ");
5116 log_unit_warning(unit, "Invalid environment variable name evaluates to an empty string: %s", strna(jb));;
5117 }
7ca69792
AZ
5118 } else
5119 final_argv = command->argv;
034c6ed7 5120
3ff67ec4 5121 log_command_line(unit, "Executing", executable, final_argv);
dd305ec9 5122
5686391b
LP
5123 if (exec_fd >= 0) {
5124 uint8_t hot = 1;
5125
5126 /* We have finished with all our initializations. Let's now let the manager know that. From this point
5127 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
5128
5129 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5130 *exit_status = EXIT_EXEC;
5131 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
5132 }
5133 }
5134
a6d9111c 5135 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
5136
5137 if (exec_fd >= 0) {
5138 uint8_t hot = 0;
5139
5140 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
5141 * that POLLHUP on it no longer means execve() succeeded. */
5142
5143 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
5144 *exit_status = EXIT_EXEC;
5145 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
5146 }
5147 }
12145637 5148
ff0af2a1 5149 *exit_status = EXIT_EXEC;
9f71ba8d 5150 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 5151}
81a2b7ce 5152
34cf6c43 5153
f2341e0a
LP
5154int exec_spawn(Unit *unit,
5155 ExecCommand *command,
d35fbf6b 5156 const ExecContext *context,
154eb43f 5157 ExecParameters *params,
28135da3 5158 ExecRuntime *runtime,
6bb00842 5159 const CGroupContext *cgroup_context,
d35fbf6b 5160 pid_t *ret) {
8351ceae 5161
78f93209 5162 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 5163 pid_t pid;
154eb43f 5164 int r;
8351ceae 5165
f2341e0a 5166 assert(unit);
154eb43f 5167 assert(unit->manager);
d35fbf6b
DM
5168 assert(command);
5169 assert(context);
5170 assert(ret);
5171 assert(params);
25b583d7 5172 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 5173
4b2af439
DDM
5174 LOG_CONTEXT_PUSH_UNIT(unit);
5175
154eb43f 5176 r = exec_context_load_environment(unit, context, &params->files_env);
ff0af2a1 5177 if (r < 0)
f2341e0a 5178 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 5179
9f71ba8d
ZJS
5180 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
5181 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
5182 mac_selinux_maybe_reload();
5183
3ff67ec4
ZJS
5184 /* We won't know the real executable path until we create the mount namespace in the child, but we
5185 want to log from the parent, so we use the possibly inaccurate path here. */
5186 log_command_line(unit, "About to execute", command->path, command->argv);
12145637 5187
78f93209 5188 if (params->cgroup_path) {
a8b993dc 5189 r = exec_parameters_get_cgroup_path(params, cgroup_context, &subcgroup_path);
78f93209
LP
5190 if (r < 0)
5191 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
18c1e481
LP
5192 if (r > 0) {
5193 /* If there's a subcgroup, then let's create it here now (the main cgroup was already
5194 * realized by the unit logic) */
5195
78f93209
LP
5196 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
5197 if (r < 0)
a8b993dc 5198 return log_unit_error_errno(unit, r, "Failed to create subcgroup '%s': %m", subcgroup_path);
78f93209
LP
5199 }
5200 }
5201
d35fbf6b
DM
5202 pid = fork();
5203 if (pid < 0)
74129a12 5204 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
5205
5206 if (pid == 0) {
5fa01ac0 5207 int exit_status;
ff0af2a1 5208
f2341e0a
LP
5209 r = exec_child(unit,
5210 command,
ff0af2a1
LP
5211 context,
5212 params,
5213 runtime,
6bb00842 5214 cgroup_context,
12145637
LP
5215 &exit_status);
5216
e1714f02 5217 if (r < 0) {
5fa01ac0
ZJS
5218 const char *status = ASSERT_PTR(
5219 exit_status_to_string(exit_status, EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD));
e1714f02 5220
c2503e35
RH
5221 log_unit_struct_errno(unit, LOG_ERR, r,
5222 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
5223 LOG_UNIT_INVOCATION_ID(unit),
5224 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
5225 status, command->path),
5226 "EXECUTABLE=%s", command->path);
5fa01ac0
ZJS
5227 } else
5228 assert(exit_status == EXIT_SUCCESS);
4c2630eb 5229
ff0af2a1 5230 _exit(exit_status);
034c6ed7
LP
5231 }
5232
f2341e0a 5233 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 5234
78f93209
LP
5235 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
5236 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
5237 * process will be killed too). */
5238 if (subcgroup_path)
5239 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 5240
b58b4116 5241 exec_status_start(&command->exec_status, pid);
9fb86720 5242
034c6ed7 5243 *ret = pid;
5cb5a6ff
LP
5244 return 0;
5245}
5246
034c6ed7
LP
5247void exec_context_init(ExecContext *c) {
5248 assert(c);
5249
154eb43f
LB
5250 /* When initializing a bool member to 'true', make sure to serialize in execute-serialize.c using
5251 * serialize_bool() instead of serialize_bool_elide(). */
5252
02131627
LB
5253 *c = (ExecContext) {
5254 .umask = 0022,
5255 .ioprio = IOPRIO_DEFAULT_CLASS_AND_PRIO,
5256 .cpu_sched_policy = SCHED_OTHER,
5257 .syslog_priority = LOG_DAEMON|LOG_INFO,
5258 .syslog_level_prefix = true,
5259 .ignore_sigpipe = true,
5260 .timer_slack_nsec = NSEC_INFINITY,
5261 .personality = PERSONALITY_INVALID,
5262 .timeout_clean_usec = USEC_INFINITY,
5263 .capability_bounding_set = CAP_MASK_UNSET,
5264 .restrict_namespaces = NAMESPACE_FLAGS_INITIAL,
5265 .log_level_max = -1,
005bfaf1 5266#if HAVE_SECCOMP
02131627 5267 .syscall_errno = SECCOMP_ERROR_NUMBER_KILL,
005bfaf1 5268#endif
02131627
LB
5269 .tty_rows = UINT_MAX,
5270 .tty_cols = UINT_MAX,
5271 .private_mounts = -1,
5272 .memory_ksm = -1,
854eca4a 5273 .set_login_environment = -1,
02131627
LB
5274 };
5275
59026bcc
MY
5276 FOREACH_ARRAY(d, c->directories, _EXEC_DIRECTORY_TYPE_MAX)
5277 d->mode = 0755;
02131627 5278
b070c7c0 5279 numa_policy_reset(&c->numa_policy);
02131627
LB
5280
5281 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
034c6ed7
LP
5282}
5283
613b411c 5284void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
5285 assert(c);
5286
6796073e
LP
5287 c->environment = strv_free(c->environment);
5288 c->environment_files = strv_free(c->environment_files);
b4c14404 5289 c->pass_environment = strv_free(c->pass_environment);
00819cc1 5290 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 5291
31ce987c 5292 rlimit_free_all(c->rlimit);
034c6ed7 5293
5b10116e 5294 for (size_t l = 0; l < 3; l++) {
52c239d7 5295 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
5296 c->stdio_file[l] = mfree(c->stdio_file[l]);
5297 }
52c239d7 5298
a1e58e8e
LP
5299 c->working_directory = mfree(c->working_directory);
5300 c->root_directory = mfree(c->root_directory);
915e6d16 5301 c->root_image = mfree(c->root_image);
18d73705 5302 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
5303 c->root_hash = mfree(c->root_hash);
5304 c->root_hash_size = 0;
5305 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
5306 c->root_hash_sig = mfree(c->root_hash_sig);
5307 c->root_hash_sig_size = 0;
5308 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 5309 c->root_verity = mfree(c->root_verity);
93f59701 5310 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a07b9926 5311 c->extension_directories = strv_free(c->extension_directories);
a1e58e8e
LP
5312 c->tty_path = mfree(c->tty_path);
5313 c->syslog_identifier = mfree(c->syslog_identifier);
5314 c->user = mfree(c->user);
5315 c->group = mfree(c->group);
034c6ed7 5316
6796073e 5317 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 5318
a1e58e8e 5319 c->pam_name = mfree(c->pam_name);
5b6319dc 5320
2a624c36
AP
5321 c->read_only_paths = strv_free(c->read_only_paths);
5322 c->read_write_paths = strv_free(c->read_write_paths);
5323 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
5324 c->exec_paths = strv_free(c->exec_paths);
5325 c->no_exec_paths = strv_free(c->no_exec_paths);
8c35c10d 5326 c->exec_search_path = strv_free(c->exec_search_path);
82c121a4 5327
d2d6c096 5328 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
5329 c->bind_mounts = NULL;
5330 c->n_bind_mounts = 0;
2abd4e38
YW
5331 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
5332 c->temporary_filesystems = NULL;
5333 c->n_temporary_filesystems = 0;
b3d13314 5334 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 5335
0985c7c4 5336 cpu_set_reset(&c->cpu_set);
b070c7c0 5337 numa_policy_reset(&c->numa_policy);
86a3475b 5338
a1e58e8e
LP
5339 c->utmp_id = mfree(c->utmp_id);
5340 c->selinux_context = mfree(c->selinux_context);
5341 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 5342 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 5343
9b412709 5344 c->restrict_filesystems = set_free_free(c->restrict_filesystems);
b1994387 5345
8cfa775f 5346 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
5347 c->syscall_archs = set_free(c->syscall_archs);
5348 c->address_families = set_free(c->address_families);
e66cf1a3 5349
5b10116e 5350 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 5351 exec_directory_done(&c->directories[t]);
d3070fbd
LP
5352
5353 c->log_level_max = -1;
5354
5355 exec_context_free_log_extra_fields(c);
9b412709
FS
5356 c->log_filter_allowed_patterns = set_free_free(c->log_filter_allowed_patterns);
5357 c->log_filter_denied_patterns = set_free_free(c->log_filter_denied_patterns);
08f3be7a 5358
5ac1530e
ZJS
5359 c->log_ratelimit_interval_usec = 0;
5360 c->log_ratelimit_burst = 0;
90fc172e 5361
08f3be7a
LP
5362 c->stdin_data = mfree(c->stdin_data);
5363 c->stdin_data_size = 0;
a8d08f39
LP
5364
5365 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 5366 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
5367
5368 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 5369
43144be4 5370 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 5371 c->set_credentials = hashmap_free(c->set_credentials);
9b412709 5372 c->import_credentials = set_free_free(c->import_credentials);
84be0c71
LP
5373
5374 c->root_image_policy = image_policy_free(c->root_image_policy);
5375 c->mount_image_policy = image_policy_free(c->mount_image_policy);
5376 c->extension_image_policy = image_policy_free(c->extension_image_policy);
e66cf1a3
LP
5377}
5378
34cf6c43 5379int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
5380 assert(c);
5381
5382 if (!runtime_prefix)
5383 return 0;
5384
211a3d87 5385 for (size_t i = 0; i < c->directories[EXEC_DIRECTORY_RUNTIME].n_items; i++) {
c2b2df60 5386 _cleanup_free_ char *p = NULL;
e66cf1a3 5387
494d0247 5388 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
211a3d87 5389 p = path_join(runtime_prefix, "private", c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
494d0247 5390 else
211a3d87 5391 p = path_join(runtime_prefix, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].path);
e66cf1a3
LP
5392 if (!p)
5393 return -ENOMEM;
5394
7bc4bf4a
LP
5395 /* We execute this synchronously, since we need to be sure this is gone when we start the
5396 * service next. */
c6878637 5397 (void) rm_rf(p, REMOVE_ROOT);
211a3d87 5398
211a3d87
LB
5399 STRV_FOREACH(symlink, c->directories[EXEC_DIRECTORY_RUNTIME].items[i].symlinks) {
5400 _cleanup_free_ char *symlink_abs = NULL;
5401
5402 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
5403 symlink_abs = path_join(runtime_prefix, "private", *symlink);
5404 else
5405 symlink_abs = path_join(runtime_prefix, *symlink);
5406 if (!symlink_abs)
5407 return -ENOMEM;
5408
5409 (void) unlink(symlink_abs);
5410 }
e66cf1a3
LP
5411 }
5412
5413 return 0;
5cb5a6ff
LP
5414}
5415
b9f976fb
MK
5416int exec_context_destroy_mount_ns_dir(Unit *u) {
5417 _cleanup_free_ char *p = NULL;
5418
5419 if (!u || !MANAGER_IS_SYSTEM(u->manager))
5420 return 0;
5421
5422 p = path_join("/run/systemd/propagate/", u->id);
5423 if (!p)
5424 return -ENOMEM;
5425
5426 /* This is only filled transiently (see mount_in_namespace()), should be empty or even non-existent*/
5427 if (rmdir(p) < 0 && errno != ENOENT)
5428 log_unit_debug_errno(u, errno, "Unable to remove propagation dir '%s', ignoring: %m", p);
5429
5430 return 0;
5431}
5432
34cf6c43 5433static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5434 assert(c);
5435
a1e58e8e 5436 c->path = mfree(c->path);
6796073e 5437 c->argv = strv_free(c->argv);
43d0fcbd
LP
5438}
5439
da6053d0 5440void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5441 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5442 exec_command_done(c+i);
5443}
5444
f1acf85a 5445ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5446 ExecCommand *i;
5447
52e3671b 5448 while ((i = LIST_POP(command, c))) {
43d0fcbd 5449 exec_command_done(i);
5cb5a6ff
LP
5450 free(i);
5451 }
f1acf85a
ZJS
5452
5453 return NULL;
5cb5a6ff
LP
5454}
5455
da6053d0 5456void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5457 for (size_t i = 0; i < n; i++)
f1acf85a 5458 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5459}
5460
6a1d4d9f 5461void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5462 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5463 exec_status_reset(&c[i].exec_status);
5464}
5465
5466void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
03677889 5467 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5468 LIST_FOREACH(command, z, c[i])
5469 exec_status_reset(&z->exec_status);
6a1d4d9f
LP
5470}
5471
039f0e70 5472typedef struct InvalidEnvInfo {
34cf6c43 5473 const Unit *unit;
039f0e70
LP
5474 const char *path;
5475} InvalidEnvInfo;
5476
5477static void invalid_env(const char *p, void *userdata) {
5478 InvalidEnvInfo *info = userdata;
5479
f2341e0a 5480 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5481}
5482
52c239d7
LB
5483const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5484 assert(c);
5485
5486 switch (fd_index) {
5073ff6b 5487
52c239d7
LB
5488 case STDIN_FILENO:
5489 if (c->std_input != EXEC_INPUT_NAMED_FD)
5490 return NULL;
5073ff6b 5491
52c239d7 5492 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5493
52c239d7
LB
5494 case STDOUT_FILENO:
5495 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5496 return NULL;
5073ff6b 5497
52c239d7 5498 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5499
52c239d7
LB
5500 case STDERR_FILENO:
5501 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5502 return NULL;
5073ff6b 5503
52c239d7 5504 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5505
52c239d7
LB
5506 default:
5507 return NULL;
5508 }
5509}
5510
2caa38e9
LP
5511static int exec_context_named_iofds(
5512 const ExecContext *c,
5513 const ExecParameters *p,
5514 int named_iofds[static 3]) {
5515
5b10116e 5516 size_t targets;
56fbd561 5517 const char* stdio_fdname[3];
da6053d0 5518 size_t n_fds;
52c239d7
LB
5519
5520 assert(c);
5521 assert(p);
2caa38e9 5522 assert(named_iofds);
52c239d7
LB
5523
5524 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5525 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5526 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5527
5b10116e 5528 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5529 stdio_fdname[i] = exec_context_fdname(c, i);
5530
4c47affc
FB
5531 n_fds = p->n_storage_fds + p->n_socket_fds;
5532
5b10116e 5533 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5534 if (named_iofds[STDIN_FILENO] < 0 &&
5535 c->std_input == EXEC_INPUT_NAMED_FD &&
5536 stdio_fdname[STDIN_FILENO] &&
5537 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5538
52c239d7
LB
5539 named_iofds[STDIN_FILENO] = p->fds[i];
5540 targets--;
56fbd561
ZJS
5541
5542 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5543 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5544 stdio_fdname[STDOUT_FILENO] &&
5545 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5546
52c239d7
LB
5547 named_iofds[STDOUT_FILENO] = p->fds[i];
5548 targets--;
56fbd561
ZJS
5549
5550 } else if (named_iofds[STDERR_FILENO] < 0 &&
5551 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5552 stdio_fdname[STDERR_FILENO] &&
5553 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5554
52c239d7
LB
5555 named_iofds[STDERR_FILENO] = p->fds[i];
5556 targets--;
5557 }
5558
56fbd561 5559 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5560}
5561
398a5009
ZJS
5562static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***ret) {
5563 _cleanup_strv_free_ char **v = NULL;
398a5009 5564 int r;
8c7be95e
LP
5565
5566 assert(c);
398a5009 5567 assert(ret);
8c7be95e
LP
5568
5569 STRV_FOREACH(i, c->environment_files) {
7fd1b19b 5570 _cleanup_globfree_ glob_t pglob = {};
398a5009
ZJS
5571 bool ignore = false;
5572 char *fn = *i;
8c7be95e
LP
5573
5574 if (fn[0] == '-') {
5575 ignore = true;
313cefa1 5576 fn++;
8c7be95e
LP
5577 }
5578
5579 if (!path_is_absolute(fn)) {
8c7be95e
LP
5580 if (ignore)
5581 continue;
8c7be95e
LP
5582 return -EINVAL;
5583 }
5584
2bef10ab 5585 /* Filename supports globbing, take all matching files */
398a5009
ZJS
5586 r = safe_glob(fn, 0, &pglob);
5587 if (r < 0) {
2bef10ab
PL
5588 if (ignore)
5589 continue;
398a5009 5590 return r;
2bef10ab 5591 }
8c7be95e 5592
d8c92e8b
ZJS
5593 /* When we don't match anything, -ENOENT should be returned */
5594 assert(pglob.gl_pathc > 0);
5595
fcc06682 5596 for (size_t n = 0; n < pglob.gl_pathc; n++) {
398a5009
ZJS
5597 _cleanup_strv_free_ char **p = NULL;
5598
5599 r = load_env_file(NULL, pglob.gl_pathv[n], &p);
5600 if (r < 0) {
2bef10ab
PL
5601 if (ignore)
5602 continue;
398a5009 5603 return r;
e9c1ea9d 5604 }
398a5009 5605
ebc05a09 5606 /* Log invalid environment variables with filename */
039f0e70
LP
5607 if (p) {
5608 InvalidEnvInfo info = {
f2341e0a 5609 .unit = unit,
039f0e70
LP
5610 .path = pglob.gl_pathv[n]
5611 };
5612
5613 p = strv_env_clean_with_callback(p, invalid_env, &info);
5614 }
8c7be95e 5615
398a5009
ZJS
5616 if (!v)
5617 v = TAKE_PTR(p);
2bef10ab 5618 else {
398a5009 5619 char **m = strv_env_merge(v, p);
c84a9488 5620 if (!m)
2bef10ab 5621 return -ENOMEM;
2bef10ab 5622
398a5009 5623 strv_free_and_replace(v, m);
2bef10ab 5624 }
8c7be95e
LP
5625 }
5626 }
5627
398a5009 5628 *ret = TAKE_PTR(v);
8c7be95e
LP
5629
5630 return 0;
5631}
5632
6ac8fdc9 5633static bool tty_may_match_dev_console(const char *tty) {
7b912648 5634 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5635
1e22b5cd
LP
5636 if (!tty)
5637 return true;
5638
a119ec7c 5639 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5640
5641 /* trivial identity? */
5642 if (streq(tty, "console"))
5643 return true;
5644
7b912648
LP
5645 if (resolve_dev_console(&resolved) < 0)
5646 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5647
5648 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5649 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5650}
5651
6c0ae739
LP
5652static bool exec_context_may_touch_tty(const ExecContext *ec) {
5653 assert(ec);
1e22b5cd 5654
6c0ae739 5655 return ec->tty_reset ||
1e22b5cd
LP
5656 ec->tty_vhangup ||
5657 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5658 is_terminal_input(ec->std_input) ||
5659 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5660 is_terminal_output(ec->std_error);
5661}
5662
5663bool exec_context_may_touch_console(const ExecContext *ec) {
5664
5665 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5666 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5667}
5668
15ae422b 5669static void strv_fprintf(FILE *f, char **l) {
15ae422b
LP
5670 assert(f);
5671
5672 STRV_FOREACH(g, l)
5673 fprintf(f, " %s", *g);
5674}
5675
/* Writes one "<prefix><name>: item item ..." line for the string vector to f. Nothing is written if
 * the vector is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputc('\n', f);
}
5687
97f53fec
LB
5688void exec_params_dump(const ExecParameters *p, FILE* f, const char *prefix) {
5689 assert(p);
5690 assert(f);
5691
5692 prefix = strempty(prefix);
5693
5694 fprintf(f,
5695 "%sRuntimeScope: %s\n"
5696 "%sExecFlags: %u\n"
5697 "%sSELinuxContextNetwork: %s\n"
5698 "%sCgroupSupportedMask: %u\n"
5699 "%sCgroupPath: %s\n"
5700 "%sCrededentialsDirectory: %s\n"
5701 "%sEncryptedCredentialsDirectory: %s\n"
5702 "%sConfirmSpawn: %s\n"
5703 "%sShallConfirmSpawn: %s\n"
5704 "%sWatchdogUSec: " USEC_FMT "\n"
5705 "%sNotifySocket: %s\n"
5706 "%sFallbackSmackProcessLabel: %s\n",
5707 prefix, runtime_scope_to_string(p->runtime_scope),
5708 prefix, p->flags,
5709 prefix, yes_no(p->selinux_context_net),
5710 prefix, p->cgroup_supported,
5711 prefix, p->cgroup_path,
5712 prefix, strempty(p->received_credentials_directory),
5713 prefix, strempty(p->received_encrypted_credentials_directory),
5714 prefix, strempty(p->confirm_spawn),
5715 prefix, yes_no(p->shall_confirm_spawn),
5716 prefix, p->watchdog_usec,
5717 prefix, strempty(p->notify_socket),
5718 prefix, strempty(p->fallback_smack_process_label));
5719
5720 strv_dump(f, prefix, "FdNames", p->fd_names);
5721 strv_dump(f, prefix, "Environment", p->environment);
5722 strv_dump(f, prefix, "Prefix", p->prefix);
5723
5724 LIST_FOREACH(open_files, file, p->open_files)
5725 fprintf(f, "%sOpenFile: %s %s", prefix, file->path, open_file_flags_to_string(file->flags));
5726
5727 strv_dump(f, prefix, "FilesEnv", p->files_env);
5728}
5729
34cf6c43 5730void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
add00535 5731 int r;
9eba9da4 5732
5cb5a6ff
LP
5733 assert(c);
5734 assert(f);
5735
4ad49000 5736 prefix = strempty(prefix);
5cb5a6ff
LP
5737
5738 fprintf(f,
94f04347
LP
5739 "%sUMask: %04o\n"
5740 "%sWorkingDirectory: %s\n"
451a074f 5741 "%sRootDirectory: %s\n"
9c0c6701 5742 "%sRootEphemeral: %s\n"
15ae422b 5743 "%sNonBlocking: %s\n"
64747e2d 5744 "%sPrivateTmp: %s\n"
7f112f50 5745 "%sPrivateDevices: %s\n"
59eeb84b 5746 "%sProtectKernelTunables: %s\n"
e66a2f65 5747 "%sProtectKernelModules: %s\n"
84703040 5748 "%sProtectKernelLogs: %s\n"
fc64760d 5749 "%sProtectClock: %s\n"
59eeb84b 5750 "%sProtectControlGroups: %s\n"
d251207d
LP
5751 "%sPrivateNetwork: %s\n"
5752 "%sPrivateUsers: %s\n"
1b8689f9
LP
5753 "%sProtectHome: %s\n"
5754 "%sProtectSystem: %s\n"
5d997827 5755 "%sMountAPIVFS: %s\n"
f3e43635 5756 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5757 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5758 "%sRestrictRealtime: %s\n"
f69567cb 5759 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5760 "%sKeyringMode: %s\n"
4e399953
LP
5761 "%sProtectHostname: %s\n"
5762 "%sProtectProc: %s\n"
5763 "%sProcSubset: %s\n",
5cb5a6ff 5764 prefix, c->umask,
14eb3285
LP
5765 prefix, empty_to_root(c->working_directory),
5766 prefix, empty_to_root(c->root_directory),
9c0c6701 5767 prefix, yes_no(c->root_ephemeral),
15ae422b 5768 prefix, yes_no(c->non_blocking),
64747e2d 5769 prefix, yes_no(c->private_tmp),
7f112f50 5770 prefix, yes_no(c->private_devices),
59eeb84b 5771 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5772 prefix, yes_no(c->protect_kernel_modules),
84703040 5773 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5774 prefix, yes_no(c->protect_clock),
59eeb84b 5775 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5776 prefix, yes_no(c->private_network),
5777 prefix, yes_no(c->private_users),
1b8689f9
LP
5778 prefix, protect_home_to_string(c->protect_home),
5779 prefix, protect_system_to_string(c->protect_system),
5e98086d 5780 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5781 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5782 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5783 prefix, yes_no(c->restrict_realtime),
f69567cb 5784 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5785 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5786 prefix, yes_no(c->protect_hostname),
5787 prefix, protect_proc_to_string(c->protect_proc),
5788 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5789
915e6d16
LP
5790 if (c->root_image)
5791 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5792
18d73705 5793 if (c->root_image_options) {
18d73705
LB
5794 fprintf(f, "%sRootImageOptions:", prefix);
5795 LIST_FOREACH(mount_options, o, c->root_image_options)
5796 if (!isempty(o->options))
9ece6444
LB
5797 fprintf(f, " %s:%s",
5798 partition_designator_to_string(o->partition_designator),
5799 o->options);
18d73705
LB
5800 fprintf(f, "\n");
5801 }
5802
0389f4fa
LB
5803 if (c->root_hash) {
5804 _cleanup_free_ char *encoded = NULL;
5805 encoded = hexmem(c->root_hash, c->root_hash_size);
5806 if (encoded)
5807 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5808 }
5809
5810 if (c->root_hash_path)
5811 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5812
d4d55b0d
LB
5813 if (c->root_hash_sig) {
5814 _cleanup_free_ char *encoded = NULL;
5815 ssize_t len;
5816 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5817 if (len)
5818 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5819 }
5820
5821 if (c->root_hash_sig_path)
5822 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5823
0389f4fa
LB
5824 if (c->root_verity)
5825 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5826
8c7be95e
LP
5827 STRV_FOREACH(e, c->environment)
5828 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5829
5830 STRV_FOREACH(e, c->environment_files)
5831 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5832
b4c14404
FB
5833 STRV_FOREACH(e, c->pass_environment)
5834 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5835
00819cc1
LP
5836 STRV_FOREACH(e, c->unset_environment)
5837 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5838
53f47dfc
YW
5839 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5840
5b10116e 5841 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5842 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5843
211a3d87
LB
5844 for (size_t i = 0; i < c->directories[dt].n_items; i++) {
5845 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].items[i].path);
5846
5847 STRV_FOREACH(d, c->directories[dt].items[i].symlinks)
5848 fprintf(f, "%s%s: %s:%s\n", prefix, exec_directory_type_symlink_to_string(dt), c->directories[dt].items[i].path, *d);
5849 }
3536f49e 5850 }
c2bbd90b 5851
5291f26d 5852 fprintf(f, "%sTimeoutCleanSec: %s\n", prefix, FORMAT_TIMESPAN(c->timeout_clean_usec, USEC_PER_SEC));
12213aed 5853
fb33a393 5854 if (c->nice_set)
5291f26d 5855 fprintf(f, "%sNice: %i\n", prefix, c->nice);
fb33a393 5856
dd6c17b1 5857 if (c->oom_score_adjust_set)
5291f26d 5858 fprintf(f, "%sOOMScoreAdjust: %i\n", prefix, c->oom_score_adjust);
9eba9da4 5859
ad21e542 5860 if (c->coredump_filter_set)
5291f26d 5861 fprintf(f, "%sCoredumpFilter: 0x%"PRIx64"\n", prefix, c->coredump_filter);
ad21e542 5862
5b10116e 5863 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5864 if (c->rlimit[i]) {
4c3a2b84 5865 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5866 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5867 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5868 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5869 }
94f04347 5870
f8b69d1d 5871 if (c->ioprio_set) {
1756a011 5872 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5873
5bead76e 5874 r = ioprio_class_to_string_alloc(ioprio_prio_class(c->ioprio), &class_str);
837df140
YW
5875 if (r >= 0)
5876 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5877
5bead76e 5878 fprintf(f, "%sIOPriority: %d\n", prefix, ioprio_prio_data(c->ioprio));
f8b69d1d 5879 }
94f04347 5880
f8b69d1d 5881 if (c->cpu_sched_set) {
1756a011 5882 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5883
837df140
YW
5884 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5885 if (r >= 0)
5886 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5887
94f04347 5888 fprintf(f,
38b48754
LP
5889 "%sCPUSchedulingPriority: %i\n"
5890 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5891 prefix, c->cpu_sched_priority,
5892 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5893 }
94f04347 5894
0985c7c4 5895 if (c->cpu_set.set) {
e7fca352
MS
5896 _cleanup_free_ char *affinity = NULL;
5897
5898 affinity = cpu_set_to_range_string(&c->cpu_set);
5899 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5900 }
5901
b070c7c0
MS
5902 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5903 _cleanup_free_ char *nodes = NULL;
5904
5905 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5906 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5907 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5908 }
5909
3a43da28 5910 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5911 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5912
5913 fprintf(f,
80876c20
LP
5914 "%sStandardInput: %s\n"
5915 "%sStandardOutput: %s\n"
5916 "%sStandardError: %s\n",
5917 prefix, exec_input_to_string(c->std_input),
5918 prefix, exec_output_to_string(c->std_output),
5919 prefix, exec_output_to_string(c->std_error));
5920
befc4a80
LP
5921 if (c->std_input == EXEC_INPUT_NAMED_FD)
5922 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5923 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5924 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5925 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5926 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5927
5928 if (c->std_input == EXEC_INPUT_FILE)
5929 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5930 if (c->std_output == EXEC_OUTPUT_FILE)
5931 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5932 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5933 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5934 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5935 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5936 if (c->std_error == EXEC_OUTPUT_FILE)
5937 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5938 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5939 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5940 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5941 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5942
80876c20
LP
5943 if (c->tty_path)
5944 fprintf(f,
6ea832a2
LP
5945 "%sTTYPath: %s\n"
5946 "%sTTYReset: %s\n"
5947 "%sTTYVHangup: %s\n"
51462135
DDM
5948 "%sTTYVTDisallocate: %s\n"
5949 "%sTTYRows: %u\n"
5950 "%sTTYColumns: %u\n",
6ea832a2
LP
5951 prefix, c->tty_path,
5952 prefix, yes_no(c->tty_reset),
5953 prefix, yes_no(c->tty_vhangup),
51462135
DDM
5954 prefix, yes_no(c->tty_vt_disallocate),
5955 prefix, c->tty_rows,
5956 prefix, c->tty_cols);
94f04347 5957
9f6444eb 5958 if (IN_SET(c->std_output,
9f6444eb
LP
5959 EXEC_OUTPUT_KMSG,
5960 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5961 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5962 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5963 IN_SET(c->std_error,
9f6444eb
LP
5964 EXEC_OUTPUT_KMSG,
5965 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5966 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5967 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5968
5ce70e5b 5969 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5970
837df140
YW
5971 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5972 if (r >= 0)
5973 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5974
837df140
YW
5975 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5976 if (r >= 0)
5977 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5978 }
94f04347 5979
d3070fbd
LP
5980 if (c->log_level_max >= 0) {
5981 _cleanup_free_ char *t = NULL;
5982
5983 (void) log_level_to_string_alloc(c->log_level_max, &t);
5984
5985 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5986 }
5987
5291f26d 5988 if (c->log_ratelimit_interval_usec > 0)
90fc172e
AZ
5989 fprintf(f,
5990 "%sLogRateLimitIntervalSec: %s\n",
5291f26d 5991 prefix, FORMAT_TIMESPAN(c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e 5992
5ac1530e
ZJS
5993 if (c->log_ratelimit_burst > 0)
5994 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5995
523ea123
QD
5996 if (!set_isempty(c->log_filter_allowed_patterns) || !set_isempty(c->log_filter_denied_patterns)) {
5997 fprintf(f, "%sLogFilterPatterns:", prefix);
5998
5999 char *pattern;
6000 SET_FOREACH(pattern, c->log_filter_allowed_patterns)
6001 fprintf(f, " %s", pattern);
6002 SET_FOREACH(pattern, c->log_filter_denied_patterns)
6003 fprintf(f, " ~%s", pattern);
6004 fputc('\n', f);
6005 }
6006
5b10116e
ZJS
6007 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
6008 fprintf(f, "%sLogExtraFields: ", prefix);
6009 fwrite(c->log_extra_fields[j].iov_base,
6010 1, c->log_extra_fields[j].iov_len,
6011 f);
6012 fputc('\n', f);
d3070fbd
LP
6013 }
6014
91dd5f7c
LP
6015 if (c->log_namespace)
6016 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
6017
07d46372
YW
6018 if (c->secure_bits) {
6019 _cleanup_free_ char *str = NULL;
6020
6021 r = secure_bits_to_string_alloc(c->secure_bits, &str);
6022 if (r >= 0)
6023 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
6024 }
94f04347 6025
3fd5190b 6026 if (c->capability_bounding_set != CAP_MASK_UNSET) {
dd1f5bd0 6027 _cleanup_free_ char *str = NULL;
94f04347 6028
8142d735 6029 r = capability_set_to_string(c->capability_bounding_set, &str);
dd1f5bd0
YW
6030 if (r >= 0)
6031 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
6032 }
6033
6034 if (c->capability_ambient_set != 0) {
dd1f5bd0 6035 _cleanup_free_ char *str = NULL;
755d4b67 6036
8142d735 6037 r = capability_set_to_string(c->capability_ambient_set, &str);
dd1f5bd0
YW
6038 if (r >= 0)
6039 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
6040 }
6041
6042 if (c->user)
f2d3769a 6043 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 6044 if (c->group)
f2d3769a 6045 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 6046
29206d46
LP
6047 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
6048
ddc155b2 6049 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 6050
5b6319dc 6051 if (c->pam_name)
f2d3769a 6052 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 6053
ddc155b2
TM
6054 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
6055 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
6056 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
6057 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
6058 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
8c35c10d 6059 strv_dump(f, prefix, "ExecSearchPath", c->exec_search_path);
2e22afe9 6060
5b10116e
ZJS
6061 for (size_t i = 0; i < c->n_bind_mounts; i++)
6062 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
6063 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
6064 c->bind_mounts[i].ignore_enoent ? "-": "",
6065 c->bind_mounts[i].source,
6066 c->bind_mounts[i].destination,
6067 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 6068
5b10116e
ZJS
6069 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
6070 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 6071
5b10116e
ZJS
6072 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
6073 t->path,
6074 isempty(t->options) ? "" : ":",
6075 strempty(t->options));
6076 }
2abd4e38 6077
169c1bda
LP
6078 if (c->utmp_id)
6079 fprintf(f,
6080 "%sUtmpIdentifier: %s\n",
6081 prefix, c->utmp_id);
7b52a628
MS
6082
6083 if (c->selinux_context)
6084 fprintf(f,
5f8640fb
LP
6085 "%sSELinuxContext: %s%s\n",
6086 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 6087
80c21aea
WC
6088 if (c->apparmor_profile)
6089 fprintf(f,
6090 "%sAppArmorProfile: %s%s\n",
6091 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
6092
6093 if (c->smack_process_label)
6094 fprintf(f,
6095 "%sSmackProcessLabel: %s%s\n",
6096 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
6097
050f7277 6098 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
6099 fprintf(f,
6100 "%sPersonality: %s\n",
6101 prefix, strna(personality_to_string(c->personality)));
6102
78e864e5
TM
6103 fprintf(f,
6104 "%sLockPersonality: %s\n",
6105 prefix, yes_no(c->lock_personality));
6106
17df7223 6107 if (c->syscall_filter) {
17df7223 6108 fprintf(f,
57183d11 6109 "%sSystemCallFilter: ",
17df7223
LP
6110 prefix);
6111
6b000af4 6112 if (!c->syscall_allow_list)
17df7223
LP
6113 fputc('~', f);
6114
349cc4a5 6115#if HAVE_SECCOMP
d5a99b7c
JJ
6116 void *id, *val;
6117 bool first = true;
90e74a66 6118 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 6119 _cleanup_free_ char *name = NULL;
8cfa775f
YW
6120 const char *errno_name = NULL;
6121 int num = PTR_TO_INT(val);
17df7223
LP
6122
6123 if (first)
6124 first = false;
6125 else
6126 fputc(' ', f);
6127
57183d11 6128 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 6129 fputs(strna(name), f);
8cfa775f
YW
6130
6131 if (num >= 0) {
005bfaf1 6132 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
6133 if (errno_name)
6134 fprintf(f, ":%s", errno_name);
6135 else
6136 fprintf(f, ":%d", num);
6137 }
17df7223 6138 }
351a19b1 6139#endif
17df7223
LP
6140
6141 fputc('\n', f);
6142 }
6143
57183d11 6144 if (c->syscall_archs) {
57183d11
LP
6145 fprintf(f,
6146 "%sSystemCallArchitectures:",
6147 prefix);
6148
349cc4a5 6149#if HAVE_SECCOMP
d5a99b7c 6150 void *id;
90e74a66 6151 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
6152 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
6153#endif
6154 fputc('\n', f);
6155 }
6156
add00535
LP
6157 if (exec_context_restrict_namespaces_set(c)) {
6158 _cleanup_free_ char *s = NULL;
6159
86c2a9f1 6160 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
6161 if (r >= 0)
6162 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 6163 prefix, strna(s));
add00535
LP
6164 }
6165
b1994387 6166#if HAVE_LIBBPF
8fe84dc8
YW
6167 if (exec_context_restrict_filesystems_set(c)) {
6168 char *fs;
6169 SET_FOREACH(fs, c->restrict_filesystems)
6170 fprintf(f, "%sRestrictFileSystems: %s\n", prefix, fs);
6171 }
b1994387
ILG
6172#endif
6173
a8d08f39
LP
6174 if (c->network_namespace_path)
6175 fprintf(f,
6176 "%sNetworkNamespacePath: %s\n",
6177 prefix, c->network_namespace_path);
6178
3df90f24 6179 if (c->syscall_errno > 0) {
3df90f24
YW
6180 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
6181
005bfaf1 6182#if HAVE_SECCOMP
d5a99b7c 6183 const char *errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 6184 if (errno_name)
005bfaf1 6185 fputs(errno_name, f);
3df90f24 6186 else
005bfaf1
TM
6187 fprintf(f, "%d", c->syscall_errno);
6188#endif
6189 fputc('\n', f);
3df90f24 6190 }
b3d13314 6191
5b10116e 6192 for (size_t i = 0; i < c->n_mount_images; i++) {
79e20ceb 6193 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
6194 c->mount_images[i].ignore_enoent ? "-": "",
6195 c->mount_images[i].source,
79e20ceb 6196 c->mount_images[i].destination);
427353f6 6197 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 6198 fprintf(f, ":%s:%s",
427353f6 6199 partition_designator_to_string(o->partition_designator),
79e20ceb 6200 strempty(o->options));
427353f6
LB
6201 fprintf(f, "\n");
6202 }
93f59701
LB
6203
6204 for (size_t i = 0; i < c->n_extension_images; i++) {
93f59701
LB
6205 fprintf(f, "%sExtensionImages: %s%s", prefix,
6206 c->extension_images[i].ignore_enoent ? "-": "",
6207 c->extension_images[i].source);
6208 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
6209 fprintf(f, ":%s:%s",
6210 partition_designator_to_string(o->partition_designator),
6211 strempty(o->options));
6212 fprintf(f, "\n");
6213 }
a07b9926
LB
6214
6215 strv_dump(f, prefix, "ExtensionDirectories", c->extension_directories);
5cb5a6ff
LP
6216}
6217
34cf6c43 6218bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
6219 assert(c);
6220
61233823 6221 /* Returns true if the process forked off would run under
a931ad47
LP
6222 * an unchanged UID or as root. */
6223
6224 if (!c->user)
6225 return true;
6226
6227 if (streq(c->user, "root") || streq(c->user, "0"))
6228 return true;
6229
6230 return false;
6231}
6232
34cf6c43 6233int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
6234 int p;
6235
6236 assert(c);
6237
6238 if (c->ioprio_set)
6239 return c->ioprio;
6240
6241 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
6242 if (p < 0)
0692548c 6243 return IOPRIO_DEFAULT_CLASS_AND_PRIO;
7f452159 6244
8b330d7d 6245 return ioprio_normalize(p);
7f452159
LP
6246}
6247
5e98086d
ZJS
6248bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
6249 assert(c);
6250
61198784 6251 /* Explicit setting wins */
5e98086d
ZJS
6252 if (c->mount_apivfs_set)
6253 return c->mount_apivfs;
6254
61198784 6255 /* Default to "yes" if root directory or image are specified */
74e12520 6256 if (exec_context_with_rootfs(c))
61198784
ZJS
6257 return true;
6258
5e98086d
ZJS
6259 return false;
6260}
6261
d3070fbd 6262void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
6263 assert(c);
6264
5b10116e 6265 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
6266 free(c->log_extra_fields[l].iov_base);
6267 c->log_extra_fields = mfree(c->log_extra_fields);
6268 c->n_log_extra_fields = 0;
6269}
6270
6f765baf 6271void exec_context_revert_tty(ExecContext *c) {
254d1313 6272 _cleanup_close_ int fd = -EBADF;
0ba976e8
LP
6273 const char *path;
6274 struct stat st;
6f765baf
LP
6275 int r;
6276
6277 assert(c);
6278
6279 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
6280 exec_context_tty_reset(c, NULL);
6281
6282 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
6283 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
6284 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
6285 if (!exec_context_may_touch_tty(c))
6286 return;
6f765baf 6287
0ba976e8
LP
6288 path = exec_context_tty_path(c);
6289 if (!path)
6290 return;
6f765baf 6291
0ba976e8
LP
6292 fd = open(path, O_PATH|O_CLOEXEC);
6293 if (fd < 0)
6294 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
6295 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
6296 path);
6297
6298 if (fstat(fd, &st) < 0)
6299 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
6300
6301 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
6302 * if things are a character device, since a proper check either means we'd have to open the TTY and
6303 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
6304 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
6305 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
6306 if (!S_ISCHR(st.st_mode))
6307 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
6308
6309 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
6310 if (r < 0)
6311 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
6312}
6313
4c2f5842
LP
6314int exec_context_get_clean_directories(
6315 ExecContext *c,
6316 char **prefix,
6317 ExecCleanMask mask,
6318 char ***ret) {
6319
6320 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
6321 int r;
6322
6323 assert(c);
6324 assert(prefix);
6325 assert(ret);
6326
5b10116e 6327 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
6328 if (!FLAGS_SET(mask, 1U << t))
6329 continue;
6330
6331 if (!prefix[t])
6332 continue;
6333
211a3d87 6334 for (size_t i = 0; i < c->directories[t].n_items; i++) {
4c2f5842
LP
6335 char *j;
6336
211a3d87 6337 j = path_join(prefix[t], c->directories[t].items[i].path);
4c2f5842
LP
6338 if (!j)
6339 return -ENOMEM;
6340
6341 r = strv_consume(&l, j);
6342 if (r < 0)
6343 return r;
7f622a19
YW
6344
6345 /* Also remove private directories unconditionally. */
6346 if (t != EXEC_DIRECTORY_CONFIGURATION) {
211a3d87
LB
6347 j = path_join(prefix[t], "private", c->directories[t].items[i].path);
6348 if (!j)
6349 return -ENOMEM;
6350
6351 r = strv_consume(&l, j);
6352 if (r < 0)
6353 return r;
6354 }
6355
211a3d87
LB
6356 STRV_FOREACH(symlink, c->directories[t].items[i].symlinks) {
6357 j = path_join(prefix[t], *symlink);
7f622a19
YW
6358 if (!j)
6359 return -ENOMEM;
6360
6361 r = strv_consume(&l, j);
6362 if (r < 0)
6363 return r;
6364 }
4c2f5842
LP
6365 }
6366 }
6367
6368 *ret = TAKE_PTR(l);
6369 return 0;
6370}
6371
6372int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
6373 ExecCleanMask mask = 0;
6374
6375 assert(c);
6376 assert(ret);
6377
6378 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
211a3d87 6379 if (c->directories[t].n_items > 0)
4c2f5842
LP
6380 mask |= 1U << t;
6381
6382 *ret = mask;
6383 return 0;
6384}
6385
b58b4116 6386void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 6387 assert(s);
5cb5a6ff 6388
2ed26ed0
LP
6389 *s = (ExecStatus) {
6390 .pid = pid,
6391 };
6392
b58b4116
LP
6393 dual_timestamp_get(&s->start_timestamp);
6394}
6395
34cf6c43 6396void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
6397 assert(s);
6398
d46b79bb 6399 if (s->pid != pid)
2ed26ed0
LP
6400 *s = (ExecStatus) {
6401 .pid = pid,
6402 };
b58b4116 6403
63983207 6404 dual_timestamp_get(&s->exit_timestamp);
9fb86720 6405
034c6ed7
LP
6406 s->code = code;
6407 s->status = status;
169c1bda 6408
6f765baf
LP
6409 if (context && context->utmp_id)
6410 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
6411}
6412
6a1d4d9f
LP
6413void exec_status_reset(ExecStatus *s) {
6414 assert(s);
6415
6416 *s = (ExecStatus) {};
6417}
6418
34cf6c43 6419void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
6420 assert(s);
6421 assert(f);
6422
9fb86720
LP
6423 if (s->pid <= 0)
6424 return;
6425
4c940960
LP
6426 prefix = strempty(prefix);
6427
9fb86720 6428 fprintf(f,
ccd06097
ZJS
6429 "%sPID: "PID_FMT"\n",
6430 prefix, s->pid);
9fb86720 6431
af9d16e1 6432 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
6433 fprintf(f,
6434 "%sStart Timestamp: %s\n",
04f5c018 6435 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 6436
af9d16e1 6437 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
6438 fprintf(f,
6439 "%sExit Timestamp: %s\n"
6440 "%sExit Code: %s\n"
6441 "%sExit Status: %i\n",
04f5c018 6442 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
6443 prefix, sigchld_code_to_string(s->code),
6444 prefix, s->status);
5cb5a6ff 6445}
44d8db9e 6446
34cf6c43 6447static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6448 _cleanup_free_ char *cmd = NULL;
4c940960 6449 const char *prefix2;
44d8db9e
LP
6450
6451 assert(c);
6452 assert(f);
6453
4c940960 6454 prefix = strempty(prefix);
63c372cb 6455 prefix2 = strjoina(prefix, "\t");
44d8db9e 6456
4ef15008 6457 cmd = quote_command_line(c->argv, SHELL_ESCAPE_EMPTY);
38553034 6458
44d8db9e
LP
6459 fprintf(f,
6460 "%sCommand Line: %s\n",
38553034 6461 prefix, strnull(cmd));
44d8db9e 6462
9fb86720 6463 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6464}
6465
6466void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6467 assert(f);
6468
4c940960 6469 prefix = strempty(prefix);
44d8db9e 6470
03677889
YW
6471 LIST_FOREACH(command, i, c)
6472 exec_command_dump(i, f, prefix);
44d8db9e 6473}
94f04347 6474
a6a80b4f
LP
6475void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6476 ExecCommand *end;
6477
6478 assert(l);
6479 assert(e);
6480
6481 if (*l) {
35b8ca3a 6482 /* It's kind of important, that we keep the order here */
cc232fa0 6483 end = LIST_FIND_TAIL(command, *l);
71fda00f 6484 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f 6485 } else
3ff67ec4 6486 *l = e;
a6a80b4f
LP
6487}
6488
26fd040d
LP
6489int exec_command_set(ExecCommand *c, const char *path, ...) {
6490 va_list ap;
6491 char **l, *p;
6492
6493 assert(c);
6494 assert(path);
6495
6496 va_start(ap, path);
6497 l = strv_new_ap(path, ap);
6498 va_end(ap);
6499
6500 if (!l)
6501 return -ENOMEM;
6502
250a918d
LP
6503 p = strdup(path);
6504 if (!p) {
26fd040d
LP
6505 strv_free(l);
6506 return -ENOMEM;
6507 }
6508
6897dfe8 6509 free_and_replace(c->path, p);
26fd040d 6510
130d3d22 6511 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6512}
6513
86b23b07 6514int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6515 _cleanup_strv_free_ char **l = NULL;
86b23b07 6516 va_list ap;
86b23b07
JS
6517 int r;
6518
6519 assert(c);
6520 assert(path);
6521
6522 va_start(ap, path);
6523 l = strv_new_ap(path, ap);
6524 va_end(ap);
6525
6526 if (!l)
6527 return -ENOMEM;
6528
e287086b 6529 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6530 if (r < 0)
86b23b07 6531 return r;
86b23b07
JS
6532
6533 return 0;
6534}
6535
437f3e35
LP
6536static char *destroy_tree(char *path) {
6537 if (!path)
6538 return NULL;
9c0c6701 6539
437f3e35
LP
6540 if (!path_equal(path, RUN_SYSTEMD_EMPTY)) {
6541 log_debug("Spawning process to nuke '%s'", path);
9c0c6701 6542
437f3e35
LP
6543 (void) asynchronous_rm_rf(path, REMOVE_ROOT|REMOVE_SUBVOLUME|REMOVE_PHYSICAL);
6544 }
9c0c6701 6545
437f3e35 6546 return mfree(path);
9c0c6701
DDM
6547}
6548
e52a696a 6549static ExecSharedRuntime* exec_shared_runtime_free(ExecSharedRuntime *rt) {
e8a565cb
YW
6550 if (!rt)
6551 return NULL;
6552
6553 if (rt->manager)
e76506b7 6554 (void) hashmap_remove(rt->manager->exec_shared_runtime_by_id, rt->id);
e8a565cb 6555
e52a696a
DDM
6556 rt->id = mfree(rt->id);
6557 rt->tmp_dir = mfree(rt->tmp_dir);
6558 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6559 safe_close_pair(rt->netns_storage_socket);
6560 safe_close_pair(rt->ipcns_storage_socket);
6561 return mfree(rt);
6562}
6563
6564DEFINE_TRIVIAL_UNREF_FUNC(ExecSharedRuntime, exec_shared_runtime, exec_shared_runtime_free);
6565DEFINE_TRIVIAL_CLEANUP_FUNC(ExecSharedRuntime*, exec_shared_runtime_free);
6566
6567ExecSharedRuntime* exec_shared_runtime_destroy(ExecSharedRuntime *rt) {
e52a696a
DDM
6568 if (!rt)
6569 return NULL;
6570
6571 assert(rt->n_ref > 0);
6572 rt->n_ref--;
6573
6574 if (rt->n_ref > 0)
6575 return NULL;
56a13a49 6576
437f3e35
LP
6577 rt->tmp_dir = destroy_tree(rt->tmp_dir);
6578 rt->var_tmp_dir = destroy_tree(rt->var_tmp_dir);
e8a565cb 6579
e52a696a 6580 return exec_shared_runtime_free(rt);
e8a565cb
YW
6581}
6582
e76506b7 6583static int exec_shared_runtime_allocate(ExecSharedRuntime **ret, const char *id) {
56a13a49 6584 _cleanup_free_ char *id_copy = NULL;
e76506b7 6585 ExecSharedRuntime *n;
613b411c 6586
8e8009dc 6587 assert(ret);
613b411c 6588
56a13a49
ZJS
6589 id_copy = strdup(id);
6590 if (!id_copy)
6591 return -ENOMEM;
6592
e76506b7 6593 n = new(ExecSharedRuntime, 1);
8e8009dc 6594 if (!n)
613b411c
LP
6595 return -ENOMEM;
6596
e76506b7 6597 *n = (ExecSharedRuntime) {
56a13a49 6598 .id = TAKE_PTR(id_copy),
19ee48a6
YW
6599 .netns_storage_socket = PIPE_EBADF,
6600 .ipcns_storage_socket = PIPE_EBADF,
8e8009dc
LP
6601 };
6602
6603 *ret = n;
613b411c
LP
6604 return 0;
6605}
6606
e76506b7 6607static int exec_shared_runtime_add(
e8a565cb
YW
6608 Manager *m,
6609 const char *id,
56a13a49
ZJS
6610 char **tmp_dir,
6611 char **var_tmp_dir,
6612 int netns_storage_socket[2],
a70581ff 6613 int ipcns_storage_socket[2],
e76506b7 6614 ExecSharedRuntime **ret) {
e8a565cb 6615
e76506b7 6616 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt = NULL;
613b411c
LP
6617 int r;
6618
e8a565cb 6619 assert(m);
613b411c
LP
6620 assert(id);
6621
a70581ff 6622 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6623
e76506b7 6624 r = exec_shared_runtime_allocate(&rt, id);
613b411c
LP
6625 if (r < 0)
6626 return r;
6627
e76506b7 6628 r = hashmap_ensure_put(&m->exec_shared_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6629 if (r < 0)
6630 return r;
e8a565cb 6631
56a13a49
ZJS
6632 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6633 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6634 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6635
6636 if (netns_storage_socket) {
56a13a49
ZJS
6637 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6638 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6639 }
6640
a70581ff
XR
6641 if (ipcns_storage_socket) {
6642 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6643 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6644 }
6645
e8a565cb
YW
6646 rt->manager = m;
6647
6648 if (ret)
6649 *ret = rt;
e76506b7 6650 /* do not remove created ExecSharedRuntime object when the operation succeeds. */
56a13a49 6651 TAKE_PTR(rt);
e8a565cb
YW
6652 return 0;
6653}
6654
e76506b7 6655static int exec_shared_runtime_make(
74aaf59b
LP
6656 Manager *m,
6657 const ExecContext *c,
6658 const char *id,
e76506b7 6659 ExecSharedRuntime **ret) {
74aaf59b 6660
56a13a49 6661 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
19ee48a6 6662 _cleanup_close_pair_ int netns_storage_socket[2] = PIPE_EBADF, ipcns_storage_socket[2] = PIPE_EBADF;
e8a565cb
YW
6663 int r;
6664
6665 assert(m);
6666 assert(c);
6667 assert(id);
6668
e76506b7 6669 /* It is not necessary to create ExecSharedRuntime object. */
fde36d25 6670 if (!exec_needs_network_namespace(c) && !exec_needs_ipc_namespace(c) && !c->private_tmp) {
74aaf59b 6671 *ret = NULL;
e8a565cb 6672 return 0;
74aaf59b 6673 }
e8a565cb 6674
efa2f3a1
TM
6675 if (c->private_tmp &&
6676 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6677 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6678 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6679 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6680 if (r < 0)
6681 return r;
6682 }
6683
fbbb9697 6684 if (exec_needs_network_namespace(c)) {
e8a565cb
YW
6685 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6686 return -errno;
6687 }
6688
fde36d25 6689 if (exec_needs_ipc_namespace(c)) {
a70581ff
XR
6690 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6691 return -errno;
6692 }
6693
e76506b7 6694 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6695 if (r < 0)
6696 return r;
6697
613b411c
LP
6698 return 1;
6699}
6700
e76506b7
DDM
6701int exec_shared_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecSharedRuntime **ret) {
6702 ExecSharedRuntime *rt;
e8a565cb 6703 int r;
613b411c 6704
e8a565cb
YW
6705 assert(m);
6706 assert(id);
6707 assert(ret);
6708
e76506b7 6709 rt = hashmap_get(m->exec_shared_runtime_by_id, id);
e8a565cb 6710 if (rt)
e76506b7 6711 /* We already have an ExecSharedRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6712 goto ref;
6713
74aaf59b
LP
6714 if (!create) {
6715 *ret = NULL;
e8a565cb 6716 return 0;
74aaf59b 6717 }
e8a565cb
YW
6718
6719 /* If not found, then create a new object. */
e76506b7 6720 r = exec_shared_runtime_make(m, c, id, &rt);
74aaf59b 6721 if (r < 0)
e8a565cb 6722 return r;
74aaf59b 6723 if (r == 0) {
e76506b7 6724 /* When r == 0, it is not necessary to create ExecSharedRuntime object. */
74aaf59b
LP
6725 *ret = NULL;
6726 return 0;
6727 }
613b411c 6728
e8a565cb
YW
6729ref:
6730 /* increment reference counter. */
6731 rt->n_ref++;
6732 *ret = rt;
6733 return 1;
6734}
613b411c 6735
e76506b7
DDM
6736int exec_shared_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6737 ExecSharedRuntime *rt;
e8a565cb
YW
6738
6739 assert(m);
613b411c
LP
6740 assert(f);
6741 assert(fds);
6742
e76506b7 6743 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
e8a565cb 6744 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6745
e8a565cb
YW
6746 if (rt->tmp_dir)
6747 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6748
e8a565cb
YW
6749 if (rt->var_tmp_dir)
6750 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6751
e8a565cb
YW
6752 if (rt->netns_storage_socket[0] >= 0) {
6753 int copy;
613b411c 6754
e8a565cb
YW
6755 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6756 if (copy < 0)
6757 return copy;
613b411c 6758
e8a565cb
YW
6759 fprintf(f, " netns-socket-0=%i", copy);
6760 }
613b411c 6761
e8a565cb
YW
6762 if (rt->netns_storage_socket[1] >= 0) {
6763 int copy;
613b411c 6764
e8a565cb
YW
6765 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6766 if (copy < 0)
6767 return copy;
613b411c 6768
e8a565cb
YW
6769 fprintf(f, " netns-socket-1=%i", copy);
6770 }
6771
a70581ff
XR
6772 if (rt->ipcns_storage_socket[0] >= 0) {
6773 int copy;
6774
6775 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6776 if (copy < 0)
6777 return copy;
6778
6779 fprintf(f, " ipcns-socket-0=%i", copy);
6780 }
6781
6782 if (rt->ipcns_storage_socket[1] >= 0) {
6783 int copy;
6784
6785 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6786 if (copy < 0)
6787 return copy;
6788
6789 fprintf(f, " ipcns-socket-1=%i", copy);
6790 }
6791
e8a565cb 6792 fputc('\n', f);
613b411c
LP
6793 }
6794
6795 return 0;
6796}
6797
e76506b7
DDM
6798int exec_shared_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6799 _cleanup_(exec_shared_runtime_freep) ExecSharedRuntime *rt_create = NULL;
154eb43f 6800 ExecSharedRuntime *rt = NULL;
613b411c
LP
6801 int r;
6802
e8a565cb
YW
6803 /* This is for the migration from old (v237 or earlier) deserialization text.
6804 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
e76506b7 6805 * Even if the ExecSharedRuntime object originally created by the other unit, we cannot judge
e8a565cb
YW
6806 * so or not from the serialized text, then we always creates a new object owned by this. */
6807
6808 assert(u);
613b411c
LP
6809 assert(key);
6810 assert(value);
6811
e76506b7 6812 /* Manager manages ExecSharedRuntime objects by the unit id.
e8a565cb
YW
6813 * So, we omit the serialized text when the unit does not have id (yet?)... */
6814 if (isempty(u->id)) {
6815 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6816 return 0;
6817 }
613b411c 6818
154eb43f
LB
6819 if (u->manager) {
6820 if (hashmap_ensure_allocated(&u->manager->exec_shared_runtime_by_id, &string_hash_ops) < 0)
6821 return log_oom();
e8a565cb 6822
154eb43f
LB
6823 rt = hashmap_get(u->manager->exec_shared_runtime_by_id, u->id);
6824 }
e8a565cb 6825 if (!rt) {
e76506b7 6826 if (exec_shared_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6827 return log_oom();
613b411c 6828
e8a565cb
YW
6829 rt = rt_create;
6830 }
6831
6832 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6833 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6834 return -ENOMEM;
613b411c
LP
6835
6836 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6837 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6838 return -ENOMEM;
613b411c
LP
6839
6840 } else if (streq(key, "netns-socket-0")) {
6841 int fd;
6842
e652663a 6843 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6844 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6845 return 0;
613b411c 6846 }
e8a565cb
YW
6847
6848 safe_close(rt->netns_storage_socket[0]);
6849 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6850
613b411c
LP
6851 } else if (streq(key, "netns-socket-1")) {
6852 int fd;
6853
e652663a 6854 if ((fd = parse_fd(value)) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6855 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6856 return 0;
613b411c 6857 }
e8a565cb
YW
6858
6859 safe_close(rt->netns_storage_socket[1]);
6860 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6861
613b411c
LP
6862 } else
6863 return 0;
6864
e76506b7 6865 /* If the object is newly created, then put it to the hashmap which manages ExecSharedRuntime objects. */
154eb43f 6866 if (rt_create && u->manager) {
e76506b7 6867 r = hashmap_put(u->manager->exec_shared_runtime_by_id, rt_create->id, rt_create);
e8a565cb 6868 if (r < 0) {
3fe91079 6869 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6870 return 0;
6871 }
613b411c 6872
e8a565cb 6873 rt_create->manager = u->manager;
613b411c 6874
e8a565cb 6875 /* Avoid cleanup */
56a13a49 6876 TAKE_PTR(rt_create);
e8a565cb 6877 }
98b47d54 6878
e8a565cb
YW
6879 return 1;
6880}
613b411c 6881
e76506b7 6882int exec_shared_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
56a13a49
ZJS
6883 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6884 char *id = NULL;
a70581ff 6885 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
99534007 6886 const char *p, *v = ASSERT_PTR(value);
e8a565cb 6887 size_t n;
613b411c 6888
e8a565cb 6889 assert(m);
e8a565cb 6890 assert(fds);
98b47d54 6891
e8a565cb 6892 n = strcspn(v, " ");
2f82562b 6893 id = strndupa_safe(v, n);
e8a565cb
YW
6894 if (v[n] != ' ')
6895 goto finalize;
6896 p = v + n + 1;
6897
6898 v = startswith(p, "tmp-dir=");
6899 if (v) {
6900 n = strcspn(v, " ");
56a13a49
ZJS
6901 tmp_dir = strndup(v, n);
6902 if (!tmp_dir)
6903 return log_oom();
e8a565cb
YW
6904 if (v[n] != ' ')
6905 goto finalize;
6906 p = v + n + 1;
6907 }
6908
6909 v = startswith(p, "var-tmp-dir=");
6910 if (v) {
6911 n = strcspn(v, " ");
56a13a49
ZJS
6912 var_tmp_dir = strndup(v, n);
6913 if (!var_tmp_dir)
6914 return log_oom();
e8a565cb
YW
6915 if (v[n] != ' ')
6916 goto finalize;
6917 p = v + n + 1;
6918 }
6919
6920 v = startswith(p, "netns-socket-0=");
6921 if (v) {
6922 char *buf;
6923
6924 n = strcspn(v, " ");
2f82562b 6925 buf = strndupa_safe(v, n);
c413bb28 6926
e652663a
DT
6927 netns_fdpair[0] = parse_fd(buf);
6928 if (netns_fdpair[0] < 0)
6929 return log_debug_errno(netns_fdpair[0], "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6930 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6931 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6932 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6933 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6934 if (v[n] != ' ')
6935 goto finalize;
6936 p = v + n + 1;
613b411c
LP
6937 }
6938
e8a565cb
YW
6939 v = startswith(p, "netns-socket-1=");
6940 if (v) {
6941 char *buf;
98b47d54 6942
e8a565cb 6943 n = strcspn(v, " ");
2f82562b 6944 buf = strndupa_safe(v, n);
a70581ff 6945
e652663a
DT
6946 netns_fdpair[1] = parse_fd(buf);
6947 if (netns_fdpair[1] < 0)
6948 return log_debug_errno(netns_fdpair[1], "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6949 if (!fdset_contains(fds, netns_fdpair[1]))
6950 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6951 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6952 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6953 if (v[n] != ' ')
6954 goto finalize;
6955 p = v + n + 1;
6956 }
6957
6958 v = startswith(p, "ipcns-socket-0=");
6959 if (v) {
6960 char *buf;
6961
6962 n = strcspn(v, " ");
2f82562b 6963 buf = strndupa_safe(v, n);
a70581ff 6964
e652663a
DT
6965 ipcns_fdpair[0] = parse_fd(buf);
6966 if (ipcns_fdpair[0] < 0)
6967 return log_debug_errno(ipcns_fdpair[0], "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
a70581ff
XR
6968 if (!fdset_contains(fds, ipcns_fdpair[0]))
6969 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6970 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6971 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6972 if (v[n] != ' ')
6973 goto finalize;
6974 p = v + n + 1;
6975 }
6976
6977 v = startswith(p, "ipcns-socket-1=");
6978 if (v) {
6979 char *buf;
6980
6981 n = strcspn(v, " ");
2f82562b 6982 buf = strndupa_safe(v, n);
a70581ff 6983
e652663a
DT
6984 ipcns_fdpair[1] = parse_fd(buf);
6985 if (ipcns_fdpair[1] < 0)
6986 return log_debug_errno(ipcns_fdpair[1], "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
a70581ff 6987 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6988 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6989 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6990 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6991 }
98b47d54 6992
e8a565cb 6993finalize:
e76506b7 6994 r = exec_shared_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6995 if (r < 0)
56a13a49
ZJS
6996 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6997 return 0;
e8a565cb 6998}
613b411c 6999
e76506b7
DDM
7000void exec_shared_runtime_vacuum(Manager *m) {
7001 ExecSharedRuntime *rt;
e8a565cb
YW
7002
7003 assert(m);
7004
e76506b7 7005 /* Free unreferenced ExecSharedRuntime objects. This is used after manager deserialization process. */
e8a565cb 7006
e76506b7 7007 HASHMAP_FOREACH(rt, m->exec_shared_runtime_by_id) {
e8a565cb
YW
7008 if (rt->n_ref > 0)
7009 continue;
7010
e52a696a 7011 (void) exec_shared_runtime_free(rt);
e8a565cb 7012 }
613b411c
LP
7013}
7014
9c0c6701
DDM
7015int exec_runtime_make(
7016 const Unit *unit,
7017 const ExecContext *context,
7018 ExecSharedRuntime *shared,
7019 DynamicCreds *creds,
7020 ExecRuntime **ret) {
7021 _cleanup_close_pair_ int ephemeral_storage_socket[2] = PIPE_EBADF;
7022 _cleanup_free_ char *ephemeral = NULL;
28135da3 7023 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
9c0c6701 7024 int r;
28135da3 7025
9c0c6701
DDM
7026 assert(unit);
7027 assert(context);
28135da3
DDM
7028 assert(ret);
7029
9c0c6701 7030 if (!shared && !creds && !exec_needs_ephemeral(context)) {
28135da3
DDM
7031 *ret = NULL;
7032 return 0;
7033 }
7034
9c0c6701
DDM
7035 if (exec_needs_ephemeral(context)) {
7036 r = mkdir_p("/var/lib/systemd/ephemeral-trees", 0755);
7037 if (r < 0)
7038 return r;
7039
7040 r = tempfn_random_child("/var/lib/systemd/ephemeral-trees", unit->id, &ephemeral);
7041 if (r < 0)
7042 return r;
7043
7044 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ephemeral_storage_socket) < 0)
7045 return -errno;
7046 }
7047
28135da3
DDM
7048 rt = new(ExecRuntime, 1);
7049 if (!rt)
7050 return -ENOMEM;
7051
7052 *rt = (ExecRuntime) {
7053 .shared = shared,
15220772 7054 .dynamic_creds = creds,
9c0c6701
DDM
7055 .ephemeral_copy = TAKE_PTR(ephemeral),
7056 .ephemeral_storage_socket[0] = TAKE_FD(ephemeral_storage_socket[0]),
7057 .ephemeral_storage_socket[1] = TAKE_FD(ephemeral_storage_socket[1]),
28135da3
DDM
7058 };
7059
7060 *ret = TAKE_PTR(rt);
7061 return 1;
7062}
7063
7064ExecRuntime* exec_runtime_free(ExecRuntime *rt) {
7065 if (!rt)
7066 return NULL;
7067
7068 exec_shared_runtime_unref(rt->shared);
15220772 7069 dynamic_creds_unref(rt->dynamic_creds);
9c0c6701 7070
437f3e35 7071 rt->ephemeral_copy = destroy_tree(rt->ephemeral_copy);
9c0c6701 7072
9c0c6701 7073 safe_close_pair(rt->ephemeral_storage_socket);
28135da3
DDM
7074 return mfree(rt);
7075}
7076
7077ExecRuntime* exec_runtime_destroy(ExecRuntime *rt) {
7078 if (!rt)
7079 return NULL;
7080
7081 rt->shared = exec_shared_runtime_destroy(rt->shared);
15220772 7082 rt->dynamic_creds = dynamic_creds_destroy(rt->dynamic_creds);
28135da3
DDM
7083 return exec_runtime_free(rt);
7084}
7085
b9c04eaf
YW
7086void exec_params_clear(ExecParameters *p) {
7087 if (!p)
7088 return;
7089
c3f8a065
LP
7090 p->environment = strv_free(p->environment);
7091 p->fd_names = strv_free(p->fd_names);
154eb43f 7092 p->files_env = strv_free(p->files_env);
c3f8a065
LP
7093 p->fds = mfree(p->fds);
7094 p->exec_fd = safe_close(p->exec_fd);
154eb43f
LB
7095 p->user_lookup_fd = safe_close(p->user_lookup_fd);
7096 p->bpf_outer_map_fd = -EBADF;
b9c04eaf
YW
7097}
7098
211a3d87
LB
7099void exec_directory_done(ExecDirectory *d) {
7100 if (!d)
7101 return;
7102
7103 for (size_t i = 0; i < d->n_items; i++) {
7104 free(d->items[i].path);
7105 strv_free(d->items[i].symlinks);
7106 }
7107
7108 d->items = mfree(d->items);
7109 d->n_items = 0;
7110 d->mode = 0755;
7111}
7112
564e5c98
YW
7113static ExecDirectoryItem *exec_directory_find(ExecDirectory *d, const char *path) {
7114 assert(d);
7115 assert(path);
7116
7117 for (size_t i = 0; i < d->n_items; i++)
7118 if (path_equal(d->items[i].path, path))
7119 return &d->items[i];
7120
7121 return NULL;
7122}
7123
7124int exec_directory_add(ExecDirectory *d, const char *path, const char *symlink) {
211a3d87
LB
7125 _cleanup_strv_free_ char **s = NULL;
7126 _cleanup_free_ char *p = NULL;
564e5c98
YW
7127 ExecDirectoryItem *existing;
7128 int r;
211a3d87
LB
7129
7130 assert(d);
211a3d87
LB
7131 assert(path);
7132
564e5c98
YW
7133 existing = exec_directory_find(d, path);
7134 if (existing) {
7135 r = strv_extend(&existing->symlinks, symlink);
7136 if (r < 0)
7137 return r;
7138
7139 return 0; /* existing item is updated */
7140 }
7141
211a3d87
LB
7142 p = strdup(path);
7143 if (!p)
7144 return -ENOMEM;
7145
564e5c98
YW
7146 if (symlink) {
7147 s = strv_new(symlink);
211a3d87
LB
7148 if (!s)
7149 return -ENOMEM;
7150 }
7151
564e5c98 7152 if (!GREEDY_REALLOC(d->items, d->n_items + 1))
211a3d87
LB
7153 return -ENOMEM;
7154
564e5c98 7155 d->items[d->n_items++] = (ExecDirectoryItem) {
211a3d87
LB
7156 .path = TAKE_PTR(p),
7157 .symlinks = TAKE_PTR(s),
7158 };
7159
564e5c98 7160 return 1; /* new item is added */
211a3d87
LB
7161}
7162
a2ab603c
YW
7163static int exec_directory_item_compare_func(const ExecDirectoryItem *a, const ExecDirectoryItem *b) {
7164 assert(a);
7165 assert(b);
7166
7167 return path_compare(a->path, b->path);
7168}
7169
7170void exec_directory_sort(ExecDirectory *d) {
7171 assert(d);
7172
7173 /* Sort the exec directories to make always parent directories processed at first in
7174 * setup_exec_directory(), e.g., even if StateDirectory=foo/bar foo, we need to create foo at first,
7175 * then foo/bar. Also, set .only_create flag if one of the parent directories is contained in the
7176 * list. See also comments in setup_exec_directory() and issue #24783. */
7177
7178 if (d->n_items <= 1)
7179 return;
7180
7181 typesafe_qsort(d->items, d->n_items, exec_directory_item_compare_func);
7182
7183 for (size_t i = 1; i < d->n_items; i++)
7184 for (size_t j = 0; j < i; j++)
7185 if (path_startswith(d->items[i].path, d->items[j].path)) {
7186 d->items[i].only_create = true;
7187 break;
7188 }
211a3d87
LB
7189}
7190
4fb8f1e8
LP
7191ExecCleanMask exec_clean_mask_from_string(const char *s) {
7192 ExecDirectoryType t;
7193
7194 assert(s);
7195
7196 if (streq(s, "all"))
7197 return EXEC_CLEAN_ALL;
7198 if (streq(s, "fdstore"))
7199 return EXEC_CLEAN_FDSTORE;
7200
7201 t = exec_resource_type_from_string(s);
7202 if (t < 0)
7203 return (ExecCleanMask) t;
7204
7205 return 1U << t;
7206}
7207
80876c20
LP
7208static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
7209 [EXEC_INPUT_NULL] = "null",
7210 [EXEC_INPUT_TTY] = "tty",
7211 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 7212 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
7213 [EXEC_INPUT_SOCKET] = "socket",
7214 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 7215 [EXEC_INPUT_DATA] = "data",
2038c3f5 7216 [EXEC_INPUT_FILE] = "file",
80876c20
LP
7217};
7218
8a0867d6
LP
7219DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
7220
94f04347 7221static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 7222 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 7223 [EXEC_OUTPUT_NULL] = "null",
80876c20 7224 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 7225 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 7226 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
7227 [EXEC_OUTPUT_JOURNAL] = "journal",
7228 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
7229 [EXEC_OUTPUT_SOCKET] = "socket",
7230 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 7231 [EXEC_OUTPUT_FILE] = "file",
566b7d23 7232 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 7233 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
7234};
7235
7236DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
7237
7238static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
7239 [EXEC_UTMP_INIT] = "init",
7240 [EXEC_UTMP_LOGIN] = "login",
7241 [EXEC_UTMP_USER] = "user",
7242};
7243
7244DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
7245
7246static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
7247 [EXEC_PRESERVE_NO] = "no",
7248 [EXEC_PRESERVE_YES] = "yes",
7249 [EXEC_PRESERVE_RESTART] = "restart",
7250};
7251
7252DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 7253
6b7b2ed9 7254/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 7255static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
7256 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
7257 [EXEC_DIRECTORY_STATE] = "StateDirectory",
7258 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
7259 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
7260 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
7261};
7262
7263DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 7264
211a3d87
LB
7265/* This table maps ExecDirectoryType to the symlink setting it is configured with in the unit */
7266static const char* const exec_directory_type_symlink_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7267 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectorySymlink",
7268 [EXEC_DIRECTORY_STATE] = "StateDirectorySymlink",
7269 [EXEC_DIRECTORY_CACHE] = "CacheDirectorySymlink",
7270 [EXEC_DIRECTORY_LOGS] = "LogsDirectorySymlink",
7271 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectorySymlink",
7272};
7273
7274DEFINE_STRING_TABLE_LOOKUP(exec_directory_type_symlink, ExecDirectoryType);
7275
6b7b2ed9
LP
7276/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
7277 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
7278 * directories, specifically .timer units with their timestamp touch file. */
7279static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7280 [EXEC_DIRECTORY_RUNTIME] = "runtime",
7281 [EXEC_DIRECTORY_STATE] = "state",
7282 [EXEC_DIRECTORY_CACHE] = "cache",
7283 [EXEC_DIRECTORY_LOGS] = "logs",
7284 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
7285};
7286
7287DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
7288
7289/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
7290 * the service payload in. */
fb2042dd
YW
7291static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
7292 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
7293 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
7294 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
7295 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
7296 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
7297};
7298
7299DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
7300
b1edf445
LP
7301static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
7302 [EXEC_KEYRING_INHERIT] = "inherit",
7303 [EXEC_KEYRING_PRIVATE] = "private",
7304 [EXEC_KEYRING_SHARED] = "shared",
7305};
7306
7307DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);