]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
tree-wide: add missing whitespace at the end of comments
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
fdb3deca 46#include "cgroup-setup.h"
bb0c0d6f 47#include "chown-recursive.h"
da681e1b 48#include "cpu-set-util.h"
f6a6225e 49#include "def.h"
686d13b9 50#include "env-file.h"
4d1a6904 51#include "env-util.h"
17df7223 52#include "errno-list.h"
3ffd4af2 53#include "execute.h"
8dd4c05b 54#include "exit-status.h"
3ffd4af2 55#include "fd-util.h"
bb0c0d6f 56#include "fileio.h"
f97b34a6 57#include "format-util.h"
f4f15635 58#include "fs-util.h"
7d50b32a 59#include "glob-util.h"
0389f4fa 60#include "hexdecoct.h"
c004493c 61#include "io-util.h"
8dd4c05b 62#include "ioprio.h"
a1164ae3 63#include "label.h"
8dd4c05b
LP
64#include "log.h"
65#include "macro.h"
e8a565cb 66#include "manager.h"
2a341bb9 67#include "manager-dump.h"
0a970718 68#include "memory-util.h"
f5947a5e 69#include "missing_fs.h"
8dd4c05b 70#include "mkdir.h"
21935150 71#include "mount-util.h"
bb0c0d6f 72#include "mountpoint-util.h"
8dd4c05b 73#include "namespace.h"
6bedfcbb 74#include "parse-util.h"
8dd4c05b 75#include "path-util.h"
0b452006 76#include "process-util.h"
d3dcf4e3 77#include "random-util.h"
78f22b97 78#include "rlimit-util.h"
8dd4c05b 79#include "rm-rf.h"
349cc4a5 80#if HAVE_SECCOMP
3ffd4af2
LP
81#include "seccomp-util.h"
82#endif
07d46372 83#include "securebits-util.h"
8dd4c05b 84#include "selinux-util.h"
24882e06 85#include "signal-util.h"
8dd4c05b 86#include "smack-util.h"
57b7a260 87#include "socket-util.h"
fd63e712 88#include "special.h"
949befd3 89#include "stat-util.h"
8b43440b 90#include "string-table.h"
07630cea 91#include "string-util.h"
8dd4c05b 92#include "strv.h"
7ccbd1ae 93#include "syslog-util.h"
8dd4c05b 94#include "terminal-util.h"
bb0c0d6f 95#include "tmpfile-util.h"
566b7d23 96#include "umask-util.h"
2d3b784d 97#include "unit-serialize.h"
b1d4f8e1 98#include "user-util.h"
8dd4c05b 99#include "utmp-wtmp.h"
5cb5a6ff 100
/* Idle timeouts, expressed in microseconds. NOTE(review): presumably consumed by the idle-pipe
 * handling further down in this file, which is not visible here; confirm before changing. */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size we ask for (best-effort, via fd_inc_sndbuf()) on the journal stream socket. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
105
/* Rearranges the passed file descriptors so that fds[i] ends up as fd number i+3, i.e. directly
 * behind stdin/stdout/stderr. The array is updated in place with the new fd numbers.
 *
 * Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! (sorts it) */

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, dup_fd;

                        /* Nothing to do if the fd already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        dup_fd = fcntl(fds[idx], F_DUPFD, wanted);
                        if (dup_fd < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = dup_fd;

                        /* The slot we wanted was still occupied, so we got some other fd. Remember
                         * the first index where that happened and have another go from there. */
                        if (dup_fd != wanted && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
145
/* Adjusts the file descriptor flags of the fds we are about to pass to a child: the first
 * n_socket_fds entries get O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is
 * turned off on all n_socket_fds + n_storage_fds entries.
 *
 * Returns 0 on success, or the negative error of the first helper that failed. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t idx = 0; idx < total; idx++) {

                /* O_NONBLOCK only applies to the socket activation fds, which come first */
                if (idx < n_socket_fds) {
                        r = fd_nonblock(fds[idx], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped unconditionally, after all these fds are supposed to
                 * survive into our children */
                r = fd_cloexec(fds[idx], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
178
1e22b5cd 179static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
180 assert(context);
181
1e22b5cd
LP
182 if (context->stdio_as_fds)
183 return NULL;
184
80876c20
LP
185 if (context->tty_path)
186 return context->tty_path;
187
188 return "/dev/console";
189}
190
1e22b5cd
LP
191static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
192 const char *path;
193
6ea832a2
LP
194 assert(context);
195
1e22b5cd 196 path = exec_context_tty_path(context);
6ea832a2 197
1e22b5cd
LP
198 if (context->tty_vhangup) {
199 if (p && p->stdin_fd >= 0)
200 (void) terminal_vhangup_fd(p->stdin_fd);
201 else if (path)
202 (void) terminal_vhangup(path);
203 }
6ea832a2 204
1e22b5cd
LP
205 if (context->tty_reset) {
206 if (p && p->stdin_fd >= 0)
207 (void) reset_terminal_fd(p->stdin_fd, true);
208 else if (path)
209 (void) reset_terminal(path);
210 }
211
212 if (context->tty_vt_disallocate && path)
213 (void) vt_disallocate(path);
6ea832a2
LP
214}
215
6af760f3
LP
216static bool is_terminal_input(ExecInput i) {
217 return IN_SET(i,
218 EXEC_INPUT_TTY,
219 EXEC_INPUT_TTY_FORCE,
220 EXEC_INPUT_TTY_FAIL);
221}
222
3a1286b6 223static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
224 return IN_SET(o,
225 EXEC_OUTPUT_TTY,
6af760f3
LP
226 EXEC_OUTPUT_KMSG_AND_CONSOLE,
227 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
228}
229
aac8c0c3
LP
230static bool is_kmsg_output(ExecOutput o) {
231 return IN_SET(o,
232 EXEC_OUTPUT_KMSG,
233 EXEC_OUTPUT_KMSG_AND_CONSOLE);
234}
235
6af760f3
LP
236static bool exec_context_needs_term(const ExecContext *c) {
237 assert(c);
238
239 /* Return true if the execution context suggests we should set $TERM to something useful. */
240
241 if (is_terminal_input(c->std_input))
242 return true;
243
244 if (is_terminal_output(c->std_output))
245 return true;
246
247 if (is_terminal_output(c->std_error))
248 return true;
249
250 return !!c->tty_path;
3a1286b6
MS
251}
252
80876c20 253static int open_null_as(int flags, int nfd) {
046a82c1 254 int fd;
071830ff 255
80876c20 256 assert(nfd >= 0);
071830ff 257
613b411c
LP
258 fd = open("/dev/null", flags|O_NOCTTY);
259 if (fd < 0)
071830ff
LP
260 return -errno;
261
046a82c1 262 return move_fd(fd, nfd, false);
071830ff
LP
263}
264
91dd5f7c
LP
265static int connect_journal_socket(
266 int fd,
267 const char *log_namespace,
268 uid_t uid,
269 gid_t gid) {
270
f36a9d59
ZJS
271 union sockaddr_union sa;
272 socklen_t sa_len;
524daa8c
ZJS
273 uid_t olduid = UID_INVALID;
274 gid_t oldgid = GID_INVALID;
91dd5f7c 275 const char *j;
524daa8c
ZJS
276 int r;
277
91dd5f7c
LP
278 j = log_namespace ?
279 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
280 "/run/systemd/journal/stdout";
281 r = sockaddr_un_set_path(&sa.un, j);
282 if (r < 0)
283 return r;
f36a9d59 284 sa_len = r;
91dd5f7c 285
cad93f29 286 if (gid_is_valid(gid)) {
524daa8c
ZJS
287 oldgid = getgid();
288
92a17af9 289 if (setegid(gid) < 0)
524daa8c
ZJS
290 return -errno;
291 }
292
cad93f29 293 if (uid_is_valid(uid)) {
524daa8c
ZJS
294 olduid = getuid();
295
92a17af9 296 if (seteuid(uid) < 0) {
524daa8c
ZJS
297 r = -errno;
298 goto restore_gid;
299 }
300 }
301
f36a9d59 302 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
524daa8c
ZJS
303
304 /* If we fail to restore the uid or gid, things will likely
305 fail later on. This should only happen if an LSM interferes. */
306
cad93f29 307 if (uid_is_valid(uid))
524daa8c
ZJS
308 (void) seteuid(olduid);
309
310 restore_gid:
cad93f29 311 if (gid_is_valid(gid))
524daa8c
ZJS
312 (void) setegid(oldgid);
313
314 return r;
315}
316
fd1f9c89 317static int connect_logger_as(
34cf6c43 318 const Unit *unit,
fd1f9c89 319 const ExecContext *context,
af635cf3 320 const ExecParameters *params,
fd1f9c89
LP
321 ExecOutput output,
322 const char *ident,
fd1f9c89
LP
323 int nfd,
324 uid_t uid,
325 gid_t gid) {
326
2ac1ff68
EV
327 _cleanup_close_ int fd = -1;
328 int r;
071830ff
LP
329
330 assert(context);
af635cf3 331 assert(params);
80876c20
LP
332 assert(output < _EXEC_OUTPUT_MAX);
333 assert(ident);
334 assert(nfd >= 0);
071830ff 335
54fe0cdb
LP
336 fd = socket(AF_UNIX, SOCK_STREAM, 0);
337 if (fd < 0)
80876c20 338 return -errno;
071830ff 339
91dd5f7c 340 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
341 if (r < 0)
342 return r;
071830ff 343
2ac1ff68 344 if (shutdown(fd, SHUT_RD) < 0)
80876c20 345 return -errno;
071830ff 346
fd1f9c89 347 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 348
2ac1ff68 349 if (dprintf(fd,
62bca2c6 350 "%s\n"
80876c20
LP
351 "%s\n"
352 "%i\n"
54fe0cdb
LP
353 "%i\n"
354 "%i\n"
355 "%i\n"
4f4a1dbf 356 "%i\n",
c867611e 357 context->syslog_identifier ?: ident,
af635cf3 358 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
359 context->syslog_priority,
360 !!context->syslog_level_prefix,
f3dc6af2 361 false,
aac8c0c3 362 is_kmsg_output(output),
2ac1ff68
EV
363 is_terminal_output(output)) < 0)
364 return -errno;
80876c20 365
2ac1ff68 366 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 367}
2ac1ff68 368
3a274a21 369static int open_terminal_as(const char *path, int flags, int nfd) {
046a82c1 370 int fd;
071830ff 371
80876c20
LP
372 assert(path);
373 assert(nfd >= 0);
fd1f9c89 374
3a274a21 375 fd = open_terminal(path, flags | O_NOCTTY);
3cc2aff1 376 if (fd < 0)
80876c20 377 return fd;
071830ff 378
046a82c1 379 return move_fd(fd, nfd, false);
80876c20 380}
071830ff 381
2038c3f5 382static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
383 union sockaddr_union sa;
384 socklen_t sa_len;
15a3e96f 385 _cleanup_close_ int fd = -1;
86fca584 386 int r;
071830ff 387
80876c20 388 assert(path);
071830ff 389
2038c3f5
LP
390 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
391 flags |= O_CREAT;
392
393 fd = open(path, flags|O_NOCTTY, mode);
394 if (fd >= 0)
15a3e96f 395 return TAKE_FD(fd);
071830ff 396
2038c3f5
LP
397 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
398 return -errno;
2038c3f5
LP
399
400 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
401
86fca584
ZJS
402 r = sockaddr_un_set_path(&sa.un, path);
403 if (r < 0)
404 return r == -EINVAL ? -ENXIO : r;
405 sa_len = r;
406
2038c3f5
LP
407 fd = socket(AF_UNIX, SOCK_STREAM, 0);
408 if (fd < 0)
409 return -errno;
410
86fca584 411 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 412 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 413 * indication that this wasn't an AF_UNIX socket after all */
071830ff 414
2038c3f5
LP
415 if ((flags & O_ACCMODE) == O_RDONLY)
416 r = shutdown(fd, SHUT_WR);
417 else if ((flags & O_ACCMODE) == O_WRONLY)
418 r = shutdown(fd, SHUT_RD);
419 else
86fca584 420 r = 0;
15a3e96f 421 if (r < 0)
2038c3f5 422 return -errno;
2038c3f5 423
15a3e96f 424 return TAKE_FD(fd);
80876c20 425}
071830ff 426
08f3be7a
LP
427static int fixup_input(
428 const ExecContext *context,
429 int socket_fd,
430 bool apply_tty_stdin) {
431
432 ExecInput std_input;
433
434 assert(context);
435
436 std_input = context->std_input;
1e3ad081
LP
437
438 if (is_terminal_input(std_input) && !apply_tty_stdin)
439 return EXEC_INPUT_NULL;
071830ff 440
03fd9c49 441 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
442 return EXEC_INPUT_NULL;
443
08f3be7a
LP
444 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
445 return EXEC_INPUT_NULL;
446
03fd9c49 447 return std_input;
4f2d528d
LP
448}
449
7966a916 450static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 451
7966a916 452 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
453 return EXEC_OUTPUT_INHERIT;
454
7966a916 455 return output;
4f2d528d
LP
456}
457
a34ceba6
LP
458static int setup_input(
459 const ExecContext *context,
460 const ExecParameters *params,
52c239d7 461 int socket_fd,
2caa38e9 462 const int named_iofds[static 3]) {
a34ceba6 463
4f2d528d
LP
464 ExecInput i;
465
466 assert(context);
a34ceba6 467 assert(params);
2caa38e9 468 assert(named_iofds);
a34ceba6
LP
469
470 if (params->stdin_fd >= 0) {
471 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
472 return -errno;
473
474 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
475 if (isatty(STDIN_FILENO)) {
476 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
477 (void) reset_terminal_fd(STDIN_FILENO, true);
478 }
a34ceba6
LP
479
480 return STDIN_FILENO;
481 }
4f2d528d 482
08f3be7a 483 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
484
485 switch (i) {
071830ff 486
80876c20
LP
487 case EXEC_INPUT_NULL:
488 return open_null_as(O_RDONLY, STDIN_FILENO);
489
490 case EXEC_INPUT_TTY:
491 case EXEC_INPUT_TTY_FORCE:
492 case EXEC_INPUT_TTY_FAIL: {
046a82c1 493 int fd;
071830ff 494
1e22b5cd 495 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
496 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
497 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
498 ACQUIRE_TERMINAL_WAIT,
3a43da28 499 USEC_INFINITY);
970edce6 500 if (fd < 0)
80876c20
LP
501 return fd;
502
046a82c1 503 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
504 }
505
4f2d528d 506 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
507 assert(socket_fd >= 0);
508
4f2d528d
LP
509 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
510
52c239d7 511 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
512 assert(named_iofds[STDIN_FILENO] >= 0);
513
52c239d7
LB
514 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
515 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
516
08f3be7a
LP
517 case EXEC_INPUT_DATA: {
518 int fd;
519
520 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
521 if (fd < 0)
522 return fd;
523
524 return move_fd(fd, STDIN_FILENO, false);
525 }
526
2038c3f5
LP
527 case EXEC_INPUT_FILE: {
528 bool rw;
529 int fd;
530
531 assert(context->stdio_file[STDIN_FILENO]);
532
533 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
534 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
535
536 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
537 if (fd < 0)
538 return fd;
539
540 return move_fd(fd, STDIN_FILENO, false);
541 }
542
80876c20
LP
543 default:
544 assert_not_reached("Unknown input type");
545 }
546}
547
41fc585a
LP
548static bool can_inherit_stderr_from_stdout(
549 const ExecContext *context,
550 ExecOutput o,
551 ExecOutput e) {
552
553 assert(context);
554
555 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
556 * stderr fd */
557
558 if (e == EXEC_OUTPUT_INHERIT)
559 return true;
560 if (e != o)
561 return false;
562
563 if (e == EXEC_OUTPUT_NAMED_FD)
564 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
565
8d7dab1f 566 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
567 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
568
569 return true;
570}
571
a34ceba6 572static int setup_output(
34cf6c43 573 const Unit *unit,
a34ceba6
LP
574 const ExecContext *context,
575 const ExecParameters *params,
576 int fileno,
577 int socket_fd,
2caa38e9 578 const int named_iofds[static 3],
a34ceba6 579 const char *ident,
7bce046b
LP
580 uid_t uid,
581 gid_t gid,
582 dev_t *journal_stream_dev,
583 ino_t *journal_stream_ino) {
a34ceba6 584
4f2d528d
LP
585 ExecOutput o;
586 ExecInput i;
47c1d80d 587 int r;
4f2d528d 588
f2341e0a 589 assert(unit);
80876c20 590 assert(context);
a34ceba6 591 assert(params);
80876c20 592 assert(ident);
7bce046b
LP
593 assert(journal_stream_dev);
594 assert(journal_stream_ino);
80876c20 595
a34ceba6
LP
596 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
597
598 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
599 return -errno;
600
601 return STDOUT_FILENO;
602 }
603
604 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
605 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
606 return -errno;
607
608 return STDERR_FILENO;
609 }
610
08f3be7a 611 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 612 o = fixup_output(context->std_output, socket_fd);
4f2d528d 613
eb17e935
MS
614 if (fileno == STDERR_FILENO) {
615 ExecOutput e;
616 e = fixup_output(context->std_error, socket_fd);
80876c20 617
eb17e935
MS
618 /* This expects the input and output are already set up */
619
620 /* Don't change the stderr file descriptor if we inherit all
621 * the way and are not on a tty */
622 if (e == EXEC_OUTPUT_INHERIT &&
623 o == EXEC_OUTPUT_INHERIT &&
624 i == EXEC_INPUT_NULL &&
625 !is_terminal_input(context->std_input) &&
7966a916 626 getppid() != 1)
eb17e935
MS
627 return fileno;
628
629 /* Duplicate from stdout if possible */
41fc585a 630 if (can_inherit_stderr_from_stdout(context, o, e))
eb17e935 631 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 632
eb17e935 633 o = e;
80876c20 634
eb17e935 635 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
636 /* If input got downgraded, inherit the original value */
637 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 638 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 639
08f3be7a
LP
640 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
641 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 642 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 643
acb591e4
LP
644 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
645 if (getppid() != 1)
eb17e935 646 return fileno;
94f04347 647
eb17e935
MS
648 /* We need to open /dev/null here anew, to get the right access mode. */
649 return open_null_as(O_WRONLY, fileno);
071830ff 650 }
94f04347 651
eb17e935 652 switch (o) {
80876c20
LP
653
654 case EXEC_OUTPUT_NULL:
eb17e935 655 return open_null_as(O_WRONLY, fileno);
80876c20
LP
656
657 case EXEC_OUTPUT_TTY:
4f2d528d 658 if (is_terminal_input(i))
eb17e935 659 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
660
661 /* We don't reset the terminal if this is just about output */
1e22b5cd 662 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 663
9a6bca7a 664 case EXEC_OUTPUT_KMSG:
28dbc1e8 665 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
666 case EXEC_OUTPUT_JOURNAL:
667 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 668 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 669 if (r < 0) {
7966a916
ZJS
670 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
671 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 672 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
673 } else {
674 struct stat st;
675
676 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
677 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
678 * services to detect whether they are connected to the journal or not.
679 *
680 * If both stdout and stderr are connected to a stream then let's make sure to store the data
681 * about STDERR as that's usually the best way to do logging. */
7bce046b 682
ab2116b1
LP
683 if (fstat(fileno, &st) >= 0 &&
684 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
685 *journal_stream_dev = st.st_dev;
686 *journal_stream_ino = st.st_ino;
687 }
47c1d80d
MS
688 }
689 return r;
4f2d528d
LP
690
691 case EXEC_OUTPUT_SOCKET:
692 assert(socket_fd >= 0);
e75a9ed1 693
eb17e935 694 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 695
52c239d7 696 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
697 assert(named_iofds[fileno] >= 0);
698
52c239d7
LB
699 (void) fd_nonblock(named_iofds[fileno], false);
700 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
701
566b7d23 702 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
703 case EXEC_OUTPUT_FILE_APPEND:
704 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 705 bool rw;
566b7d23 706 int fd, flags;
2038c3f5
LP
707
708 assert(context->stdio_file[fileno]);
709
710 rw = context->std_input == EXEC_INPUT_FILE &&
711 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
712
713 if (rw)
714 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
715
566b7d23
ZD
716 flags = O_WRONLY;
717 if (o == EXEC_OUTPUT_FILE_APPEND)
718 flags |= O_APPEND;
8d7dab1f
LW
719 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
720 flags |= O_TRUNC;
566b7d23
ZD
721
722 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
723 if (fd < 0)
724 return fd;
725
566b7d23 726 return move_fd(fd, fileno, 0);
2038c3f5
LP
727 }
728
94f04347 729 default:
80876c20 730 assert_not_reached("Unknown error type");
94f04347 731 }
071830ff
LP
732}
733
02a51aba 734static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 735 int r;
02a51aba
LP
736
737 assert(fd >= 0);
02a51aba 738
1ff74fb6 739 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
740 if (isatty(fd) < 1) {
741 if (IN_SET(errno, EINVAL, ENOTTY))
742 return 0; /* not a tty */
1ff74fb6 743
02a51aba 744 return -errno;
4b3b5bc7 745 }
02a51aba 746
4b3b5bc7 747 /* This might fail. What matters are the results. */
f2df231f 748 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
749 if (r < 0)
750 return r;
02a51aba 751
4b3b5bc7 752 return 1;
02a51aba
LP
753}
754
7d5ceb64 755static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
756 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
757 int r;
80876c20 758
80876c20
LP
759 assert(_saved_stdin);
760 assert(_saved_stdout);
761
af6da548
LP
762 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
763 if (saved_stdin < 0)
764 return -errno;
80876c20 765
af6da548 766 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
767 if (saved_stdout < 0)
768 return -errno;
80876c20 769
8854d795 770 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
771 if (fd < 0)
772 return fd;
80876c20 773
af6da548
LP
774 r = chown_terminal(fd, getuid());
775 if (r < 0)
3d18b167 776 return r;
02a51aba 777
3d18b167
LP
778 r = reset_terminal_fd(fd, true);
779 if (r < 0)
780 return r;
80876c20 781
2b33ab09 782 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 783 fd = -1;
2b33ab09
LP
784 if (r < 0)
785 return r;
80876c20
LP
786
787 *_saved_stdin = saved_stdin;
788 *_saved_stdout = saved_stdout;
789
3d18b167 790 saved_stdin = saved_stdout = -1;
80876c20 791
3d18b167 792 return 0;
80876c20
LP
793}
794
63d77c92 795static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
796 assert(err < 0);
797
798 if (err == -ETIMEDOUT)
63d77c92 799 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
800 else {
801 errno = -err;
63d77c92 802 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
803 }
804}
805
63d77c92 806static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 807 _cleanup_close_ int fd = -1;
80876c20 808
3b20f877 809 assert(vc);
80876c20 810
7d5ceb64 811 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 812 if (fd < 0)
3b20f877 813 return;
80876c20 814
63d77c92 815 write_confirm_error_fd(err, fd, u);
af6da548 816}
80876c20 817
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved fds (if valid) back in
 * place as stdin and stdout, and closes the saved copies.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int r = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                r = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                r = -errno;

        /* Store what safe_close() returns back into the caller's variables */
        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return r;
}
839
3b20f877
FB
/* Possible results of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1,   /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0,   /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1,   /* go ahead and execute the command */
};
845
eedf223a 846static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 847 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 848 _cleanup_free_ char *e = NULL;
3b20f877 849 char c;
af6da548 850
3b20f877 851 /* For any internal errors, assume a positive response. */
7d5ceb64 852 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 853 if (r < 0) {
63d77c92 854 write_confirm_error(r, vc, u);
3b20f877
FB
855 return CONFIRM_EXECUTE;
856 }
af6da548 857
b0eb2944
FB
858 /* confirm_spawn might have been disabled while we were sleeping. */
859 if (manager_is_confirm_spawn_disabled(u->manager)) {
860 r = 1;
861 goto restore_stdio;
862 }
af6da548 863
2bcd3c26
FB
864 e = ellipsize(cmdline, 60, 100);
865 if (!e) {
866 log_oom();
867 r = CONFIRM_EXECUTE;
868 goto restore_stdio;
869 }
af6da548 870
d172b175 871 for (;;) {
539622bd 872 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 873 if (r < 0) {
63d77c92 874 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
875 r = CONFIRM_EXECUTE;
876 goto restore_stdio;
877 }
af6da548 878
d172b175 879 switch (c) {
b0eb2944
FB
880 case 'c':
881 printf("Resuming normal execution.\n");
882 manager_disable_confirm_spawn();
883 r = 1;
884 break;
dd6f9ac0
FB
885 case 'D':
886 unit_dump(u, stdout, " ");
887 continue; /* ask again */
d172b175
FB
888 case 'f':
889 printf("Failing execution.\n");
890 r = CONFIRM_PRETEND_FAILURE;
891 break;
892 case 'h':
b0eb2944
FB
893 printf(" c - continue, proceed without asking anymore\n"
894 " D - dump, show the state of the unit\n"
dd6f9ac0 895 " f - fail, don't execute the command and pretend it failed\n"
d172b175 896 " h - help\n"
eedf223a 897 " i - info, show a short summary of the unit\n"
56fde33a 898 " j - jobs, show jobs that are in progress\n"
d172b175
FB
899 " s - skip, don't execute the command and pretend it succeeded\n"
900 " y - yes, execute the command\n");
dd6f9ac0 901 continue; /* ask again */
eedf223a
FB
902 case 'i':
903 printf(" Description: %s\n"
904 " Unit: %s\n"
905 " Command: %s\n",
906 u->id, u->description, cmdline);
907 continue; /* ask again */
56fde33a
FB
908 case 'j':
909 manager_dump_jobs(u->manager, stdout, " ");
910 continue; /* ask again */
539622bd
FB
911 case 'n':
912 /* 'n' was removed in favor of 'f'. */
913 printf("Didn't understand 'n', did you mean 'f'?\n");
914 continue; /* ask again */
d172b175
FB
915 case 's':
916 printf("Skipping execution.\n");
917 r = CONFIRM_PRETEND_SUCCESS;
918 break;
919 case 'y':
920 r = CONFIRM_EXECUTE;
921 break;
922 default:
923 assert_not_reached("Unhandled choice");
924 }
3b20f877 925 break;
3b20f877 926 }
af6da548 927
3b20f877 928restore_stdio:
af6da548 929 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 930 return r;
80876c20
LP
931}
932
4d885bd3
DH
933static int get_fixed_user(const ExecContext *c, const char **user,
934 uid_t *uid, gid_t *gid,
935 const char **home, const char **shell) {
81a2b7ce 936 int r;
4d885bd3 937 const char *name;
81a2b7ce 938
4d885bd3 939 assert(c);
81a2b7ce 940
23deef88
LP
941 if (!c->user)
942 return 0;
943
4d885bd3
DH
944 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
945 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 946
23deef88 947 name = c->user;
fafff8f1 948 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
949 if (r < 0)
950 return r;
81a2b7ce 951
4d885bd3
DH
952 *user = name;
953 return 0;
954}
955
956static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
957 int r;
958 const char *name;
959
960 assert(c);
961
962 if (!c->group)
963 return 0;
964
965 name = c->group;
fafff8f1 966 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
967 if (r < 0)
968 return r;
969
970 *group = name;
971 return 0;
972}
973
cdc5d5c5
DH
974static int get_supplementary_groups(const ExecContext *c, const char *user,
975 const char *group, gid_t gid,
976 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
977 char **i;
978 int r, k = 0;
979 int ngroups_max;
980 bool keep_groups = false;
981 gid_t *groups = NULL;
982 _cleanup_free_ gid_t *l_gids = NULL;
983
984 assert(c);
985
bbeea271
DH
986 /*
987 * If user is given, then lookup GID and supplementary groups list.
988 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
989 * here and as early as possible so we keep the list of supplementary
990 * groups of the caller.
bbeea271
DH
991 */
992 if (user && gid_is_valid(gid) && gid != 0) {
993 /* First step, initialize groups from /etc/groups */
994 if (initgroups(user, gid) < 0)
995 return -errno;
996
997 keep_groups = true;
998 }
999
ac6e8be6 1000 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1001 return 0;
1002
366ddd25
DH
1003 /*
1004 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1005 * be positive, otherwise fail.
1006 */
1007 errno = 0;
1008 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1009 if (ngroups_max <= 0)
1010 return errno_or_else(EOPNOTSUPP);
366ddd25 1011
4d885bd3
DH
1012 l_gids = new(gid_t, ngroups_max);
1013 if (!l_gids)
1014 return -ENOMEM;
81a2b7ce 1015
4d885bd3
DH
1016 if (keep_groups) {
1017 /*
1018 * Lookup the list of groups that the user belongs to, we
1019 * avoid NSS lookups here too for gid=0.
1020 */
1021 k = ngroups_max;
1022 if (getgrouplist(user, gid, l_gids, &k) < 0)
1023 return -EINVAL;
1024 } else
1025 k = 0;
81a2b7ce 1026
4d885bd3
DH
1027 STRV_FOREACH(i, c->supplementary_groups) {
1028 const char *g;
81a2b7ce 1029
4d885bd3
DH
1030 if (k >= ngroups_max)
1031 return -E2BIG;
81a2b7ce 1032
4d885bd3 1033 g = *i;
fafff8f1 1034 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1035 if (r < 0)
1036 return r;
81a2b7ce 1037
4d885bd3
DH
1038 k++;
1039 }
81a2b7ce 1040
4d885bd3
DH
1041 /*
1042 * Sets ngids to zero to drop all supplementary groups, happens
1043 * when we are under root and SupplementaryGroups= is empty.
1044 */
1045 if (k == 0) {
1046 *ngids = 0;
1047 return 0;
1048 }
81a2b7ce 1049
4d885bd3
DH
1050 /* Otherwise get the final list of supplementary groups */
1051 groups = memdup(l_gids, sizeof(gid_t) * k);
1052 if (!groups)
1053 return -ENOMEM;
1054
1055 *supplementary_gids = groups;
1056 *ngids = k;
1057
1058 groups = NULL;
1059
1060 return 0;
1061}
1062
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        /* Installs the supplementary group list (only if it is non-empty) and then switches the real,
         * effective and saved GID to 'gid' (only if 'gid' is valid). Returns 0 on success, the error
         * returned by maybe_setgroups(), or -errno if setresgid() fails. */

        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* Nothing more to do if no primary group was requested */
        if (!gid_is_valid(gid))
                return 0;

        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1081
dbdc4098
TK
static int set_securebits(int bits, int mask) {
        /* Clears the securebits selected by 'mask', ORs in 'bits' and applies the result to the calling
         * process. Returns 1 if the securebits were changed, 0 if they already had the requested value,
         * and -errno if either prctl() call fails. */
        int before, wanted;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        wanted = (before & ~mask) | bits;
        if (wanted != before) {
                if (prctl(PR_SET_SECUREBITS, wanted) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1095
81a2b7ce 1096static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1097 assert(context);
dbdc4098 1098 int r;
81a2b7ce 1099
4d885bd3
DH
1100 if (!uid_is_valid(uid))
1101 return 0;
1102
479050b3 1103 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1104 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1105 * required, so we also need keep-caps in this case.
1106 */
81a2b7ce 1107
dbdc4098 1108 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1109
1110 /* First step: If we need to keep capabilities but
1111 * drop privileges we need to make sure we keep our
cbb21cca 1112 * caps, while we drop privileges. */
693ced48 1113 if (uid != 0) {
dbdc4098
TK
1114 /* Add KEEP_CAPS to the securebits */
1115 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1116 if (r < 0)
1117 return r;
693ced48 1118 }
81a2b7ce
LP
1119 }
1120
479050b3 1121 /* Second step: actually set the uids */
81a2b7ce
LP
1122 if (setresuid(uid, uid, uid) < 0)
1123 return -errno;
1124
1125 /* At this point we should have all necessary capabilities but
1126 are otherwise a normal user. However, the caps might got
1127 corrupted due to the setresuid() so we need clean them up
1128 later. This is done outside of this call. */
1129
1130 return 0;
1131}
1132
349cc4a5 1133#if HAVE_PAM
5b6319dc
LP
1134
1135static int null_conv(
1136 int num_msg,
1137 const struct pam_message **msg,
1138 struct pam_response **resp,
1139 void *appdata_ptr) {
1140
1141 /* We don't support conversations */
1142
1143 return PAM_CONV_ERR;
1144}
1145
cefc33ae
LP
1146#endif
1147
5b6319dc
LP
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

        /* Opens a PAM session for service 'name' and user 'user' and forks off a "(sd-pam)" helper
         * process that tears the session down again later.
         *
         *   name, user   PAM service name and user name (both must be non-NULL)
         *   uid, gid     credentials the helper process drops to
         *   tty          TTY to report to PAM; if NULL, the TTY of stdin is used, if there is one
         *   env          in: variables exported to PAM; out: replaced by the environment list PAM
         *                returns once the session is open
         *   fds, n_fds   file descriptors that are closed in the helper process
         *
         * Returns the result of strv_free_and_replace() on success and a negative errno-style value
         * on failure (-EPERM for any PAM-level error). When built without PAM this is a NOP that
         * returns 0. */

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export the caller's environment to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child after all: undo the SIGTERM blocking from above, so that we don't return
                 * to the caller with a modified signal mask. */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* Note that sigwait() reports failure by returning a positive error
                                 * number, it neither returns a negative value nor sets errno. */
                                k = sigwait(&ss, &sig);
                                if (k != 0) {
                                        if (k == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment list acquired from PAM over to the caller */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1371
5d6b1584
LP
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *tail;
        size_t n;

        /* Renames the process to "(<name>)", where <name> consists of (at most) the last 8 characters
         * of the file name of 'path'. The result must fit in 10 chars (i.e. the length of
         * "/sbin/init") to look pretty in /bin/ps. If there is no file name, "(...)" is used. */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > 8) {
                /* Keep the end of the name, it is usually more interesting, since the beginning
                 * might just be "systemd-" */
                tail += n - 8;
                n = 8;
        }

        /* "(" + up to 8 characters + ")" + NUL: 11 bytes at most */
        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1402
469830d1
LP
1403static bool context_has_address_families(const ExecContext *c) {
1404 assert(c);
1405
6b000af4 1406 return c->address_families_allow_list ||
469830d1
LP
1407 !set_isempty(c->address_families);
1408}
1409
1410static bool context_has_syscall_filters(const ExecContext *c) {
1411 assert(c);
1412
6b000af4 1413 return c->syscall_allow_list ||
8cfa775f 1414 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1415}
1416
9df2cdd8
TM
1417static bool context_has_syscall_logs(const ExecContext *c) {
1418 assert(c);
1419
1420 return c->syscall_log_allow_list ||
1421 !hashmap_isempty(c->syscall_log);
1422}
1423
469830d1
LP
1424static bool context_has_no_new_privileges(const ExecContext *c) {
1425 assert(c);
1426
1427 if (c->no_new_privileges)
1428 return true;
1429
1430 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1431 return false;
1432
1433 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1434 return c->lock_personality ||
469830d1 1435 c->memory_deny_write_execute ||
0538d2a8 1436 c->private_devices ||
fc64760d 1437 c->protect_clock ||
0538d2a8 1438 c->protect_hostname ||
469830d1
LP
1439 c->protect_kernel_tunables ||
1440 c->protect_kernel_modules ||
84703040 1441 c->protect_kernel_logs ||
0538d2a8
YW
1442 context_has_address_families(c) ||
1443 exec_context_restrict_namespaces_set(c) ||
1444 c->restrict_realtime ||
1445 c->restrict_suid_sgid ||
78e864e5 1446 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1447 context_has_syscall_filters(c) ||
1448 context_has_syscall_logs(c);
469830d1
LP
1449}
1450
bb0c0d6f
LP
1451static bool exec_context_has_credentials(const ExecContext *context) {
1452
1453 assert(context);
1454
1455 return !hashmap_isempty(context->set_credentials) ||
1456 context->load_credentials;
1457}
1458
349cc4a5 1459#if HAVE_SECCOMP
17df7223 1460
83f12b27 1461static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1462
1463 if (is_seccomp_available())
1464 return false;
1465
f673b62d 1466 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1467 return true;
83f12b27
FS
1468}
1469
165a31c0 1470static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1471 uint32_t negative_action, default_action, action;
165a31c0 1472 int r;
8351ceae 1473
469830d1 1474 assert(u);
c0467cf3 1475 assert(c);
8351ceae 1476
469830d1 1477 if (!context_has_syscall_filters(c))
83f12b27
FS
1478 return 0;
1479
469830d1
LP
1480 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1481 return 0;
e9642be2 1482
005bfaf1 1483 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1484
6b000af4 1485 if (c->syscall_allow_list) {
469830d1
LP
1486 default_action = negative_action;
1487 action = SCMP_ACT_ALLOW;
7c66bae2 1488 } else {
469830d1
LP
1489 default_action = SCMP_ACT_ALLOW;
1490 action = negative_action;
57183d11 1491 }
8351ceae 1492
165a31c0 1493 if (needs_ambient_hack) {
6b000af4 1494 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1495 if (r < 0)
1496 return r;
1497 }
1498
b54f36c6 1499 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1500}
1501
9df2cdd8
TM
1502static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1503#ifdef SCMP_ACT_LOG
1504 uint32_t default_action, action;
1505#endif
1506
1507 assert(u);
1508 assert(c);
1509
1510 if (!context_has_syscall_logs(c))
1511 return 0;
1512
1513#ifdef SCMP_ACT_LOG
1514 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1515 return 0;
1516
1517 if (c->syscall_log_allow_list) {
1518 /* Log nothing but the ones listed */
1519 default_action = SCMP_ACT_ALLOW;
1520 action = SCMP_ACT_LOG;
1521 } else {
1522 /* Log everything but the ones listed */
1523 default_action = SCMP_ACT_LOG;
1524 action = SCMP_ACT_ALLOW;
1525 }
1526
1527 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1528#else
1529 /* old libseccomp */
1530 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1531 return 0;
1532#endif
1533}
1534
469830d1
LP
1535static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1536 assert(u);
4298d0b5
LP
1537 assert(c);
1538
469830d1 1539 if (set_isempty(c->syscall_archs))
83f12b27
FS
1540 return 0;
1541
469830d1
LP
1542 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1543 return 0;
4298d0b5 1544
469830d1
LP
1545 return seccomp_restrict_archs(c->syscall_archs);
1546}
4298d0b5 1547
469830d1
LP
1548static int apply_address_families(const Unit* u, const ExecContext *c) {
1549 assert(u);
1550 assert(c);
4298d0b5 1551
469830d1
LP
1552 if (!context_has_address_families(c))
1553 return 0;
4298d0b5 1554
469830d1
LP
1555 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1556 return 0;
4298d0b5 1557
6b000af4 1558 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1559}
4298d0b5 1560
83f12b27 1561static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1562 assert(u);
f3e43635
TM
1563 assert(c);
1564
469830d1 1565 if (!c->memory_deny_write_execute)
83f12b27
FS
1566 return 0;
1567
469830d1
LP
1568 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1569 return 0;
f3e43635 1570
469830d1 1571 return seccomp_memory_deny_write_execute();
f3e43635
TM
1572}
1573
83f12b27 1574static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1575 assert(u);
f4170c67
LP
1576 assert(c);
1577
469830d1 1578 if (!c->restrict_realtime)
83f12b27
FS
1579 return 0;
1580
469830d1
LP
1581 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1582 return 0;
f4170c67 1583
469830d1 1584 return seccomp_restrict_realtime();
f4170c67
LP
1585}
1586
f69567cb
LP
1587static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1588 assert(u);
1589 assert(c);
1590
1591 if (!c->restrict_suid_sgid)
1592 return 0;
1593
1594 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1595 return 0;
1596
1597 return seccomp_restrict_suid_sgid();
1598}
1599
59e856c7 1600static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1601 assert(u);
59eeb84b
LP
1602 assert(c);
1603
1604 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1605 * let's protect even those systems where this is left on in the kernel. */
1606
469830d1 1607 if (!c->protect_kernel_tunables)
59eeb84b
LP
1608 return 0;
1609
469830d1
LP
1610 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1611 return 0;
59eeb84b 1612
469830d1 1613 return seccomp_protect_sysctl();
59eeb84b
LP
1614}
1615
59e856c7 1616static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1617 assert(u);
502d704e
DH
1618 assert(c);
1619
25a8d8a0 1620 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1621
469830d1
LP
1622 if (!c->protect_kernel_modules)
1623 return 0;
1624
502d704e
DH
1625 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1626 return 0;
1627
b54f36c6 1628 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1629}
1630
84703040
KK
1631static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1632 assert(u);
1633 assert(c);
1634
1635 if (!c->protect_kernel_logs)
1636 return 0;
1637
1638 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1639 return 0;
1640
1641 return seccomp_protect_syslog();
1642}
1643
daf8f72b 1644static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1645 assert(u);
1646 assert(c);
1647
1648 if (!c->protect_clock)
1649 return 0;
1650
1651 if (skip_seccomp_unavailable(u, "ProtectClock="))
1652 return 0;
1653
1654 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1655}
1656
59e856c7 1657static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1658 assert(u);
ba128bb8
LP
1659 assert(c);
1660
8f81a5f6 1661 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1662
469830d1
LP
1663 if (!c->private_devices)
1664 return 0;
1665
ba128bb8
LP
1666 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1667 return 0;
1668
b54f36c6 1669 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1670}
1671
34cf6c43 1672static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1673 assert(u);
add00535
LP
1674 assert(c);
1675
1676 if (!exec_context_restrict_namespaces_set(c))
1677 return 0;
1678
1679 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1680 return 0;
1681
1682 return seccomp_restrict_namespaces(c->restrict_namespaces);
1683}
1684
78e864e5 1685static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1686 unsigned long personality;
1687 int r;
78e864e5
TM
1688
1689 assert(u);
1690 assert(c);
1691
1692 if (!c->lock_personality)
1693 return 0;
1694
1695 if (skip_seccomp_unavailable(u, "LockPersonality="))
1696 return 0;
1697
e8132d63
LP
1698 personality = c->personality;
1699
1700 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1701 if (personality == PERSONALITY_INVALID) {
1702
1703 r = opinionated_personality(&personality);
1704 if (r < 0)
1705 return r;
1706 }
78e864e5
TM
1707
1708 return seccomp_lock_personality(personality);
1709}
1710
c0467cf3 1711#endif
8351ceae 1712
daf8f72b 1713static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1714 assert(u);
1715 assert(c);
1716
1717 if (!c->protect_hostname)
1718 return 0;
1719
1720 if (ns_type_supported(NAMESPACE_UTS)) {
1721 if (unshare(CLONE_NEWUTS) < 0) {
1722 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1723 *ret_exit_status = EXIT_NAMESPACE;
1724 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1725 }
1726
1727 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1728 }
1729 } else
1730 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1731
1732#if HAVE_SECCOMP
8f3e342f
ZJS
1733 int r;
1734
daf8f72b
LP
1735 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1736 return 0;
1737
1738 r = seccomp_protect_hostname();
1739 if (r < 0) {
1740 *ret_exit_status = EXIT_SECCOMP;
1741 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1742 }
1743#endif
1744
1745 return 0;
1746}
1747
3042bbeb 1748static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1749 assert(idle_pipe);
1750
54eb2300
LP
1751 idle_pipe[1] = safe_close(idle_pipe[1]);
1752 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1753
1754 if (idle_pipe[0] >= 0) {
1755 int r;
1756
1757 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1758
1759 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1760 ssize_t n;
1761
31a7eb86 1762 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1763 n = write(idle_pipe[3], "x", 1);
1764 if (n > 0)
cd972d69 1765 /* Wait for systemd to react to the signal above. */
54756dce 1766 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1767 }
1768
54eb2300 1769 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1770
1771 }
1772
54eb2300 1773 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1774}
1775
fb2042dd
YW
1776static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1777
7cae38c4 1778static int build_environment(
34cf6c43 1779 const Unit *u,
9fa95f85 1780 const ExecContext *c,
1e22b5cd 1781 const ExecParameters *p,
da6053d0 1782 size_t n_fds,
7cae38c4
LP
1783 const char *home,
1784 const char *username,
1785 const char *shell,
7bce046b
LP
1786 dev_t journal_stream_dev,
1787 ino_t journal_stream_ino,
7cae38c4
LP
1788 char ***ret) {
1789
1790 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1791 size_t n_env = 0;
7cae38c4
LP
1792 char *x;
1793
4b58153d 1794 assert(u);
7cae38c4 1795 assert(c);
7c1cb6f1 1796 assert(p);
7cae38c4
LP
1797 assert(ret);
1798
dc4e2940 1799#define N_ENV_VARS 17
8d5bb13d 1800 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1801 if (!our_env)
1802 return -ENOMEM;
1803
1804 if (n_fds > 0) {
8dd4c05b
LP
1805 _cleanup_free_ char *joined = NULL;
1806
df0ff127 1807 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1808 return -ENOMEM;
1809 our_env[n_env++] = x;
1810
da6053d0 1811 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1812 return -ENOMEM;
1813 our_env[n_env++] = x;
8dd4c05b 1814
1e22b5cd 1815 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1816 if (!joined)
1817 return -ENOMEM;
1818
605405c6 1819 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1820 if (!x)
1821 return -ENOMEM;
1822 our_env[n_env++] = x;
7cae38c4
LP
1823 }
1824
b08af3b1 1825 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1826 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1827 return -ENOMEM;
1828 our_env[n_env++] = x;
1829
1e22b5cd 1830 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1831 return -ENOMEM;
1832 our_env[n_env++] = x;
1833 }
1834
fd63e712
LP
1835 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1836 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1837 * check the database directly. */
ac647978 1838 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1839 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1840 if (!x)
1841 return -ENOMEM;
1842 our_env[n_env++] = x;
1843 }
1844
7cae38c4 1845 if (home) {
b910cc72 1846 x = strjoin("HOME=", home);
7cae38c4
LP
1847 if (!x)
1848 return -ENOMEM;
7bbead1d 1849
4ff361cc 1850 path_simplify(x + 5);
7cae38c4
LP
1851 our_env[n_env++] = x;
1852 }
1853
1854 if (username) {
b910cc72 1855 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1856 if (!x)
1857 return -ENOMEM;
1858 our_env[n_env++] = x;
1859
b910cc72 1860 x = strjoin("USER=", username);
7cae38c4
LP
1861 if (!x)
1862 return -ENOMEM;
1863 our_env[n_env++] = x;
1864 }
1865
1866 if (shell) {
b910cc72 1867 x = strjoin("SHELL=", shell);
7cae38c4
LP
1868 if (!x)
1869 return -ENOMEM;
7bbead1d 1870
4ff361cc 1871 path_simplify(x + 6);
7cae38c4
LP
1872 our_env[n_env++] = x;
1873 }
1874
4b58153d
LP
1875 if (!sd_id128_is_null(u->invocation_id)) {
1876 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1877 return -ENOMEM;
1878
1879 our_env[n_env++] = x;
1880 }
1881
6af760f3
LP
1882 if (exec_context_needs_term(c)) {
1883 const char *tty_path, *term = NULL;
1884
1885 tty_path = exec_context_tty_path(c);
1886
e8cf09b2
LP
1887 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1888 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1889 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1890
e8cf09b2 1891 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1892 term = getenv("TERM");
e8cf09b2 1893
6af760f3
LP
1894 if (!term)
1895 term = default_term_for_tty(tty_path);
7cae38c4 1896
b910cc72 1897 x = strjoin("TERM=", term);
7cae38c4
LP
1898 if (!x)
1899 return -ENOMEM;
1900 our_env[n_env++] = x;
1901 }
1902
7bce046b
LP
1903 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1904 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1905 return -ENOMEM;
1906
1907 our_env[n_env++] = x;
1908 }
1909
91dd5f7c
LP
1910 if (c->log_namespace) {
1911 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1912 if (!x)
1913 return -ENOMEM;
1914
1915 our_env[n_env++] = x;
1916 }
1917
5b10116e 1918 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
fb2042dd
YW
1919 _cleanup_free_ char *pre = NULL, *joined = NULL;
1920 const char *n;
1921
1922 if (!p->prefix[t])
1923 continue;
1924
1925 if (strv_isempty(c->directories[t].paths))
1926 continue;
1927
1928 n = exec_directory_env_name_to_string(t);
1929 if (!n)
1930 continue;
1931
1932 pre = strjoin(p->prefix[t], "/");
1933 if (!pre)
1934 return -ENOMEM;
1935
48904c8b 1936 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
fb2042dd
YW
1937 if (!joined)
1938 return -ENOMEM;
1939
1940 x = strjoin(n, "=", joined);
1941 if (!x)
1942 return -ENOMEM;
1943
1944 our_env[n_env++] = x;
1945 }
1946
bb0c0d6f
LP
1947 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1948 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1949 if (!x)
1950 return -ENOMEM;
1951
1952 our_env[n_env++] = x;
1953 }
1954
dc4e2940
YW
1955 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1956 return -ENOMEM;
1957
1958 our_env[n_env++] = x;
1959
7cae38c4 1960 our_env[n_env++] = NULL;
8d5bb13d
LP
1961 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1962#undef N_ENV_VARS
7cae38c4 1963
ae2a15bc 1964 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1965
1966 return 0;
1967}
1968
b4c14404
FB
1969static int build_pass_environment(const ExecContext *c, char ***ret) {
1970 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1971 size_t n_env = 0;
b4c14404
FB
1972 char **i;
1973
1974 STRV_FOREACH(i, c->pass_environment) {
1975 _cleanup_free_ char *x = NULL;
1976 char *v;
1977
1978 v = getenv(*i);
1979 if (!v)
1980 continue;
605405c6 1981 x = strjoin(*i, "=", v);
b4c14404
FB
1982 if (!x)
1983 return -ENOMEM;
00819cc1 1984
319a4f4b 1985 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 1986 return -ENOMEM;
00819cc1 1987
1cc6c93a 1988 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 1989 pass_env[n_env] = NULL;
b4c14404
FB
1990 }
1991
ae2a15bc 1992 *ret = TAKE_PTR(pass_env);
b4c14404
FB
1993
1994 return 0;
1995}
1996
5e8deb94 1997bool exec_needs_mount_namespace(
8b44a3d2
LP
1998 const ExecContext *context,
1999 const ExecParameters *params,
4657abb5 2000 const ExecRuntime *runtime) {
8b44a3d2
LP
2001
2002 assert(context);
8b44a3d2 2003
915e6d16
LP
2004 if (context->root_image)
2005 return true;
2006
2a624c36
AP
2007 if (!strv_isempty(context->read_write_paths) ||
2008 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2009 !strv_isempty(context->inaccessible_paths) ||
2010 !strv_isempty(context->exec_paths) ||
2011 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2012 return true;
2013
42b1d8e0 2014 if (context->n_bind_mounts > 0)
d2d6c096
LP
2015 return true;
2016
2abd4e38
YW
2017 if (context->n_temporary_filesystems > 0)
2018 return true;
2019
b3d13314
LB
2020 if (context->n_mount_images > 0)
2021 return true;
2022
93f59701
LB
2023 if (context->n_extension_images > 0)
2024 return true;
2025
37ed15d7 2026 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2027 return true;
2028
2029 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2030 return true;
2031
8b44a3d2 2032 if (context->private_devices ||
228af36f 2033 context->private_mounts ||
8b44a3d2 2034 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2035 context->protect_home != PROTECT_HOME_NO ||
2036 context->protect_kernel_tunables ||
c575770b 2037 context->protect_kernel_modules ||
94a7b275 2038 context->protect_kernel_logs ||
4e399953
LP
2039 context->protect_control_groups ||
2040 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2041 context->proc_subset != PROC_SUBSET_ALL ||
2042 context->private_ipc ||
2043 context->ipc_namespace_path)
8b44a3d2
LP
2044 return true;
2045
37c56f89 2046 if (context->root_directory) {
5e98086d 2047 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2048 return true;
2049
5b10116e 2050 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2051 if (params && !params->prefix[t])
37c56f89
YW
2052 continue;
2053
2054 if (!strv_isempty(context->directories[t].paths))
2055 return true;
2056 }
2057 }
5d997827 2058
42b1d8e0 2059 if (context->dynamic_user &&
b43ee82f 2060 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
42b1d8e0
YW
2061 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2062 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2063 return true;
2064
91dd5f7c
LP
2065 if (context->log_namespace)
2066 return true;
2067
8b44a3d2
LP
2068 return false;
2069}
2070
static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
        _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
        _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
        _cleanup_close_ int unshare_ready_fd = -1;
        _cleanup_(sigkill_waitp) pid_t pid = 0;
        uint64_t c = 1; /* Value written to (and read back from) the eventfd used as "namespace is ready" signal */
        ssize_t n;
        int r;

        /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
         * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
         * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
         * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
         * which waits for the parent to create the new user namespace while staying in the original namespace. The
         * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
         * continues execution normally.
         * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
         * does not need CAP_SETUID to write the single line mapping to itself.
         *
         * Parameters: ouid/ogid are the original IDs, uid/gid the selected ones.
         *
         * Returns 0 on success and a negative errno-style value on failure. If the helper child fails, the error
         * code it sent through the pipe is returned; -EIO is returned if the child reported something that is not
         * a negative error code, sent a short message, or exited with a status other than EXIT_SUCCESS. */

        /* Can only set up multiple mappings with CAP_SETUID. */
        if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n"     /* Map $OUID → $OUID */
                             UID_FMT " " UID_FMT " 1\n",    /* Map $UID → $UID */
                             ouid, ouid, uid, uid);
        else
                r = asprintf(&uid_map,
                             UID_FMT " " UID_FMT " 1\n",    /* Map $OUID → $OUID */
                             ouid, ouid);

        if (r < 0)
                return -ENOMEM;

        /* Can only set up multiple mappings with CAP_SETGID. */
        if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n"     /* Map $OGID → $OGID */
                             GID_FMT " " GID_FMT " 1\n",    /* Map $GID → $GID */
                             ogid, ogid, gid, gid);
        else
                r = asprintf(&gid_map,
                             GID_FMT " " GID_FMT " 1\n",    /* Map $OGID → $OGID */
                             ogid, ogid);

        if (r < 0)
                return -ENOMEM;

        /* Create a communication channel so that the parent can tell the child when it finished creating the user
         * namespace. */
        unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
        if (unshare_ready_fd < 0)
                return -errno;

        /* Create a communication channel so that the child can tell the parent a proper error code in case it
         * failed. */
        if (pipe2(errno_pipe, O_CLOEXEC) < 0)
                return -errno;

        r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
        if (r < 0)
                return r;
        if (r == 0) {
                _cleanup_close_ int fd = -1;
                const char *a;
                pid_t ppid;

                /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
                 * here, after the parent opened its own user namespace. */

                ppid = getppid();
                errno_pipe[0] = safe_close(errno_pipe[0]);

                /* Wait until the parent unshared the user namespace */
                if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Disable the setgroups() system call in the child user namespace, for good. */
                a = procfs_file_alloca(ppid, "setgroups");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        if (errno != ENOENT) {
                                r = -errno;
                                goto child_fail;
                        }

                        /* If the file is missing the kernel is too old, let's continue anyway. */
                } else {
                        if (write(fd, "deny\n", 5) < 0) {
                                r = -errno;
                                goto child_fail;
                        }

                        fd = safe_close(fd);
                }

                /* First write the GID map */
                a = procfs_file_alloca(ppid, "gid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, gid_map, strlen(gid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }
                fd = safe_close(fd);

                /* Then write the UID map */
                a = procfs_file_alloca(ppid, "uid_map");
                fd = open(a, O_WRONLY|O_CLOEXEC);
                if (fd < 0) {
                        r = -errno;
                        goto child_fail;
                }
                if (write(fd, uid_map, strlen(uid_map)) < 0) {
                        r = -errno;
                        goto child_fail;
                }

                /* Success: exit without writing anything to the error pipe, so that the parent reads EOF. */
                _exit(EXIT_SUCCESS);

        child_fail:
                /* Report the negative error code to the parent; nothing more we can do if this write fails. */
                (void) write(errno_pipe[1], &r, sizeof(r));
                _exit(EXIT_FAILURE);
        }

        /* Parent: close our copy of the write end, so that reading from the pipe returns EOF once the child is
         * gone without having written anything. */
        errno_pipe[1] = safe_close(errno_pipe[1]);

        if (unshare(CLONE_NEWUSER) < 0)
                return -errno;

        /* Let the child know that the namespace is ready now */
        if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
                return -errno;

        /* Try to read an error code from the child */
        n = read(errno_pipe[0], &r, sizeof(r));
        if (n < 0)
                return -errno;
        if (n == sizeof(r)) { /* an error code was sent to us */
                if (r < 0)
                        return r;
                return -EIO;
        }
        if (n != 0) /* on success we should have read 0 bytes */
                return -EIO;

        r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
        pid = 0; /* The child has been waited for; reset so that the sigkill_waitp cleanup handler sees no PID anymore */
        if (r < 0)
                return r;
        if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
                return -EIO;

        return 0;
}
2230
494d0247
YW
2231static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2232 if (!context->dynamic_user)
2233 return false;
2234
2235 if (type == EXEC_DIRECTORY_CONFIGURATION)
2236 return false;
2237
2238 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2239 return false;
2240
2241 return true;
2242}
2243
static int setup_exec_directory(
                const ExecContext *context,
                const ExecParameters *params,
                uid_t uid,
                gid_t gid,
                ExecDirectoryType type,
                int *exit_status) {

        /* Creates (or migrates) all directories configured for the given directory type below
         * params->prefix[type], and adjusts their access mode and ownership.
         *
         * Returns 0 on success (also if no prefix is set for this type, in which case nothing is done).
         * On failure a negative errno-style value is returned and *exit_status is set to the exit status
         * matching the directory type. */

        /* Exit status to report in *exit_status on failure, indexed by directory type */
        static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
                [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
                [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
                [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
                [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
                [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
        };
        char **rt;
        int r;

        assert(context);
        assert(params);
        assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
        assert(exit_status);

        if (!params->prefix[type])
                return 0;

        /* If we are asked to chown the directories, fall back to UID/GID 0 for whichever ID is not valid */
        if (params->flags & EXEC_CHOWN_DIRECTORIES) {
                if (!uid_is_valid(uid))
                        uid = 0;
                if (!gid_is_valid(gid))
                        gid = 0;
        }

        STRV_FOREACH(rt, context->directories[type].paths) {
                /* p is the public path below the prefix; pp, if set, is the path below "private/" */
                _cleanup_free_ char *p = NULL, *pp = NULL;

                p = path_join(params->prefix[type], *rt);
                if (!p) {
                        r = -ENOMEM;
                        goto fail;
                }

                r = mkdir_parents_label(p, 0755);
                if (r < 0)
                        goto fail;

                if (exec_directory_is_private(context, type)) {
                        /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
                         * case we want to avoid leaving a directory around fully accessible that is owned by
                         * a dynamic user whose UID is later on reused. To lock this down we use the same
                         * trick used by container managers to prohibit host users to get access to files of
                         * the same UID in containers: we place everything inside a directory that has an
                         * access mode of 0700 and is owned root:root, so that it acts as security boundary
                         * for unprivileged host code. We then use fs namespacing to make this directory
                         * permeable for the service itself.
                         *
                         * Specifically: for a service which wants a special directory "foo/" we first create
                         * a directory "private/" with access mode 0700 owned by root:root. Then we place
                         * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
                         * "private/foo". This way, privileged host users can access "foo/" as usual, but
                         * unprivileged host users can't look into it. Inside of the namespace of the unit
                         * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
                         * "private/foo/" is mounted under the same name, thus disabling the access boundary
                         * for the service and making sure it only gets access to the dirs it needs but no
                         * others. Tricky? Yes, absolutely, but it works!
                         *
                         * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
                         * to be owned by the service itself.
                         *
                         * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME if the runtime
                         * directory is not preserved (EXEC_PRESERVE_NO), as that's often used for sharing
                         * files or sockets with other services. See exec_directory_is_private(). */

                        pp = path_join(params->prefix[type], "private");
                        if (!pp) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
                        r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
                        if (r < 0)
                                goto fail;

                        /* From here on pp refers to the per-directory path below the private root */
                        if (!path_extend(&pp, *rt)) {
                                r = -ENOMEM;
                                goto fail;
                        }

                        /* Create all directories between the configured directory and this private root, and mark them 0755 */
                        r = mkdir_parents_label(pp, 0755);
                        if (r < 0)
                                goto fail;

                        if (is_dir(p, false) > 0 &&
                            (laccess(pp, F_OK) < 0 && errno == ENOENT)) {

                                /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
                                 * it over. Most likely the service has been upgraded from one that didn't use
                                 * DynamicUser=1, to one that does. */

                                log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
                                         "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
                                         exec_directory_type_to_string(type), p, pp);

                                if (rename(p, pp) < 0) {
                                        r = -errno;
                                        goto fail;
                                }
                        } else {
                                /* Otherwise, create the actual directory for the service */

                                r = mkdir_label(pp, context->directories[type].mode);
                                if (r < 0 && r != -EEXIST)
                                        goto fail;
                        }

                        /* And link it up from the original place */
                        r = symlink_idempotent(pp, p, true);
                        if (r < 0)
                                goto fail;

                } else {
                        _cleanup_free_ char *target = NULL;

                        if (type != EXEC_DIRECTORY_CONFIGURATION &&
                            readlink_and_make_absolute(p, &target) >= 0) {
                                _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;

                                /* This already exists and is a symlink? Interesting. Maybe it's one created
                                 * by DynamicUser=1 (see above)?
                                 *
                                 * We do this for all directory types except for ConfigurationDirectory=,
                                 * since they all support the private/ symlink logic at least in some
                                 * configurations, see above. */

                                r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                /* q is where the private/ scheme would have placed this directory */
                                q = path_join(params->prefix[type], "private", *rt);
                                if (!q) {
                                        r = -ENOMEM;
                                        goto fail;
                                }

                                /* /var/lib or friends may be symlinks. So, let's chase them also. */
                                r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
                                if (r < 0)
                                        goto fail;

                                if (path_equal(q_resolved, target_resolved)) {

                                        /* Hmm, apparently DynamicUser= was once turned on for this service,
                                         * but is no longer. Let's move the directory back up. */

                                        log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
                                                 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
                                                 exec_directory_type_to_string(type), q, p);

                                        /* Remove the symlink first, then move the real directory into its place */
                                        if (unlink(p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        if (rename(q, p) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }
                                }
                        }

                        r = mkdir_label(p, context->directories[type].mode);
                        if (r < 0) {
                                if (r != -EEXIST)
                                        goto fail;

                                if (type == EXEC_DIRECTORY_CONFIGURATION) {
                                        struct stat st;

                                        /* Don't change the owner/access mode of the configuration directory,
                                         * as in the common case it is not written to by a service, and shall
                                         * not be writable. */

                                        if (stat(p, &st) < 0) {
                                                r = -errno;
                                                goto fail;
                                        }

                                        /* Still complain if the access mode doesn't match */
                                        if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
                                                log_warning("%s \'%s\' already exists but the mode is different. "
                                                            "(File system: %o %sMode: %o)",
                                                            exec_directory_type_to_string(type), *rt,
                                                            st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);

                                        /* Skip the chmod/chown steps below for a pre-existing configuration directory */
                                        continue;
                                }
                        }
                }

                /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
                 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
                 * current UID/GID ownership.) */
                r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
                if (r < 0)
                        goto fail;

                /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
                 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
                 * assignments to exist. */
                r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
                if (r < 0)
                        goto fail;
        }

        return 0;

fail:
        /* Tell the caller which exit status corresponds to a failure for this directory type */
        *exit_status = exit_status_table[type];
        return r;
}
2465
bb0c0d6f
LP
2466static int write_credential(
2467 int dfd,
2468 const char *id,
2469 const void *data,
2470 size_t size,
2471 uid_t uid,
2472 bool ownership_ok) {
2473
2474 _cleanup_(unlink_and_freep) char *tmp = NULL;
2475 _cleanup_close_ int fd = -1;
2476 int r;
2477
2478 r = tempfn_random_child("", "cred", &tmp);
2479 if (r < 0)
2480 return r;
2481
2482 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2483 if (fd < 0) {
2484 tmp = mfree(tmp);
2485 return -errno;
2486 }
2487
2488 r = loop_write(fd, data, size, /* do_pool = */ false);
2489 if (r < 0)
2490 return r;
2491
2492 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2493 return -errno;
2494
2495 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2496 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2497 if (r < 0) {
2498 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2499 return r;
2500
2501 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2502 * to express: that the user gets read access and nothing
2503 * else. But if the backing fs can't support that (e.g. ramfs)
2504 * then we can use file ownership instead. But that's only safe if
2505 * we can then re-mount the whole thing read-only, so that the
2506 * user can no longer chmod() the file to gain write access. */
2507 return r;
2508
f5fbe71d 2509 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2510 return -errno;
2511 }
2512 }
2513
2514 if (renameat(dfd, tmp, dfd, id) < 0)
2515 return -errno;
2516
2517 tmp = mfree(tmp);
2518 return 0;
2519}
2520
/* Upper bound for the credentials passed to a single unit: refuse to pass more than 1M, after all this is
 * unswappable memory. acquire_credentials() charges the length of each credential's name plus the size of
 * its data against this budget, and the value is also used as size= limit of the tmpfs fallback mount. */
#define CREDENTIALS_BYTES_MAX (1024LU * 1024LU)
2522
2523static int acquire_credentials(
2524 const ExecContext *context,
2525 const ExecParameters *params,
d3dcf4e3 2526 const char *unit,
bb0c0d6f
LP
2527 const char *p,
2528 uid_t uid,
2529 bool ownership_ok) {
2530
2531 uint64_t left = CREDENTIALS_BYTES_MAX;
2532 _cleanup_close_ int dfd = -1;
2533 ExecSetCredential *sc;
2534 char **id, **fn;
bb0c0d6f
LP
2535 int r;
2536
2537 assert(context);
2538 assert(p);
2539
2540 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2541 if (dfd < 0)
2542 return -errno;
2543
69e3234d 2544 /* First we use the literally specified credentials. Note that they might be overridden again below,
bb0c0d6f 2545 * and thus act as a "default" if the same credential is specified multiple times */
90e74a66 2546 HASHMAP_FOREACH(sc, context->set_credentials) {
bb0c0d6f
LP
2547 size_t add;
2548
2549 add = strlen(sc->id) + sc->size;
2550 if (add > left)
2551 return -E2BIG;
2552
2553 r = write_credential(dfd, sc->id, sc->data, sc->size, uid, ownership_ok);
2554 if (r < 0)
2555 return r;
2556
2557 left -= add;
2558 }
2559
2560 /* Then, load credential off disk (or acquire via AF_UNIX socket) */
2561 STRV_FOREACH_PAIR(id, fn, context->load_credentials) {
2562 ReadFullFileFlags flags = READ_FULL_FILE_SECURE;
2563 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2564 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2565 bool missing_ok = true;
bb0c0d6f
LP
2566 const char *source;
2567 size_t size, add;
2568
2569 if (path_is_absolute(*fn)) {
2570 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
2571 source = *fn;
2572 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2573
2574 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2575 * via the source socket address in case we read off an AF_UNIX socket. */
2576 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, *id) < 0)
2577 return -ENOMEM;
2578
fc682be2
LP
2579 missing_ok = false;
2580
bb0c0d6f
LP
2581 } else if (params->received_credentials) {
2582 /* If this is a relative path, take it relative to the credentials we received
2583 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2584 * on a credential store, i.e. this is guaranteed to be regular files. */
2585 j = path_join(params->received_credentials, *fn);
2586 if (!j)
2587 return -ENOMEM;
2588
2589 source = j;
2590 } else
2591 source = NULL;
2592
2593 if (source)
986311c2 2594 r = read_full_file_full(AT_FDCWD, source, UINT64_MAX, SIZE_MAX, flags, bindname, &data, &size);
bb0c0d6f
LP
2595 else
2596 r = -ENOENT;
fc682be2
LP
2597 if (r == -ENOENT && (missing_ok || faccessat(dfd, *id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)) {
2598 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2599 * will get clear errors if we don't pass such a missing credential on as they
2600 * themselves will get ENOENT when trying to read them, which should not be much
2601 * worse than when we handle the error here and make it fatal.
2602 *
2603 * Also, if the source file doesn't exist, but we already acquired the key otherwise,
2604 * then don't fail either. */
2605 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", *fn);
bb0c0d6f 2606 continue;
fc682be2 2607 }
bb0c0d6f 2608 if (r < 0)
fc682be2 2609 return log_debug_errno(r, "Failed to read credential '%s': %m", *fn);
bb0c0d6f
LP
2610
2611 add = strlen(*id) + size;
2612 if (add > left)
2613 return -E2BIG;
2614
2615 r = write_credential(dfd, *id, data, size, uid, ownership_ok);
2616 if (r < 0)
2617 return r;
2618
2619 left -= add;
2620 }
2621
2622 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2623 return -errno;
2624
2625 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2626 * accessible */
2627
2628 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2629 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2630 if (r < 0) {
2631 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2632 return r;
2633
2634 if (!ownership_ok)
2635 return r;
2636
f5fbe71d 2637 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2638 return -errno;
2639 }
2640 }
2641
2642 return 0;
2643}
2644
static int setup_credentials_internal(
                const ExecContext *context,
                const ExecParameters *params,
                const char *unit,
                const char *final,        /* This is where the credential store shall eventually end up at */
                const char *workspace,    /* This is where we can prepare it before moving it to the final place */
                bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
                bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
                uid_t uid) {

        /* Prepares the credential store in 'workspace' (mounting a file system there if possible), fills
         * it via acquire_credentials(), and finally makes it available read-only at 'final'. Returns 0 on
         * success, a negative errno-style value on failure. */

        int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
                                   * if we mounted something; false if we definitely can't mount anything */
        bool final_mounted;
        const char *where;

        assert(context);
        assert(final);
        assert(workspace);

        if (reuse_workspace) {
                r = path_is_mount_point(workspace, NULL, 0);
                if (r < 0)
                        return r;
                if (r > 0)
                        workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
                else
                        workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
        } else
                workspace_mounted = -1; /* ditto */

        r = path_is_mount_point(final, NULL, 0);
        if (r < 0)
                return r;
        if (r > 0) {
                /* If the final place already has something mounted, we use that. If the workspace also has
                 * something mounted we assume it's actually the same mount (but with MS_RDONLY
                 * different). */
                final_mounted = true;

                if (workspace_mounted < 0) {
                        /* If the final place is mounted, but the workspace isn't, then let's bind mount
                         * the final version to the workspace, and make it writable, so that we can make
                         * changes */

                        r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
                        if (r < 0)
                                return r;

                        r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
                        if (r < 0)
                                return r;

                        workspace_mounted = true;
                }
        } else
                final_mounted = false;

        if (workspace_mounted < 0) {
                /* Nothing is mounted on the workspace yet, let's try to mount something now. Each iteration
                 * tries one option, in order of preference; the loop is left as soon as the state of the
                 * workspace is settled. */
                for (int try = 0;; try++) {

                        if (try == 0) {
                                /* Try "ramfs" first, since it's not swap backed */
                                r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
                                if (r >= 0) {
                                        workspace_mounted = true;
                                        break;
                                }

                        } else if (try == 1) {
                                _cleanup_free_ char *opts = NULL;

                                if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%lu", CREDENTIALS_BYTES_MAX) < 0)
                                        return -ENOMEM;

                                /* Fall back to "tmpfs" otherwise */
                                r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
                                if (r >= 0) {
                                        workspace_mounted = true;
                                        break;
                                }

                        } else {
                                /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
                                r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
                                if (r < 0) {
                                        if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
                                                return r;

                                        if (must_mount) /* If it's not OK to use the plain directory
                                                         * fallback, propagate all errors too */
                                                return r;

                                        /* If we lack privileges to bind mount stuff, then let's gracefully
                                         * proceed for compat with container envs, and just use the final dir
                                         * as is. */

                                        workspace_mounted = false;
                                        break;
                                }

                                /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
                                r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
                                if (r < 0)
                                        return r;

                                workspace_mounted = true;
                                break;
                        }
                }
        }

        /* At this point workspace_mounted is either true or false, never negative anymore */
        assert(!must_mount || workspace_mounted > 0);
        where = workspace_mounted ? workspace : final;

        /* Only if we have our own mount, which is made read-only below, it is OK for acquire_credentials()
         * to fall back to file ownership, hence pass workspace_mounted as its ownership_ok argument. */
        r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
        if (r < 0)
                return r;

        if (workspace_mounted) {
                /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
                r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
                if (r < 0)
                        return r;

                /* And mount it to the final place, read-only. If the final place already was a mount, the
                 * workspace is only detached again; otherwise the workspace mount is moved over. */
                if (final_mounted)
                        r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
                else
                        r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
                if (r < 0)
                        return r;
        } else {
                _cleanup_free_ char *parent = NULL;

                /* If we do not have our own mount but used the plain directory fallback, then we need to
                 * open up access to the parent directory of the final directory now */

                parent = dirname_malloc(final);
                if (!parent)
                        return -ENOMEM;
                if (chmod(parent, 0755) < 0)
                        return -errno;
        }

        return 0;
}
2792
2793static int setup_credentials(
2794 const ExecContext *context,
2795 const ExecParameters *params,
2796 const char *unit,
2797 uid_t uid) {
2798
2799 _cleanup_free_ char *p = NULL, *q = NULL;
2800 const char *i;
2801 int r;
2802
2803 assert(context);
2804 assert(params);
2805
2806 if (!exec_context_has_credentials(context))
2807 return 0;
2808
2809 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2810 return -EINVAL;
2811
2812 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2813 * and the subdir we mount over with a read-only file system readable by the service's user */
2814 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2815 if (!q)
2816 return -ENOMEM;
2817
2818 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2819 if (r < 0 && r != -EEXIST)
2820 return r;
2821
2822 p = path_join(q, unit);
2823 if (!p)
2824 return -ENOMEM;
2825
2826 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2827 if (r < 0 && r != -EEXIST)
2828 return r;
2829
2830 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2831 if (r < 0) {
2832 _cleanup_free_ char *t = NULL, *u = NULL;
2833
2834 /* If this is not a privilege or support issue then propagate the error */
2835 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2836 return r;
2837
2838 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2839 * it into place, so that users can't access half-initialized credential stores. */
2840 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2841 if (!t)
2842 return -ENOMEM;
2843
2844 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2845 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2846 * after it is fully set up */
2847 u = path_join(t, unit);
2848 if (!u)
2849 return -ENOMEM;
2850
2851 FOREACH_STRING(i, t, u) {
2852 r = mkdir_label(i, 0700);
2853 if (r < 0 && r != -EEXIST)
2854 return r;
2855 }
2856
2857 r = setup_credentials_internal(
2858 context,
2859 params,
d3dcf4e3 2860 unit,
bb0c0d6f
LP
2861 p, /* final mount point */
2862 u, /* temporary workspace to overmount */
2863 true, /* reuse the workspace if it is already a mount */
2864 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2865 uid);
2866
2867 (void) rmdir(u); /* remove the workspace again if we can. */
2868
2869 if (r < 0)
2870 return r;
2871
2872 } else if (r == 0) {
2873
2874 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2875 * we can use the same directory for all cases, after turning off propagation. Question
2876 * though is: where do we turn off propagation exactly, and where do we place the workspace
2877 * directory? We need some place that is guaranteed to be a mount point in the host, and
2878 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2879 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2880 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2881 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2882 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2883 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2884 * propagation on the former, and then overmount the latter.
2885 *
2886 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2887 * for this purpose, but there are few other candidates that work equally well for us, and
2888 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 2889 * that no one else sees this should be OK to do. */
bb0c0d6f 2890
21935150
LP
2891 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2892 if (r < 0)
bb0c0d6f
LP
2893 goto child_fail;
2894
2895 r = setup_credentials_internal(
2896 context,
2897 params,
d3dcf4e3 2898 unit,
bb0c0d6f
LP
2899 p, /* final mount point */
2900 "/dev/shm", /* temporary workspace to overmount */
2901 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2902 true, /* insist that something is mounted, do not allow fallback to plain directory */
2903 uid);
2904 if (r < 0)
2905 goto child_fail;
2906
2907 _exit(EXIT_SUCCESS);
2908
2909 child_fail:
2910 _exit(EXIT_FAILURE);
2911 }
2912
2913 return 0;
2914}
2915
#if ENABLE_SMACK
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

        /* Applies the SMACK label to the calling process: the label configured in the context if there is
         * one, otherwise (only if a default process label is compiled in) the exec label read off the
         * executable, or that default. Returns 0 on success, a negative errno-style value on failure. */

        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;

                return 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *from_executable = NULL;

                /* An executable that carries no label, or a file system that doesn't support labels, is
                 * not an error: the default label is used then. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &from_executable);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, from_executable ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
cefc33ae 2947
6c47cd7d
LP
2948static int compile_bind_mounts(
2949 const ExecContext *context,
2950 const ExecParameters *params,
2951 BindMount **ret_bind_mounts,
da6053d0 2952 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2953 char ***ret_empty_directories) {
2954
2955 _cleanup_strv_free_ char **empty_directories = NULL;
2956 BindMount *bind_mounts;
5b10116e 2957 size_t n, h = 0;
6c47cd7d
LP
2958 int r;
2959
2960 assert(context);
2961 assert(params);
2962 assert(ret_bind_mounts);
2963 assert(ret_n_bind_mounts);
2964 assert(ret_empty_directories);
2965
2966 n = context->n_bind_mounts;
5b10116e 2967 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
2968 if (!params->prefix[t])
2969 continue;
2970
2971 n += strv_length(context->directories[t].paths);
2972 }
2973
2974 if (n <= 0) {
2975 *ret_bind_mounts = NULL;
2976 *ret_n_bind_mounts = 0;
2977 *ret_empty_directories = NULL;
2978 return 0;
2979 }
2980
2981 bind_mounts = new(BindMount, n);
2982 if (!bind_mounts)
2983 return -ENOMEM;
2984
5b10116e 2985 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
2986 BindMount *item = context->bind_mounts + i;
2987 char *s, *d;
2988
2989 s = strdup(item->source);
2990 if (!s) {
2991 r = -ENOMEM;
2992 goto finish;
2993 }
2994
2995 d = strdup(item->destination);
2996 if (!d) {
2997 free(s);
2998 r = -ENOMEM;
2999 goto finish;
3000 }
3001
3002 bind_mounts[h++] = (BindMount) {
3003 .source = s,
3004 .destination = d,
3005 .read_only = item->read_only,
3006 .recursive = item->recursive,
3007 .ignore_enoent = item->ignore_enoent,
3008 };
3009 }
3010
5b10116e 3011 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3012 char **suffix;
3013
3014 if (!params->prefix[t])
3015 continue;
3016
3017 if (strv_isempty(context->directories[t].paths))
3018 continue;
3019
494d0247 3020 if (exec_directory_is_private(context, t) &&
74e12520 3021 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3022 char *private_root;
3023
3024 /* So this is for a dynamic user, and we need to make sure the process can access its own
3025 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3026 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3027
657ee2d8 3028 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3029 if (!private_root) {
3030 r = -ENOMEM;
3031 goto finish;
3032 }
3033
3034 r = strv_consume(&empty_directories, private_root);
a635a7ae 3035 if (r < 0)
6c47cd7d 3036 goto finish;
6c47cd7d
LP
3037 }
3038
3039 STRV_FOREACH(suffix, context->directories[t].paths) {
3040 char *s, *d;
3041
494d0247 3042 if (exec_directory_is_private(context, t))
657ee2d8 3043 s = path_join(params->prefix[t], "private", *suffix);
6c47cd7d 3044 else
657ee2d8 3045 s = path_join(params->prefix[t], *suffix);
6c47cd7d
LP
3046 if (!s) {
3047 r = -ENOMEM;
3048 goto finish;
3049 }
3050
494d0247 3051 if (exec_directory_is_private(context, t) &&
74e12520 3052 exec_context_with_rootfs(context))
5609f688
YW
3053 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3054 * directory is not created on the root directory. So, let's bind-mount the directory
3055 * on the 'non-private' place. */
657ee2d8 3056 d = path_join(params->prefix[t], *suffix);
5609f688
YW
3057 else
3058 d = strdup(s);
6c47cd7d
LP
3059 if (!d) {
3060 free(s);
3061 r = -ENOMEM;
3062 goto finish;
3063 }
3064
3065 bind_mounts[h++] = (BindMount) {
3066 .source = s,
3067 .destination = d,
3068 .read_only = false,
9ce4e4b0 3069 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3070 .recursive = true,
3071 .ignore_enoent = false,
3072 };
3073 }
3074 }
3075
3076 assert(h == n);
3077
3078 *ret_bind_mounts = bind_mounts;
3079 *ret_n_bind_mounts = n;
ae2a15bc 3080 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3081
3082 return (int) n;
3083
3084finish:
3085 bind_mount_free_many(bind_mounts, h);
3086 return r;
3087}
3088
4e677599
LP
3089static bool insist_on_sandboxing(
3090 const ExecContext *context,
3091 const char *root_dir,
3092 const char *root_image,
3093 const BindMount *bind_mounts,
3094 size_t n_bind_mounts) {
3095
4e677599
LP
3096 assert(context);
3097 assert(n_bind_mounts == 0 || bind_mounts);
3098
3099 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3100 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3101 * rearrange stuff in a way we cannot ignore gracefully. */
3102
3103 if (context->n_temporary_filesystems > 0)
3104 return true;
3105
3106 if (root_dir || root_image)
3107 return true;
3108
b3d13314
LB
3109 if (context->n_mount_images > 0)
3110 return true;
3111
4e677599
LP
3112 if (context->dynamic_user)
3113 return true;
3114
3115 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3116 * essential. */
5b10116e 3117 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3118 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3119 return true;
3120
91dd5f7c
LP
3121 if (context->log_namespace)
3122 return true;
3123
4e677599
LP
3124 return false;
3125}
3126
6818c54c 3127static int apply_mount_namespace(
34cf6c43 3128 const Unit *u,
9f71ba8d 3129 ExecCommandFlags command_flags,
6818c54c
LP
3130 const ExecContext *context,
3131 const ExecParameters *params,
7cc5ef5f
ZJS
3132 const ExecRuntime *runtime,
3133 char **error_path) {
6818c54c 3134
7bcef4ef 3135 _cleanup_strv_free_ char **empty_directories = NULL;
56a13a49 3136 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3137 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3138 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3139 NamespaceInfo ns_info;
165a31c0 3140 bool needs_sandboxing;
6c47cd7d 3141 BindMount *bind_mounts = NULL;
da6053d0 3142 size_t n_bind_mounts = 0;
6818c54c 3143 int r;
93c6bb51 3144
2b3c1b9e
DH
3145 assert(context);
3146
915e6d16
LP
3147 if (params->flags & EXEC_APPLY_CHROOT) {
3148 root_image = context->root_image;
3149
3150 if (!root_image)
3151 root_dir = context->root_directory;
3152 }
93c6bb51 3153
6c47cd7d
LP
3154 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3155 if (r < 0)
3156 return r;
3157
9f71ba8d 3158 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3159 if (needs_sandboxing) {
3160 /* The runtime struct only contains the parent of the private /tmp,
3161 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3162 * that is sticky, and that's the one we want to use here.
3163 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3164
3165 if (context->private_tmp && runtime) {
56a13a49
ZJS
3166 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3167 tmp_dir = runtime->tmp_dir;
3168 else if (runtime->tmp_dir)
3169 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3170
3171 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3172 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3173 else if (runtime->var_tmp_dir)
56a13a49 3174 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3175 }
3176
b5a33299
YW
3177 ns_info = (NamespaceInfo) {
3178 .ignore_protect_paths = false,
3179 .private_dev = context->private_devices,
3180 .protect_control_groups = context->protect_control_groups,
3181 .protect_kernel_tunables = context->protect_kernel_tunables,
3182 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3183 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3184 .protect_hostname = context->protect_hostname,
5e98086d 3185 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3186 .private_mounts = context->private_mounts,
52b3d652
LP
3187 .protect_home = context->protect_home,
3188 .protect_system = context->protect_system,
4e399953
LP
3189 .protect_proc = context->protect_proc,
3190 .proc_subset = context->proc_subset,
80271a44 3191 .private_ipc = context->private_ipc || context->ipc_namespace_path,
b5a33299 3192 };
ecf63c91 3193 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3194 /*
3195 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3196 * sandbox info, otherwise enforce it, don't ignore protected paths and
3197 * fail if we are enable to apply the sandbox inside the mount namespace.
3198 */
3199 ns_info = (NamespaceInfo) {
3200 .ignore_protect_paths = true,
3201 };
3202 else
3203 ns_info = (NamespaceInfo) {};
b5a33299 3204
37ed15d7
FB
3205 if (context->mount_flags == MS_SHARED)
3206 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3207
a631cbfa
LP
3208 if (exec_context_has_credentials(context) &&
3209 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3210 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3211 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3212 if (!creds_path) {
3213 r = -ENOMEM;
3214 goto finalize;
3215 }
bbb4e7f3
LP
3216 }
3217
5e8deb94
LB
3218 if (MANAGER_IS_SYSTEM(u->manager)) {
3219 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3220 if (!propagate_dir) {
3221 r = -ENOMEM;
3222 goto finalize;
3223 }
3224
5e8deb94 3225 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3226 if (!incoming_dir) {
3227 r = -ENOMEM;
3228 goto finalize;
3229 }
5e8deb94
LB
3230 }
3231
18d73705 3232 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3233 &ns_info, context->read_write_paths,
165a31c0
LP
3234 needs_sandboxing ? context->read_only_paths : NULL,
3235 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3236 needs_sandboxing ? context->exec_paths : NULL,
3237 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d
LP
3238 empty_directories,
3239 bind_mounts,
3240 n_bind_mounts,
2abd4e38
YW
3241 context->temporary_filesystems,
3242 context->n_temporary_filesystems,
b3d13314
LB
3243 context->mount_images,
3244 context->n_mount_images,
56a13a49
ZJS
3245 tmp_dir,
3246 var_tmp_dir,
bbb4e7f3 3247 creds_path,
91dd5f7c 3248 context->log_namespace,
915e6d16 3249 context->mount_flags,
d4d55b0d
LB
3250 context->root_hash, context->root_hash_size, context->root_hash_path,
3251 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3252 context->root_verity,
93f59701
LB
3253 context->extension_images,
3254 context->n_extension_images,
5e8deb94
LB
3255 propagate_dir,
3256 incoming_dir,
3bdc25a4 3257 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3258 error_path);
93c6bb51 3259
1beab8b0 3260 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3261 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3262 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3263 * completely different execution environment. */
aca835ed 3264 if (r == -ENOANO) {
4e677599
LP
3265 if (insist_on_sandboxing(
3266 context,
3267 root_dir, root_image,
3268 bind_mounts,
3269 n_bind_mounts)) {
3270 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3271 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3272 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3273
3274 r = -EOPNOTSUPP;
3275 } else {
aca835ed 3276 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3277 r = 0;
aca835ed 3278 }
93c6bb51
DH
3279 }
3280
8062e643 3281finalize:
4e677599 3282 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3283 return r;
3284}
3285
915e6d16
LP
3286static int apply_working_directory(
3287 const ExecContext *context,
3288 const ExecParameters *params,
3289 const char *home,
376fecf6 3290 int *exit_status) {
915e6d16 3291
6732edab 3292 const char *d, *wd;
2b3c1b9e
DH
3293
3294 assert(context);
376fecf6 3295 assert(exit_status);
2b3c1b9e 3296
6732edab
LP
3297 if (context->working_directory_home) {
3298
376fecf6
LP
3299 if (!home) {
3300 *exit_status = EXIT_CHDIR;
6732edab 3301 return -ENXIO;
376fecf6 3302 }
6732edab 3303
2b3c1b9e 3304 wd = home;
6732edab 3305
14eb3285
LP
3306 } else
3307 wd = empty_to_root(context->working_directory);
e7f1e7c6 3308
fa97f630 3309 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3310 d = wd;
fa97f630 3311 else
3b0e5bb5 3312 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3313
376fecf6
LP
3314 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3315 *exit_status = EXIT_CHDIR;
2b3c1b9e 3316 return -errno;
376fecf6 3317 }
e7f1e7c6
DH
3318
3319 return 0;
3320}
3321
fa97f630
JB
3322static int apply_root_directory(
3323 const ExecContext *context,
3324 const ExecParameters *params,
3325 const bool needs_mount_ns,
3326 int *exit_status) {
3327
3328 assert(context);
3329 assert(exit_status);
3330
5b10116e 3331 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3332 if (!needs_mount_ns && context->root_directory)
3333 if (chroot(context->root_directory) < 0) {
3334 *exit_status = EXIT_CHROOT;
3335 return -errno;
3336 }
fa97f630
JB
3337
3338 return 0;
3339}
3340
b1edf445 3341static int setup_keyring(
34cf6c43 3342 const Unit *u,
b1edf445
LP
3343 const ExecContext *context,
3344 const ExecParameters *p,
3345 uid_t uid, gid_t gid) {
3346
74dd6b51 3347 key_serial_t keyring;
e64c2d0b
DJL
3348 int r = 0;
3349 uid_t saved_uid;
3350 gid_t saved_gid;
74dd6b51
LP
3351
3352 assert(u);
b1edf445 3353 assert(context);
74dd6b51
LP
3354 assert(p);
3355
3356 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3357 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3358 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3359 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3360 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3361 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3362
b1edf445
LP
3363 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3364 return 0;
3365
e64c2d0b
DJL
3366 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3367 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3368 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3369 * & group is just as nasty as acquiring a reference to the user keyring. */
3370
3371 saved_uid = getuid();
3372 saved_gid = getgid();
3373
3374 if (gid_is_valid(gid) && gid != saved_gid) {
3375 if (setregid(gid, -1) < 0)
3376 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3377 }
3378
3379 if (uid_is_valid(uid) && uid != saved_uid) {
3380 if (setreuid(uid, -1) < 0) {
3381 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3382 goto out;
3383 }
3384 }
3385
74dd6b51
LP
3386 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3387 if (keyring == -1) {
3388 if (errno == ENOSYS)
8002fb97 3389 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3390 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3391 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3392 else if (errno == EDQUOT)
8002fb97 3393 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3394 else
e64c2d0b 3395 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3396
e64c2d0b 3397 goto out;
74dd6b51
LP
3398 }
3399
e64c2d0b
DJL
3400 /* When requested link the user keyring into the session keyring. */
3401 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3402
3403 if (keyctl(KEYCTL_LINK,
3404 KEY_SPEC_USER_KEYRING,
3405 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3406 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3407 goto out;
3408 }
3409 }
3410
3411 /* Restore uid/gid back */
3412 if (uid_is_valid(uid) && uid != saved_uid) {
3413 if (setreuid(saved_uid, -1) < 0) {
3414 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3415 goto out;
3416 }
3417 }
3418
3419 if (gid_is_valid(gid) && gid != saved_gid) {
3420 if (setregid(saved_gid, -1) < 0)
3421 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3422 }
3423
3424 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3425 if (!sd_id128_is_null(u->invocation_id)) {
3426 key_serial_t key;
3427
3428 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3429 if (key == -1)
8002fb97 3430 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3431 else {
3432 if (keyctl(KEYCTL_SETPERM, key,
3433 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3434 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3435 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3436 }
3437 }
3438
e64c2d0b 3439out:
37b22b3b 3440 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3441 /* no extra logging, as only the first already reported error matters */
3442 if (getuid() != saved_uid)
3443 (void) setreuid(saved_uid, -1);
b1edf445 3444
e64c2d0b
DJL
3445 if (getgid() != saved_gid)
3446 (void) setregid(saved_gid, -1);
b1edf445 3447
e64c2d0b 3448 return r;
74dd6b51
LP
3449}
3450
/* Appends each non-negative fd of the two-element 'pair' to 'array', in order, advancing the element
 * counter *n for every fd added. The caller must make sure 'array' has room for up to two more
 * entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue; /* this end is not open, nothing to record */

                array[*n] = pair[i];
                (*n)++;
        }
}
3461
a34ceba6
LP
3462static int close_remaining_fds(
3463 const ExecParameters *params,
34cf6c43
YW
3464 const ExecRuntime *runtime,
3465 const DynamicCreds *dcreds,
00d9ef85 3466 int user_lookup_fd,
a34ceba6 3467 int socket_fd,
5b8d1f6b 3468 const int *fds, size_t n_fds) {
a34ceba6 3469
da6053d0 3470 size_t n_dont_close = 0;
00d9ef85 3471 int dont_close[n_fds + 12];
a34ceba6
LP
3472
3473 assert(params);
3474
3475 if (params->stdin_fd >= 0)
3476 dont_close[n_dont_close++] = params->stdin_fd;
3477 if (params->stdout_fd >= 0)
3478 dont_close[n_dont_close++] = params->stdout_fd;
3479 if (params->stderr_fd >= 0)
3480 dont_close[n_dont_close++] = params->stderr_fd;
3481
3482 if (socket_fd >= 0)
3483 dont_close[n_dont_close++] = socket_fd;
3484 if (n_fds > 0) {
3485 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3486 n_dont_close += n_fds;
3487 }
3488
a70581ff 3489 if (runtime) {
29206d46 3490 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3491 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3492 }
29206d46
LP
3493
3494 if (dcreds) {
3495 if (dcreds->user)
3496 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3497 if (dcreds->group)
3498 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3499 }
3500
00d9ef85
LP
3501 if (user_lookup_fd >= 0)
3502 dont_close[n_dont_close++] = user_lookup_fd;
3503
a34ceba6
LP
3504 return close_all_fds(dont_close, n_dont_close);
3505}
3506
00d9ef85
LP
3507static int send_user_lookup(
3508 Unit *unit,
3509 int user_lookup_fd,
3510 uid_t uid,
3511 gid_t gid) {
3512
3513 assert(unit);
3514
3515 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3516 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3517 * specified. */
3518
3519 if (user_lookup_fd < 0)
3520 return 0;
3521
3522 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3523 return 0;
3524
3525 if (writev(user_lookup_fd,
3526 (struct iovec[]) {
e6a7ec4b
LP
3527 IOVEC_INIT(&uid, sizeof(uid)),
3528 IOVEC_INIT(&gid, sizeof(gid)),
3529 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3530 return -errno;
3531
3532 return 0;
3533}
3534
6732edab
LP
3535static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3536 int r;
3537
3538 assert(c);
3539 assert(home);
3540 assert(buf);
3541
3542 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3543
3544 if (*home)
3545 return 0;
3546
3547 if (!c->working_directory_home)
3548 return 0;
3549
6732edab
LP
3550 r = get_home_dir(buf);
3551 if (r < 0)
3552 return r;
3553
3554 *home = *buf;
3555 return 1;
3556}
3557
da50b85a
LP
3558static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3559 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3560 int r;
3561
3562 assert(c);
3563 assert(p);
3564 assert(ret);
3565
3566 assert(c->dynamic_user);
3567
3568 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3569 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3570 * directories. */
3571
5b10116e 3572 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3573 char **i;
3574
3575 if (t == EXEC_DIRECTORY_CONFIGURATION)
3576 continue;
3577
3578 if (!p->prefix[t])
3579 continue;
3580
3581 STRV_FOREACH(i, c->directories[t].paths) {
3582 char *e;
3583
494d0247 3584 if (exec_directory_is_private(c, t))
657ee2d8 3585 e = path_join(p->prefix[t], "private", *i);
494d0247
YW
3586 else
3587 e = path_join(p->prefix[t], *i);
da50b85a
LP
3588 if (!e)
3589 return -ENOMEM;
3590
3591 r = strv_consume(&list, e);
3592 if (r < 0)
3593 return r;
3594 }
3595 }
3596
ae2a15bc 3597 *ret = TAKE_PTR(list);
da50b85a
LP
3598
3599 return 0;
3600}
3601
34cf6c43
YW
3602static char *exec_command_line(char **argv);
3603
78f93209
LP
3604static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3605 bool using_subcgroup;
3606 char *p;
3607
3608 assert(params);
3609 assert(ret);
3610
3611 if (!params->cgroup_path)
3612 return -EINVAL;
3613
3614 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3615 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3616 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3617 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3618 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3619 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3620 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3621 * flag, which is only passed for the former statements, not for the latter. */
3622
3623 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3624 if (using_subcgroup)
657ee2d8 3625 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3626 else
3627 p = strdup(params->cgroup_path);
3628 if (!p)
3629 return -ENOMEM;
3630
3631 *ret = p;
3632 return using_subcgroup;
3633}
3634
e2b2fb7f
MS
3635static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3636 _cleanup_(cpu_set_reset) CPUSet s = {};
3637 int r;
3638
3639 assert(c);
3640 assert(ret);
3641
3642 if (!c->numa_policy.nodes.set) {
3643 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3644 return 0;
3645 }
3646
3647 r = numa_to_cpu_set(&c->numa_policy, &s);
3648 if (r < 0)
3649 return r;
3650
3651 cpu_set_reset(ret);
3652
3653 return cpu_set_add_all(ret, &s);
3654}
3655
3656bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3657 assert(c);
3658
3659 return c->cpu_affinity_from_numa;
3660}
3661
1da37e58
ZJS
3662static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3663 int r;
3664
3665 assert(fds);
3666 assert(n_fds);
3667 assert(*n_fds < fds_size);
3668 assert(ret_fd);
3669
3670 if (fd < 0) {
3671 *ret_fd = -1;
3672 return 0;
3673 }
3674
3675 if (fd < 3 + (int) *n_fds) {
3676 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3677 * the fds we pass to the process (or which are closed only during execve). */
3678
3679 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3680 if (r < 0)
3681 return -errno;
3682
3683 CLOSE_AND_REPLACE(fd, r);
3684 }
3685
3686 *ret_fd = fds[*n_fds] = fd;
3687 (*n_fds) ++;
3688 return 1;
3689}
3690
ff0af2a1 3691static int exec_child(
f2341e0a 3692 Unit *unit,
34cf6c43 3693 const ExecCommand *command,
ff0af2a1
LP
3694 const ExecContext *context,
3695 const ExecParameters *params,
3696 ExecRuntime *runtime,
29206d46 3697 DynamicCreds *dcreds,
ff0af2a1 3698 int socket_fd,
2caa38e9 3699 const int named_iofds[static 3],
4c47affc 3700 int *fds,
da6053d0 3701 size_t n_socket_fds,
25b583d7 3702 size_t n_storage_fds,
ff0af2a1 3703 char **files_env,
00d9ef85 3704 int user_lookup_fd,
12145637 3705 int *exit_status) {
d35fbf6b 3706
7ca69792 3707 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3708 int r, ngids = 0, exec_fd;
4d885bd3
DH
3709 _cleanup_free_ gid_t *supplementary_gids = NULL;
3710 const char *username = NULL, *groupname = NULL;
5686391b 3711 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3712 const char *home = NULL, *shell = NULL;
7ca69792 3713 char **final_argv = NULL;
7bce046b
LP
3714 dev_t journal_stream_dev = 0;
3715 ino_t journal_stream_ino = 0;
5749f855 3716 bool userns_set_up = false;
165a31c0
LP
3717 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3718 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3719 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3720 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3721#if HAVE_SELINUX
7f59dd35 3722 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3723 bool use_selinux = false;
ecfbc84f 3724#endif
f9fa32f0 3725#if ENABLE_SMACK
43b1f709 3726 bool use_smack = false;
ecfbc84f 3727#endif
349cc4a5 3728#if HAVE_APPARMOR
43b1f709 3729 bool use_apparmor = false;
ecfbc84f 3730#endif
5749f855
AZ
3731 uid_t saved_uid = getuid();
3732 gid_t saved_gid = getgid();
fed1e721
LP
3733 uid_t uid = UID_INVALID;
3734 gid_t gid = GID_INVALID;
1da37e58
ZJS
3735 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3736 n_keep_fds; /* total number of fds not to close */
165a31c0 3737 int secure_bits;
afb11bf1
DG
3738 _cleanup_free_ gid_t *gids_after_pam = NULL;
3739 int ngids_after_pam = 0;
034c6ed7 3740
f2341e0a 3741 assert(unit);
5cb5a6ff
LP
3742 assert(command);
3743 assert(context);
d35fbf6b 3744 assert(params);
ff0af2a1 3745 assert(exit_status);
d35fbf6b
DM
3746
3747 rename_process_from_path(command->path);
3748
9c274488
LP
3749 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3750 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3751 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3752 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3753 SIGNALS_IGNORE);
d35fbf6b
DM
3754
3755 if (context->ignore_sigpipe)
9c274488 3756 (void) ignore_signals(SIGPIPE);
d35fbf6b 3757
ff0af2a1
LP
3758 r = reset_signal_mask();
3759 if (r < 0) {
3760 *exit_status = EXIT_SIGNAL_MASK;
12145637 3761 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3762 }
034c6ed7 3763
d35fbf6b
DM
3764 if (params->idle_pipe)
3765 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3766
2c027c62
LP
3767 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3768 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3769 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3770 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3771
d35fbf6b 3772 log_forget_fds();
2c027c62 3773 log_set_open_when_needed(true);
4f2d528d 3774
40a80078
LP
3775 /* In case anything used libc syslog(), close this here, too */
3776 closelog();
3777
b83d5050 3778 int keep_fds[n_fds + 2];
1da37e58
ZJS
3779 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3780 n_keep_fds = n_fds;
3781
3782 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3783 if (r < 0) {
3784 *exit_status = EXIT_FDS;
3785 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3786 }
3787
3788 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3789 if (r < 0) {
3790 *exit_status = EXIT_FDS;
12145637 3791 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3792 }
3793
0af07108
ZJS
3794 if (!context->same_pgrp &&
3795 setsid() < 0) {
3796 *exit_status = EXIT_SETSID;
3797 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3798 }
9e2f7c11 3799
1e22b5cd 3800 exec_context_tty_reset(context, params);
d35fbf6b 3801
c891efaf 3802 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 3803 const char *vc = params->confirm_spawn;
3b20f877
FB
3804 _cleanup_free_ char *cmdline = NULL;
3805
ee39ca20 3806 cmdline = exec_command_line(command->argv);
3b20f877 3807 if (!cmdline) {
0460aa5c 3808 *exit_status = EXIT_MEMORY;
12145637 3809 return log_oom();
3b20f877 3810 }
d35fbf6b 3811
eedf223a 3812 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
3813 if (r != CONFIRM_EXECUTE) {
3814 if (r == CONFIRM_PRETEND_SUCCESS) {
3815 *exit_status = EXIT_SUCCESS;
3816 return 0;
3817 }
ff0af2a1 3818 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
3819 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3820 "Execution cancelled by the user");
d35fbf6b
DM
3821 }
3822 }
1a63a750 3823
d521916d
LP
3824 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3825 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3826 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3827 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3828 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3829 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3830 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3831 *exit_status = EXIT_MEMORY;
3832 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3833 }
3834
29206d46 3835 if (context->dynamic_user && dcreds) {
da50b85a 3836 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 3837
d521916d 3838 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 3839 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
3840 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3841 *exit_status = EXIT_USER;
12145637 3842 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
3843 }
3844
da50b85a
LP
3845 r = compile_suggested_paths(context, params, &suggested_paths);
3846 if (r < 0) {
3847 *exit_status = EXIT_MEMORY;
3848 return log_oom();
3849 }
3850
3851 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
3852 if (r < 0) {
3853 *exit_status = EXIT_USER;
d85ff944
YW
3854 if (r == -EILSEQ)
3855 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3856 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 3857 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 3858 }
524daa8c 3859
70dd455c 3860 if (!uid_is_valid(uid)) {
29206d46 3861 *exit_status = EXIT_USER;
d85ff944 3862 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
3863 }
3864
3865 if (!gid_is_valid(gid)) {
3866 *exit_status = EXIT_USER;
d85ff944 3867 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 3868 }
5bc7452b 3869
29206d46
LP
3870 if (dcreds->user)
3871 username = dcreds->user->name;
3872
3873 } else {
4d885bd3
DH
3874 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3875 if (r < 0) {
3876 *exit_status = EXIT_USER;
12145637 3877 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 3878 }
5bc7452b 3879
4d885bd3
DH
3880 r = get_fixed_group(context, &groupname, &gid);
3881 if (r < 0) {
3882 *exit_status = EXIT_GROUP;
12145637 3883 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 3884 }
cdc5d5c5 3885 }
29206d46 3886
cdc5d5c5
DH
3887 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3888 r = get_supplementary_groups(context, username, groupname, gid,
3889 &supplementary_gids, &ngids);
3890 if (r < 0) {
3891 *exit_status = EXIT_GROUP;
12145637 3892 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 3893 }
5bc7452b 3894
00d9ef85
LP
3895 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3896 if (r < 0) {
3897 *exit_status = EXIT_USER;
12145637 3898 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
3899 }
3900
3901 user_lookup_fd = safe_close(user_lookup_fd);
3902
6732edab
LP
3903 r = acquire_home(context, uid, &home, &home_buffer);
3904 if (r < 0) {
3905 *exit_status = EXIT_CHDIR;
12145637 3906 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
3907 }
3908
d35fbf6b
DM
3909 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3910 * must be sure to drop O_NONBLOCK */
3911 if (socket_fd >= 0)
a34ceba6 3912 (void) fd_nonblock(socket_fd, false);
acbb0225 3913
4c70a4a7
MS
3914 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3915 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3916 if (params->cgroup_path) {
3917 _cleanup_free_ char *p = NULL;
3918
3919 r = exec_parameters_get_cgroup_path(params, &p);
3920 if (r < 0) {
3921 *exit_status = EXIT_CGROUP;
3922 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3923 }
3924
3925 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3926 if (r < 0) {
3927 *exit_status = EXIT_CGROUP;
3928 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3929 }
3930 }
3931
a8d08f39 3932 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 3933 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
3934 if (r < 0) {
3935 *exit_status = EXIT_NETWORK;
3936 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3937 }
3938 }
3939
a70581ff
XR
3940 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
3941 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
3942 if (r < 0) {
3943 *exit_status = EXIT_NAMESPACE;
3944 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
3945 }
3946 }
3947
52c239d7 3948 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
3949 if (r < 0) {
3950 *exit_status = EXIT_STDIN;
12145637 3951 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 3952 }
034c6ed7 3953
52c239d7 3954 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3955 if (r < 0) {
3956 *exit_status = EXIT_STDOUT;
12145637 3957 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
3958 }
3959
52c239d7 3960 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3961 if (r < 0) {
3962 *exit_status = EXIT_STDERR;
12145637 3963 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
3964 }
3965
d35fbf6b 3966 if (context->oom_score_adjust_set) {
9f8168eb
LP
3967 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3968 * prohibit write access to this file, and we shouldn't trip up over that. */
3969 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 3970 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 3971 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 3972 else if (r < 0) {
ff0af2a1 3973 *exit_status = EXIT_OOM_ADJUST;
12145637 3974 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 3975 }
d35fbf6b
DM
3976 }
3977
ad21e542
ZJS
3978 if (context->coredump_filter_set) {
3979 r = set_coredump_filter(context->coredump_filter);
3980 if (ERRNO_IS_PRIVILEGE(r))
3981 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
3982 else if (r < 0)
3983 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
3984 }
3985
39090201
DJL
3986 if (context->nice_set) {
3987 r = setpriority_closest(context->nice);
3988 if (r < 0)
3989 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
3990 }
613b411c 3991
d35fbf6b
DM
3992 if (context->cpu_sched_set) {
3993 struct sched_param param = {
3994 .sched_priority = context->cpu_sched_priority,
3995 };
3996
ff0af2a1
LP
3997 r = sched_setscheduler(0,
3998 context->cpu_sched_policy |
3999 (context->cpu_sched_reset_on_fork ?
4000 SCHED_RESET_ON_FORK : 0),
4001 &param);
4002 if (r < 0) {
4003 *exit_status = EXIT_SETSCHEDULER;
12145637 4004 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4005 }
d35fbf6b 4006 }
fc9b2a84 4007
e2b2fb7f
MS
4008 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4009 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4010 const CPUSet *cpu_set;
4011
4012 if (context->cpu_affinity_from_numa) {
4013 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4014 if (r < 0) {
4015 *exit_status = EXIT_CPUAFFINITY;
4016 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4017 }
4018
4019 cpu_set = &converted_cpu_set;
4020 } else
4021 cpu_set = &context->cpu_set;
4022
4023 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4024 *exit_status = EXIT_CPUAFFINITY;
12145637 4025 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4026 }
e2b2fb7f 4027 }
034c6ed7 4028
b070c7c0
MS
4029 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4030 r = apply_numa_policy(&context->numa_policy);
4031 if (r == -EOPNOTSUPP)
33fe9e3f 4032 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4033 else if (r < 0) {
4034 *exit_status = EXIT_NUMA_POLICY;
4035 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4036 }
4037 }
4038
d35fbf6b
DM
4039 if (context->ioprio_set)
4040 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4041 *exit_status = EXIT_IOPRIO;
12145637 4042 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4043 }
da726a4d 4044
d35fbf6b
DM
4045 if (context->timer_slack_nsec != NSEC_INFINITY)
4046 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4047 *exit_status = EXIT_TIMERSLACK;
12145637 4048 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4049 }
9eba9da4 4050
21022b9d
LP
4051 if (context->personality != PERSONALITY_INVALID) {
4052 r = safe_personality(context->personality);
4053 if (r < 0) {
ff0af2a1 4054 *exit_status = EXIT_PERSONALITY;
12145637 4055 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4056 }
21022b9d 4057 }
94f04347 4058
d35fbf6b 4059 if (context->utmp_id)
df0ff127 4060 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
6a93917d 4061 context->tty_path,
023a4f67
LP
4062 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4063 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4064 USER_PROCESS,
6a93917d 4065 username);
d35fbf6b 4066
08f67696 4067 if (uid_is_valid(uid)) {
ff0af2a1
LP
4068 r = chown_terminal(STDIN_FILENO, uid);
4069 if (r < 0) {
4070 *exit_status = EXIT_STDIN;
12145637 4071 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4072 }
d35fbf6b 4073 }
8e274523 4074
4e1dfa45 4075 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4076 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4077 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4078 * touch a single hierarchy too. */
584b8688 4079 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4080 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4081 if (r < 0) {
4082 *exit_status = EXIT_CGROUP;
12145637 4083 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4084 }
d35fbf6b 4085 }
034c6ed7 4086
5b10116e 4087 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
8679efde 4088 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
12145637
LP
4089 if (r < 0)
4090 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4091 }
94f04347 4092
bb0c0d6f
LP
4093 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4094 r = setup_credentials(context, params, unit->id, uid);
4095 if (r < 0) {
4096 *exit_status = EXIT_CREDENTIALS;
4097 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4098 }
4099 }
4100
7bce046b 4101 r = build_environment(
fd63e712 4102 unit,
7bce046b
LP
4103 context,
4104 params,
4105 n_fds,
4106 home,
4107 username,
4108 shell,
4109 journal_stream_dev,
4110 journal_stream_ino,
4111 &our_env);
2065ca69
JW
4112 if (r < 0) {
4113 *exit_status = EXIT_MEMORY;
12145637 4114 return log_oom();
2065ca69
JW
4115 }
4116
4117 r = build_pass_environment(context, &pass_env);
4118 if (r < 0) {
4119 *exit_status = EXIT_MEMORY;
12145637 4120 return log_oom();
2065ca69
JW
4121 }
4122
4123 accum_env = strv_env_merge(5,
4124 params->environment,
4125 our_env,
4126 pass_env,
4127 context->environment,
44e5d006 4128 files_env);
2065ca69
JW
4129 if (!accum_env) {
4130 *exit_status = EXIT_MEMORY;
12145637 4131 return log_oom();
2065ca69 4132 }
1280503b 4133 accum_env = strv_env_clean(accum_env);
2065ca69 4134
096424d1 4135 (void) umask(context->umask);
b213e1c1 4136
b1edf445 4137 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4138 if (r < 0) {
4139 *exit_status = EXIT_KEYRING;
12145637 4140 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4141 }
4142
165a31c0 4143 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4144 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4145
165a31c0
LP
4146 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4147 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4148
165a31c0
LP
4149 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4150 if (needs_ambient_hack)
4151 needs_setuid = false;
4152 else
4153 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4154
4155 if (needs_sandboxing) {
7f18ef0a
FK
4156 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4157 * present. The actual MAC context application will happen later, as late as possible, to avoid
4158 * impacting our own code paths. */
4159
349cc4a5 4160#if HAVE_SELINUX
43b1f709 4161 use_selinux = mac_selinux_use();
7f18ef0a 4162#endif
f9fa32f0 4163#if ENABLE_SMACK
43b1f709 4164 use_smack = mac_smack_use();
7f18ef0a 4165#endif
349cc4a5 4166#if HAVE_APPARMOR
43b1f709 4167 use_apparmor = mac_apparmor_use();
7f18ef0a 4168#endif
165a31c0 4169 }
7f18ef0a 4170
ce932d2d
LP
4171 if (needs_sandboxing) {
4172 int which_failed;
4173
4174 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4175 * is set here. (See below.) */
4176
4177 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4178 if (r < 0) {
4179 *exit_status = EXIT_LIMITS;
4180 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4181 }
4182 }
4183
0af07108 4184 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4185 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4186 * wins here. (See above.) */
4187
1da37e58 4188 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4189 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4190 if (r < 0) {
4191 *exit_status = EXIT_PAM;
4192 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4193 }
ac45f971 4194
0af07108
ZJS
4195 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4196 if (ngids_after_pam < 0) {
4197 *exit_status = EXIT_MEMORY;
4198 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4199 }
b213e1c1 4200 }
5749f855 4201
0af07108 4202 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4203 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4204 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4205 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4206
4207 userns_set_up = true;
4208 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4209 if (r < 0) {
4210 *exit_status = EXIT_USER;
4211 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4212 }
4213 }
4214
a8d08f39
LP
4215 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4216
6e2d7c4f 4217 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4218 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4219 if (r == -EPERM)
4220 log_unit_warning_errno(unit, r,
4221 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4222 else if (r < 0) {
6e2d7c4f
MS
4223 *exit_status = EXIT_NETWORK;
4224 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4225 }
a8d08f39
LP
4226 } else if (context->network_namespace_path) {
4227 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4228 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4229 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4230 } else
4231 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4232 }
169c1bda 4233
a70581ff
XR
4234 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4235
4236 if (ns_type_supported(NAMESPACE_IPC)) {
4237 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4238 if (r == -EPERM)
4239 log_unit_warning_errno(unit, r,
4240 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4241 else if (r < 0) {
4242 *exit_status = EXIT_NAMESPACE;
4243 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4244 }
4245 } else if (context->ipc_namespace_path) {
4246 *exit_status = EXIT_NAMESPACE;
4247 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4248 "IPCNamespacePath= is not supported, refusing.");
4249 } else
4250 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4251 }
4252
ee818b89 4253 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
ee818b89 4254 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4255 _cleanup_free_ char *error_path = NULL;
4256
9f71ba8d 4257 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4258 if (r < 0) {
4259 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4260 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4261 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4262 }
d35fbf6b 4263 }
81a2b7ce 4264
daf8f72b
LP
4265 if (needs_sandboxing) {
4266 r = apply_protect_hostname(unit, context, exit_status);
4267 if (r < 0)
4268 return r;
aecd5ac6
TM
4269 }
4270
5749f855
AZ
4271 /* Drop groups as early as possible.
4272 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4273 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4274 if (needs_setuid) {
afb11bf1
DG
4275 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4276 int ngids_to_enforce = 0;
4277
4278 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4279 ngids,
4280 gids_after_pam,
4281 ngids_after_pam,
4282 &gids_to_enforce);
4283 if (ngids_to_enforce < 0) {
4284 *exit_status = EXIT_MEMORY;
4285 return log_unit_error_errno(unit,
4286 ngids_to_enforce,
4287 "Failed to merge group lists. Group membership might be incorrect: %m");
4288 }
4289
4290 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4291 if (r < 0) {
4292 *exit_status = EXIT_GROUP;
12145637 4293 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4294 }
165a31c0 4295 }
096424d1 4296
5749f855
AZ
4297 /* If the user namespace was not set up above, try to do it now.
4298 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4299 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4300 * case of mount namespaces being less privileged when the mount point list is copied from a
4301 * different user namespace). */
9008e1ac 4302
5749f855
AZ
4303 if (needs_sandboxing && context->private_users && !userns_set_up) {
4304 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4305 if (r < 0) {
4306 *exit_status = EXIT_USER;
4307 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4308 }
4309 }
4310
9f71ba8d
ZJS
4311 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4312 * shall execute. */
4313
4314 _cleanup_free_ char *executable = NULL;
b83d5050
ZJS
4315 _cleanup_close_ int executable_fd = -1;
4316 r = find_executable_full(command->path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4317 if (r < 0) {
4318 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4319 log_unit_struct_errno(unit, LOG_INFO, r,
4320 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4321 LOG_UNIT_INVOCATION_ID(unit),
4322 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4323 command->path),
4324 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4325 return 0;
4326 }
4327
4328 *exit_status = EXIT_EXEC;
c2503e35
RH
4329
4330 return log_unit_struct_errno(unit, LOG_INFO, r,
4331 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4332 LOG_UNIT_INVOCATION_ID(unit),
4333 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4334 command->path),
4335 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4336 }
4337
b83d5050
ZJS
4338 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4339 if (r < 0) {
4340 *exit_status = EXIT_FDS;
4341 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4342 }
4343
9f71ba8d
ZJS
4344#if HAVE_SELINUX
4345 if (needs_sandboxing && use_selinux && params->selinux_context_net && socket_fd >= 0) {
4346 r = mac_selinux_get_child_mls_label(socket_fd, executable, context->selinux_context, &mac_selinux_context_net);
4347 if (r < 0) {
4348 *exit_status = EXIT_SELINUX_CONTEXT;
4349 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4350 }
4351 }
4352#endif
4353
165a31c0 4354 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4355 * more aggressive this time since we don't need socket_fd and the netns and ipcns fds anymore. We do keep the exec_fd
5686391b
LP
4356 * however if we have it as we want to keep it open until the final execve(). */
4357
1da37e58 4358 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4359 if (r >= 0)
4360 r = shift_fds(fds, n_fds);
4361 if (r >= 0)
25b583d7 4362 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4363 if (r < 0) {
4364 *exit_status = EXIT_FDS;
12145637 4365 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4366 }
e66cf1a3 4367
5686391b
LP
4368 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4369 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4370 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4371 * came this far. */
4372
165a31c0 4373 secure_bits = context->secure_bits;
e66cf1a3 4374
165a31c0
LP
4375 if (needs_sandboxing) {
4376 uint64_t bset;
e66cf1a3 4377
ce932d2d
LP
4378 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4379 * requested. (Note this is placed after the general resource limit initialization, see
4380 * above, in order to take precedence.) */
f4170c67
LP
4381 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4382 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4383 *exit_status = EXIT_LIMITS;
12145637 4384 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4385 }
4386 }
4387
37ac2744
JB
4388#if ENABLE_SMACK
4389 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4390 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4391 if (use_smack) {
b83d5050 4392 r = setup_smack(context, executable_fd);
37ac2744
JB
4393 if (r < 0) {
4394 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4395 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4396 }
4397 }
4398#endif
4399
165a31c0
LP
4400 bset = context->capability_bounding_set;
4401 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4402 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4403 * instead of us doing that */
4404 if (needs_ambient_hack)
4405 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4406 (UINT64_C(1) << CAP_SETUID) |
4407 (UINT64_C(1) << CAP_SETGID);
4408
4409 if (!cap_test_all(bset)) {
4410 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4411 if (r < 0) {
4412 *exit_status = EXIT_CAPABILITIES;
12145637 4413 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4414 }
4c2630eb 4415 }
3b8bddde 4416
16fcb191
TK
4417 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4418 * keep-caps set.
4419 * To be able to raise the ambient capabilities after setresuid() they have to be
4420 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4421 * After setresuid() the ambient capabilities can be raised as they are present in
4422 * the permitted and inheritable set. However it is possible that someone wants to
4423 * set ambient capabilities without changing the user, so we also set the ambient
4424 * capabilities here.
4425 * The requested ambient capabilities are raised in the inheritable set if the
4426 * second argument is true. */
943800f4 4427 if (!needs_ambient_hack) {
755d4b67
IP
4428 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4429 if (r < 0) {
4430 *exit_status = EXIT_CAPABILITIES;
12145637 4431 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4432 }
755d4b67 4433 }
165a31c0 4434 }
755d4b67 4435
fa97f630
JB
4436 /* chroot to root directory first, before we lose the ability to chroot */
4437 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4438 if (r < 0)
4439 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4440
165a31c0 4441 if (needs_setuid) {
08f67696 4442 if (uid_is_valid(uid)) {
ff0af2a1
LP
4443 r = enforce_user(context, uid);
4444 if (r < 0) {
4445 *exit_status = EXIT_USER;
12145637 4446 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4447 }
165a31c0
LP
4448
4449 if (!needs_ambient_hack &&
4450 context->capability_ambient_set != 0) {
755d4b67 4451
16fcb191 4452 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4453 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4454 if (r < 0) {
4455 *exit_status = EXIT_CAPABILITIES;
12145637 4456 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4457 }
755d4b67 4458 }
5b6319dc 4459 }
165a31c0 4460 }
d35fbf6b 4461
56ef8db9
JB
4462 /* Apply working directory here, because the working directory might be on NFS and only the user running
4463 * this service might have the correct privilege to change to the working directory */
fa97f630 4464 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4465 if (r < 0)
4466 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4467
165a31c0 4468 if (needs_sandboxing) {
37ac2744 4469 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4470 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4471 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4472 * are restricted. */
4473
349cc4a5 4474#if HAVE_SELINUX
43b1f709 4475 if (use_selinux) {
5cd9cd35
LP
4476 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4477
4478 if (exec_context) {
4479 r = setexeccon(exec_context);
4480 if (r < 0) {
4481 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 4482 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
4483 }
4484 }
4485 }
4486#endif
4487
349cc4a5 4488#if HAVE_APPARMOR
43b1f709 4489 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4490 r = aa_change_onexec(context->apparmor_profile);
4491 if (r < 0 && !context->apparmor_profile_ignore) {
4492 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4493 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4494 }
4495 }
4496#endif
4497
165a31c0 4498 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4499 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4500 * CAP_SETPCAP. */
4501 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4502 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4503 * effective set here.
4504 * The effective set is overwritten during execve with the following values:
4505 * - ambient set (for non-root processes)
4506 * - (inheritable | bounding) set (for root processes)
4507 *
4508 * Hence there is no security impact to raise it in the effective set before execve
4509 */
4510 r = capability_gain_cap_setpcap(NULL);
4511 if (r < 0) {
4512 *exit_status = EXIT_CAPABILITIES;
4513 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4514 }
755d4b67 4515 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4516 *exit_status = EXIT_SECUREBITS;
12145637 4517 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4518 }
dbdc4098 4519 }
5b6319dc 4520
59eeb84b 4521 if (context_has_no_new_privileges(context))
d35fbf6b 4522 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4523 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4524 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4525 }
4526
349cc4a5 4527#if HAVE_SECCOMP
469830d1
LP
4528 r = apply_address_families(unit, context);
4529 if (r < 0) {
4530 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4531 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4532 }
04aa0cb9 4533
469830d1
LP
4534 r = apply_memory_deny_write_execute(unit, context);
4535 if (r < 0) {
4536 *exit_status = EXIT_SECCOMP;
12145637 4537 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4538 }
f4170c67 4539
469830d1
LP
4540 r = apply_restrict_realtime(unit, context);
4541 if (r < 0) {
4542 *exit_status = EXIT_SECCOMP;
12145637 4543 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4544 }
4545
f69567cb
LP
4546 r = apply_restrict_suid_sgid(unit, context);
4547 if (r < 0) {
4548 *exit_status = EXIT_SECCOMP;
4549 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4550 }
4551
add00535
LP
4552 r = apply_restrict_namespaces(unit, context);
4553 if (r < 0) {
4554 *exit_status = EXIT_SECCOMP;
12145637 4555 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4556 }
4557
469830d1
LP
4558 r = apply_protect_sysctl(unit, context);
4559 if (r < 0) {
4560 *exit_status = EXIT_SECCOMP;
12145637 4561 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4562 }
4563
469830d1
LP
4564 r = apply_protect_kernel_modules(unit, context);
4565 if (r < 0) {
4566 *exit_status = EXIT_SECCOMP;
12145637 4567 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4568 }
4569
84703040
KK
4570 r = apply_protect_kernel_logs(unit, context);
4571 if (r < 0) {
4572 *exit_status = EXIT_SECCOMP;
4573 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4574 }
4575
fc64760d
KK
4576 r = apply_protect_clock(unit, context);
4577 if (r < 0) {
4578 *exit_status = EXIT_SECCOMP;
4579 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4580 }
4581
469830d1
LP
4582 r = apply_private_devices(unit, context);
4583 if (r < 0) {
4584 *exit_status = EXIT_SECCOMP;
12145637 4585 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4586 }
4587
4588 r = apply_syscall_archs(unit, context);
4589 if (r < 0) {
4590 *exit_status = EXIT_SECCOMP;
12145637 4591 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4592 }
4593
78e864e5
TM
4594 r = apply_lock_personality(unit, context);
4595 if (r < 0) {
4596 *exit_status = EXIT_SECCOMP;
12145637 4597 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4598 }
4599
9df2cdd8
TM
4600 r = apply_syscall_log(unit, context);
4601 if (r < 0) {
4602 *exit_status = EXIT_SECCOMP;
4603 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4604 }
4605
5cd9cd35
LP
4606 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
4607 * by the filter as little as possible. */
165a31c0 4608 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4609 if (r < 0) {
4610 *exit_status = EXIT_SECCOMP;
12145637 4611 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4612 }
4613#endif
d35fbf6b 4614 }
034c6ed7 4615
00819cc1
LP
4616 if (!strv_isempty(context->unset_environment)) {
4617 char **ee = NULL;
4618
4619 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4620 if (!ee) {
4621 *exit_status = EXIT_MEMORY;
12145637 4622 return log_oom();
00819cc1
LP
4623 }
4624
130d3d22 4625 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4626 }
4627
7ca69792
AZ
4628 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4629 replaced_argv = replace_env_argv(command->argv, accum_env);
4630 if (!replaced_argv) {
4631 *exit_status = EXIT_MEMORY;
4632 return log_oom();
4633 }
4634 final_argv = replaced_argv;
4635 } else
4636 final_argv = command->argv;
034c6ed7 4637
f1d34068 4638 if (DEBUG_LOGGING) {
c2b2df60 4639 _cleanup_free_ char *line = NULL;
81a2b7ce 4640
d35fbf6b 4641 line = exec_command_line(final_argv);
a1230ff9 4642 if (line)
c2503e35
RH
4643 log_unit_struct(unit, LOG_DEBUG,
4644 "EXECUTABLE=%s", executable,
4645 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4646 LOG_UNIT_INVOCATION_ID(unit));
d35fbf6b 4647 }
dd305ec9 4648
5686391b
LP
4649 if (exec_fd >= 0) {
4650 uint8_t hot = 1;
4651
4652 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4653 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4654
4655 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4656 *exit_status = EXIT_EXEC;
4657 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4658 }
4659 }
4660
a6d9111c 4661 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4662
4663 if (exec_fd >= 0) {
4664 uint8_t hot = 0;
4665
4666 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4667 * that POLLHUP on it no longer means execve() succeeded. */
4668
4669 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4670 *exit_status = EXIT_EXEC;
4671 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4672 }
4673 }
12145637 4674
ff0af2a1 4675 *exit_status = EXIT_EXEC;
9f71ba8d 4676 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4677}
81a2b7ce 4678
34cf6c43 4679static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4680static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4681
f2341e0a
LP
4682int exec_spawn(Unit *unit,
4683 ExecCommand *command,
d35fbf6b
DM
4684 const ExecContext *context,
4685 const ExecParameters *params,
4686 ExecRuntime *runtime,
29206d46 4687 DynamicCreds *dcreds,
d35fbf6b 4688 pid_t *ret) {
8351ceae 4689
ee39ca20 4690 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4691 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4692 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4693 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4694 _cleanup_free_ char *line = NULL;
d35fbf6b 4695 pid_t pid;
8351ceae 4696
f2341e0a 4697 assert(unit);
d35fbf6b
DM
4698 assert(command);
4699 assert(context);
4700 assert(ret);
4701 assert(params);
25b583d7 4702 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4703
d35fbf6b
DM
4704 if (context->std_input == EXEC_INPUT_SOCKET ||
4705 context->std_output == EXEC_OUTPUT_SOCKET ||
4706 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4707
d85ff944
YW
4708 if (params->n_socket_fds > 1)
4709 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4710
d85ff944
YW
4711 if (params->n_socket_fds == 0)
4712 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4713
d35fbf6b
DM
4714 socket_fd = params->fds[0];
4715 } else {
4716 socket_fd = -1;
4717 fds = params->fds;
9b141911 4718 n_socket_fds = params->n_socket_fds;
25b583d7 4719 n_storage_fds = params->n_storage_fds;
d35fbf6b 4720 }
94f04347 4721
34cf6c43 4722 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4723 if (r < 0)
4724 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4725
f2341e0a 4726 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4727 if (r < 0)
f2341e0a 4728 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4729
ee39ca20 4730 line = exec_command_line(command->argv);
d35fbf6b
DM
4731 if (!line)
4732 return log_oom();
fab56fc5 4733
9f71ba8d
ZJS
4734 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4735 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4736 mac_selinux_maybe_reload();
4737
c2503e35
RH
4738 log_unit_struct(unit, LOG_DEBUG,
4739 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4740 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4741 the mount namespace in the child, but we want to log
4742 from the parent, so we need to use the (possibly
4743 inaccurate) path here. */
4744 LOG_UNIT_INVOCATION_ID(unit));
12145637 4745
78f93209
LP
4746 if (params->cgroup_path) {
4747 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4748 if (r < 0)
4749 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4750 if (r > 0) { /* We are using a child cgroup */
4751 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4752 if (r < 0)
4753 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
4754
4755 /* Normally we would not propagate the oomd xattrs to children but since we created this
4756 * sub-cgroup internally we should do it. */
4757 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
4758 }
4759 }
4760
d35fbf6b
DM
4761 pid = fork();
4762 if (pid < 0)
74129a12 4763 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
4764
4765 if (pid == 0) {
12145637 4766 int exit_status = EXIT_SUCCESS;
ff0af2a1 4767
f2341e0a
LP
4768 r = exec_child(unit,
4769 command,
ff0af2a1
LP
4770 context,
4771 params,
4772 runtime,
29206d46 4773 dcreds,
ff0af2a1 4774 socket_fd,
52c239d7 4775 named_iofds,
4c47affc 4776 fds,
9b141911 4777 n_socket_fds,
25b583d7 4778 n_storage_fds,
ff0af2a1 4779 files_env,
00d9ef85 4780 unit->manager->user_lookup_fds[1],
12145637
LP
4781 &exit_status);
4782
e1714f02
ZJS
4783 if (r < 0) {
4784 const char *status =
4785 exit_status_to_string(exit_status,
e04ed6db 4786 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 4787
c2503e35
RH
4788 log_unit_struct_errno(unit, LOG_ERR, r,
4789 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4790 LOG_UNIT_INVOCATION_ID(unit),
4791 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4792 status, command->path),
4793 "EXECUTABLE=%s", command->path);
e1714f02 4794 }
4c2630eb 4795
ff0af2a1 4796 _exit(exit_status);
034c6ed7
LP
4797 }
4798
f2341e0a 4799 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 4800
78f93209
LP
4801 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4802 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4803 * process will be killed too). */
4804 if (subcgroup_path)
4805 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 4806
b58b4116 4807 exec_status_start(&command->exec_status, pid);
9fb86720 4808
034c6ed7 4809 *ret = pid;
5cb5a6ff
LP
4810 return 0;
4811}
4812
034c6ed7
LP
4813void exec_context_init(ExecContext *c) {
4814 assert(c);
4815
4c12626c 4816 c->umask = 0022;
9eba9da4 4817 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
94f04347 4818 c->cpu_sched_policy = SCHED_OTHER;
071830ff 4819 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 4820 c->syslog_level_prefix = true;
353e12c2 4821 c->ignore_sigpipe = true;
3a43da28 4822 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 4823 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
4824 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4825 c->directories[t].mode = 0755;
12213aed 4826 c->timeout_clean_usec = USEC_INFINITY;
a103496c 4827 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
4828 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4829 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 4830 c->log_level_max = -1;
005bfaf1
TM
4831#if HAVE_SECCOMP
4832 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4833#endif
b070c7c0 4834 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
4835}
4836
613b411c 4837void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
4838 assert(c);
4839
6796073e
LP
4840 c->environment = strv_free(c->environment);
4841 c->environment_files = strv_free(c->environment_files);
b4c14404 4842 c->pass_environment = strv_free(c->pass_environment);
00819cc1 4843 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 4844
31ce987c 4845 rlimit_free_all(c->rlimit);
034c6ed7 4846
5b10116e 4847 for (size_t l = 0; l < 3; l++) {
52c239d7 4848 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
4849 c->stdio_file[l] = mfree(c->stdio_file[l]);
4850 }
52c239d7 4851
a1e58e8e
LP
4852 c->working_directory = mfree(c->working_directory);
4853 c->root_directory = mfree(c->root_directory);
915e6d16 4854 c->root_image = mfree(c->root_image);
18d73705 4855 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
4856 c->root_hash = mfree(c->root_hash);
4857 c->root_hash_size = 0;
4858 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
4859 c->root_hash_sig = mfree(c->root_hash_sig);
4860 c->root_hash_sig_size = 0;
4861 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 4862 c->root_verity = mfree(c->root_verity);
93f59701 4863 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
4864 c->tty_path = mfree(c->tty_path);
4865 c->syslog_identifier = mfree(c->syslog_identifier);
4866 c->user = mfree(c->user);
4867 c->group = mfree(c->group);
034c6ed7 4868
6796073e 4869 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 4870
a1e58e8e 4871 c->pam_name = mfree(c->pam_name);
5b6319dc 4872
2a624c36
AP
4873 c->read_only_paths = strv_free(c->read_only_paths);
4874 c->read_write_paths = strv_free(c->read_write_paths);
4875 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
4876 c->exec_paths = strv_free(c->exec_paths);
4877 c->no_exec_paths = strv_free(c->no_exec_paths);
82c121a4 4878
d2d6c096 4879 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
4880 c->bind_mounts = NULL;
4881 c->n_bind_mounts = 0;
2abd4e38
YW
4882 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4883 c->temporary_filesystems = NULL;
4884 c->n_temporary_filesystems = 0;
b3d13314 4885 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 4886
0985c7c4 4887 cpu_set_reset(&c->cpu_set);
b070c7c0 4888 numa_policy_reset(&c->numa_policy);
86a3475b 4889
a1e58e8e
LP
4890 c->utmp_id = mfree(c->utmp_id);
4891 c->selinux_context = mfree(c->selinux_context);
4892 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 4893 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 4894
8cfa775f 4895 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
4896 c->syscall_archs = set_free(c->syscall_archs);
4897 c->address_families = set_free(c->address_families);
e66cf1a3 4898
5b10116e
ZJS
4899 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4900 c->directories[t].paths = strv_free(c->directories[t].paths);
d3070fbd
LP
4901
4902 c->log_level_max = -1;
4903
4904 exec_context_free_log_extra_fields(c);
08f3be7a 4905
5ac1530e
ZJS
4906 c->log_ratelimit_interval_usec = 0;
4907 c->log_ratelimit_burst = 0;
90fc172e 4908
08f3be7a
LP
4909 c->stdin_data = mfree(c->stdin_data);
4910 c->stdin_data_size = 0;
a8d08f39
LP
4911
4912 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 4913 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
4914
4915 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f
LP
4916
4917 c->load_credentials = strv_free(c->load_credentials);
4918 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
4919}
4920
34cf6c43 4921int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
4922 char **i;
4923
4924 assert(c);
4925
4926 if (!runtime_prefix)
4927 return 0;
4928
3536f49e 4929 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
c2b2df60 4930 _cleanup_free_ char *p = NULL;
e66cf1a3 4931
494d0247
YW
4932 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4933 p = path_join(runtime_prefix, "private", *i);
4934 else
4935 p = path_join(runtime_prefix, *i);
e66cf1a3
LP
4936 if (!p)
4937 return -ENOMEM;
4938
7bc4bf4a
LP
4939 /* We execute this synchronously, since we need to be sure this is gone when we start the
4940 * service next. */
c6878637 4941 (void) rm_rf(p, REMOVE_ROOT);
e66cf1a3
LP
4942 }
4943
4944 return 0;
5cb5a6ff
LP
4945}
4946
bb0c0d6f
LP
4947int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4948 _cleanup_free_ char *p = NULL;
4949
4950 assert(c);
4951
4952 if (!runtime_prefix || !unit)
4953 return 0;
4954
4955 p = path_join(runtime_prefix, "credentials", unit);
4956 if (!p)
4957 return -ENOMEM;
4958
4959 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
4960 * unmount it, and afterwards remove the mount point */
4961 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
4962 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
4963
4964 return 0;
4965}
4966
34cf6c43 4967static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
4968 assert(c);
4969
a1e58e8e 4970 c->path = mfree(c->path);
6796073e 4971 c->argv = strv_free(c->argv);
43d0fcbd
LP
4972}
4973
da6053d0 4974void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 4975 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
4976 exec_command_done(c+i);
4977}
4978
f1acf85a 4979ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
4980 ExecCommand *i;
4981
4982 while ((i = c)) {
71fda00f 4983 LIST_REMOVE(command, c, i);
43d0fcbd 4984 exec_command_done(i);
5cb5a6ff
LP
4985 free(i);
4986 }
f1acf85a
ZJS
4987
4988 return NULL;
5cb5a6ff
LP
4989}
4990
da6053d0 4991void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 4992 for (size_t i = 0; i < n; i++)
f1acf85a 4993 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
4994}
4995
6a1d4d9f 4996void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 4997 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
4998 exec_status_reset(&c[i].exec_status);
4999}
5000
5001void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5002 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5003 ExecCommand *z;
5004
5005 LIST_FOREACH(command, z, c[i])
5006 exec_status_reset(&z->exec_status);
5007 }
5008}
5009
039f0e70 5010typedef struct InvalidEnvInfo {
34cf6c43 5011 const Unit *unit;
039f0e70
LP
5012 const char *path;
5013} InvalidEnvInfo;
5014
5015static void invalid_env(const char *p, void *userdata) {
5016 InvalidEnvInfo *info = userdata;
5017
f2341e0a 5018 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5019}
5020
52c239d7
LB
5021const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5022 assert(c);
5023
5024 switch (fd_index) {
5073ff6b 5025
52c239d7
LB
5026 case STDIN_FILENO:
5027 if (c->std_input != EXEC_INPUT_NAMED_FD)
5028 return NULL;
5073ff6b 5029
52c239d7 5030 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5031
52c239d7
LB
5032 case STDOUT_FILENO:
5033 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5034 return NULL;
5073ff6b 5035
52c239d7 5036 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5037
52c239d7
LB
5038 case STDERR_FILENO:
5039 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5040 return NULL;
5073ff6b 5041
52c239d7 5042 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5043
52c239d7
LB
5044 default:
5045 return NULL;
5046 }
5047}
5048
2caa38e9
LP
5049static int exec_context_named_iofds(
5050 const ExecContext *c,
5051 const ExecParameters *p,
5052 int named_iofds[static 3]) {
5053
5b10116e 5054 size_t targets;
56fbd561 5055 const char* stdio_fdname[3];
da6053d0 5056 size_t n_fds;
52c239d7
LB
5057
5058 assert(c);
5059 assert(p);
2caa38e9 5060 assert(named_iofds);
52c239d7
LB
5061
5062 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5063 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5064 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5065
5b10116e 5066 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5067 stdio_fdname[i] = exec_context_fdname(c, i);
5068
4c47affc
FB
5069 n_fds = p->n_storage_fds + p->n_socket_fds;
5070
5b10116e 5071 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5072 if (named_iofds[STDIN_FILENO] < 0 &&
5073 c->std_input == EXEC_INPUT_NAMED_FD &&
5074 stdio_fdname[STDIN_FILENO] &&
5075 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5076
52c239d7
LB
5077 named_iofds[STDIN_FILENO] = p->fds[i];
5078 targets--;
56fbd561
ZJS
5079
5080 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5081 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5082 stdio_fdname[STDOUT_FILENO] &&
5083 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5084
52c239d7
LB
5085 named_iofds[STDOUT_FILENO] = p->fds[i];
5086 targets--;
56fbd561
ZJS
5087
5088 } else if (named_iofds[STDERR_FILENO] < 0 &&
5089 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5090 stdio_fdname[STDERR_FILENO] &&
5091 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5092
52c239d7
LB
5093 named_iofds[STDERR_FILENO] = p->fds[i];
5094 targets--;
5095 }
5096
56fbd561 5097 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5098}
5099
34cf6c43 5100static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5101 char **i, **r = NULL;
5102
5103 assert(c);
5104 assert(l);
5105
5106 STRV_FOREACH(i, c->environment_files) {
5107 char *fn;
52511fae 5108 int k;
8c7be95e
LP
5109 bool ignore = false;
5110 char **p;
7fd1b19b 5111 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5112
5113 fn = *i;
5114
5115 if (fn[0] == '-') {
5116 ignore = true;
313cefa1 5117 fn++;
8c7be95e
LP
5118 }
5119
5120 if (!path_is_absolute(fn)) {
8c7be95e
LP
5121 if (ignore)
5122 continue;
5123
5124 strv_free(r);
5125 return -EINVAL;
5126 }
5127
2bef10ab 5128 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5129 k = safe_glob(fn, 0, &pglob);
5130 if (k < 0) {
2bef10ab
PL
5131 if (ignore)
5132 continue;
8c7be95e 5133
2bef10ab 5134 strv_free(r);
d8c92e8b 5135 return k;
2bef10ab 5136 }
8c7be95e 5137
d8c92e8b
ZJS
5138 /* When we don't match anything, -ENOENT should be returned */
5139 assert(pglob.gl_pathc > 0);
5140
5b10116e 5141 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5142 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5143 if (k < 0) {
5144 if (ignore)
5145 continue;
8c7be95e 5146
2bef10ab 5147 strv_free(r);
2bef10ab 5148 return k;
e9c1ea9d 5149 }
ebc05a09 5150 /* Log invalid environment variables with filename */
039f0e70
LP
5151 if (p) {
5152 InvalidEnvInfo info = {
f2341e0a 5153 .unit = unit,
039f0e70
LP
5154 .path = pglob.gl_pathv[n]
5155 };
5156
5157 p = strv_env_clean_with_callback(p, invalid_env, &info);
5158 }
8c7be95e 5159
234519ae 5160 if (!r)
2bef10ab
PL
5161 r = p;
5162 else {
5163 char **m;
8c7be95e 5164
2bef10ab
PL
5165 m = strv_env_merge(2, r, p);
5166 strv_free(r);
5167 strv_free(p);
c84a9488 5168 if (!m)
2bef10ab 5169 return -ENOMEM;
2bef10ab
PL
5170
5171 r = m;
5172 }
8c7be95e
LP
5173 }
5174 }
5175
5176 *l = r;
5177
5178 return 0;
5179}
5180
6ac8fdc9 5181static bool tty_may_match_dev_console(const char *tty) {
7b912648 5182 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5183
1e22b5cd
LP
5184 if (!tty)
5185 return true;
5186
a119ec7c 5187 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5188
5189 /* trivial identity? */
5190 if (streq(tty, "console"))
5191 return true;
5192
7b912648
LP
5193 if (resolve_dev_console(&resolved) < 0)
5194 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5195
5196 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5197 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5198}
5199
6c0ae739
LP
5200static bool exec_context_may_touch_tty(const ExecContext *ec) {
5201 assert(ec);
1e22b5cd 5202
6c0ae739 5203 return ec->tty_reset ||
1e22b5cd
LP
5204 ec->tty_vhangup ||
5205 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5206 is_terminal_input(ec->std_input) ||
5207 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5208 is_terminal_output(ec->std_error);
5209}
5210
5211bool exec_context_may_touch_console(const ExecContext *ec) {
5212
5213 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5214 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5215}
5216
15ae422b
LP
5217static void strv_fprintf(FILE *f, char **l) {
5218 char **g;
5219
5220 assert(f);
5221
5222 STRV_FOREACH(g, l)
5223 fprintf(f, " %s", *g);
5224}
5225
/* Prints "<prefix><name>:" followed by the space separated strings of strv and a newline to f. Prints
 * nothing at all if strv is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5237
34cf6c43 5238void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
12213aed 5239 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
add00535 5240 int r;
9eba9da4 5241
5cb5a6ff
LP
5242 assert(c);
5243 assert(f);
5244
4ad49000 5245 prefix = strempty(prefix);
5cb5a6ff
LP
5246
5247 fprintf(f,
94f04347
LP
5248 "%sUMask: %04o\n"
5249 "%sWorkingDirectory: %s\n"
451a074f 5250 "%sRootDirectory: %s\n"
15ae422b 5251 "%sNonBlocking: %s\n"
64747e2d 5252 "%sPrivateTmp: %s\n"
7f112f50 5253 "%sPrivateDevices: %s\n"
59eeb84b 5254 "%sProtectKernelTunables: %s\n"
e66a2f65 5255 "%sProtectKernelModules: %s\n"
84703040 5256 "%sProtectKernelLogs: %s\n"
fc64760d 5257 "%sProtectClock: %s\n"
59eeb84b 5258 "%sProtectControlGroups: %s\n"
d251207d
LP
5259 "%sPrivateNetwork: %s\n"
5260 "%sPrivateUsers: %s\n"
1b8689f9
LP
5261 "%sProtectHome: %s\n"
5262 "%sProtectSystem: %s\n"
5d997827 5263 "%sMountAPIVFS: %s\n"
f3e43635 5264 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5265 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5266 "%sRestrictRealtime: %s\n"
f69567cb 5267 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5268 "%sKeyringMode: %s\n"
4e399953
LP
5269 "%sProtectHostname: %s\n"
5270 "%sProtectProc: %s\n"
5271 "%sProcSubset: %s\n",
5cb5a6ff 5272 prefix, c->umask,
14eb3285
LP
5273 prefix, empty_to_root(c->working_directory),
5274 prefix, empty_to_root(c->root_directory),
15ae422b 5275 prefix, yes_no(c->non_blocking),
64747e2d 5276 prefix, yes_no(c->private_tmp),
7f112f50 5277 prefix, yes_no(c->private_devices),
59eeb84b 5278 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5279 prefix, yes_no(c->protect_kernel_modules),
84703040 5280 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5281 prefix, yes_no(c->protect_clock),
59eeb84b 5282 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5283 prefix, yes_no(c->private_network),
5284 prefix, yes_no(c->private_users),
1b8689f9
LP
5285 prefix, protect_home_to_string(c->protect_home),
5286 prefix, protect_system_to_string(c->protect_system),
5e98086d 5287 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5288 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5289 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5290 prefix, yes_no(c->restrict_realtime),
f69567cb 5291 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5292 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5293 prefix, yes_no(c->protect_hostname),
5294 prefix, protect_proc_to_string(c->protect_proc),
5295 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5296
915e6d16
LP
5297 if (c->root_image)
5298 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5299
18d73705
LB
5300 if (c->root_image_options) {
5301 MountOptions *o;
5302
5303 fprintf(f, "%sRootImageOptions:", prefix);
5304 LIST_FOREACH(mount_options, o, c->root_image_options)
5305 if (!isempty(o->options))
9ece6444
LB
5306 fprintf(f, " %s:%s",
5307 partition_designator_to_string(o->partition_designator),
5308 o->options);
18d73705
LB
5309 fprintf(f, "\n");
5310 }
5311
0389f4fa
LB
5312 if (c->root_hash) {
5313 _cleanup_free_ char *encoded = NULL;
5314 encoded = hexmem(c->root_hash, c->root_hash_size);
5315 if (encoded)
5316 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5317 }
5318
5319 if (c->root_hash_path)
5320 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5321
d4d55b0d
LB
5322 if (c->root_hash_sig) {
5323 _cleanup_free_ char *encoded = NULL;
5324 ssize_t len;
5325 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5326 if (len)
5327 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5328 }
5329
5330 if (c->root_hash_sig_path)
5331 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5332
0389f4fa
LB
5333 if (c->root_verity)
5334 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5335
8c7be95e
LP
5336 STRV_FOREACH(e, c->environment)
5337 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5338
5339 STRV_FOREACH(e, c->environment_files)
5340 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5341
b4c14404
FB
5342 STRV_FOREACH(e, c->pass_environment)
5343 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5344
00819cc1
LP
5345 STRV_FOREACH(e, c->unset_environment)
5346 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5347
53f47dfc
YW
5348 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5349
5b10116e 5350 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5351 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5352
5353 STRV_FOREACH(d, c->directories[dt].paths)
5354 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5355 }
c2bbd90b 5356
12213aed
YW
5357 fprintf(f,
5358 "%sTimeoutCleanSec: %s\n",
5359 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5360
fb33a393
LP
5361 if (c->nice_set)
5362 fprintf(f,
5363 "%sNice: %i\n",
5364 prefix, c->nice);
5365
dd6c17b1 5366 if (c->oom_score_adjust_set)
fb33a393 5367 fprintf(f,
dd6c17b1
LP
5368 "%sOOMScoreAdjust: %i\n",
5369 prefix, c->oom_score_adjust);
9eba9da4 5370
ad21e542
ZJS
5371 if (c->coredump_filter_set)
5372 fprintf(f,
5373 "%sCoredumpFilter: 0x%"PRIx64"\n",
5374 prefix, c->coredump_filter);
5375
5b10116e 5376 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5377 if (c->rlimit[i]) {
4c3a2b84 5378 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5379 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5380 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5381 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5382 }
94f04347 5383
f8b69d1d 5384 if (c->ioprio_set) {
1756a011 5385 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5386
837df140
YW
5387 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5388 if (r >= 0)
5389 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5390
5391 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
f8b69d1d 5392 }
94f04347 5393
f8b69d1d 5394 if (c->cpu_sched_set) {
1756a011 5395 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5396
837df140
YW
5397 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5398 if (r >= 0)
5399 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5400
94f04347 5401 fprintf(f,
38b48754
LP
5402 "%sCPUSchedulingPriority: %i\n"
5403 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5404 prefix, c->cpu_sched_priority,
5405 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5406 }
94f04347 5407
0985c7c4 5408 if (c->cpu_set.set) {
e7fca352
MS
5409 _cleanup_free_ char *affinity = NULL;
5410
5411 affinity = cpu_set_to_range_string(&c->cpu_set);
5412 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5413 }
5414
b070c7c0
MS
5415 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5416 _cleanup_free_ char *nodes = NULL;
5417
5418 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5419 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5420 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5421 }
5422
3a43da28 5423 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5424 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5425
5426 fprintf(f,
80876c20
LP
5427 "%sStandardInput: %s\n"
5428 "%sStandardOutput: %s\n"
5429 "%sStandardError: %s\n",
5430 prefix, exec_input_to_string(c->std_input),
5431 prefix, exec_output_to_string(c->std_output),
5432 prefix, exec_output_to_string(c->std_error));
5433
befc4a80
LP
5434 if (c->std_input == EXEC_INPUT_NAMED_FD)
5435 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5436 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5437 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5438 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5439 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5440
5441 if (c->std_input == EXEC_INPUT_FILE)
5442 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5443 if (c->std_output == EXEC_OUTPUT_FILE)
5444 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5445 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5446 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5447 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5448 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5449 if (c->std_error == EXEC_OUTPUT_FILE)
5450 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5451 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5452 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5453 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5454 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5455
80876c20
LP
5456 if (c->tty_path)
5457 fprintf(f,
6ea832a2
LP
5458 "%sTTYPath: %s\n"
5459 "%sTTYReset: %s\n"
5460 "%sTTYVHangup: %s\n"
5461 "%sTTYVTDisallocate: %s\n",
5462 prefix, c->tty_path,
5463 prefix, yes_no(c->tty_reset),
5464 prefix, yes_no(c->tty_vhangup),
5465 prefix, yes_no(c->tty_vt_disallocate));
94f04347 5466
9f6444eb 5467 if (IN_SET(c->std_output,
9f6444eb
LP
5468 EXEC_OUTPUT_KMSG,
5469 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5470 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5471 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5472 IN_SET(c->std_error,
9f6444eb
LP
5473 EXEC_OUTPUT_KMSG,
5474 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5475 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5476 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5477
5ce70e5b 5478 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5479
837df140
YW
5480 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5481 if (r >= 0)
5482 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5483
837df140
YW
5484 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5485 if (r >= 0)
5486 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5487 }
94f04347 5488
d3070fbd
LP
5489 if (c->log_level_max >= 0) {
5490 _cleanup_free_ char *t = NULL;
5491
5492 (void) log_level_to_string_alloc(c->log_level_max, &t);
5493
5494 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5495 }
5496
5ac1530e 5497 if (c->log_ratelimit_interval_usec > 0) {
90fc172e
AZ
5498 char buf_timespan[FORMAT_TIMESPAN_MAX];
5499
5500 fprintf(f,
5501 "%sLogRateLimitIntervalSec: %s\n",
5ac1530e 5502 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e
AZ
5503 }
5504
5ac1530e
ZJS
5505 if (c->log_ratelimit_burst > 0)
5506 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5507
5b10116e
ZJS
5508 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5509 fprintf(f, "%sLogExtraFields: ", prefix);
5510 fwrite(c->log_extra_fields[j].iov_base,
5511 1, c->log_extra_fields[j].iov_len,
5512 f);
5513 fputc('\n', f);
d3070fbd
LP
5514 }
5515
91dd5f7c
LP
5516 if (c->log_namespace)
5517 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5518
07d46372
YW
5519 if (c->secure_bits) {
5520 _cleanup_free_ char *str = NULL;
5521
5522 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5523 if (r >= 0)
5524 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5525 }
94f04347 5526
a103496c 5527 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5528 _cleanup_free_ char *str = NULL;
94f04347 5529
dd1f5bd0
YW
5530 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5531 if (r >= 0)
5532 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5533 }
5534
5535 if (c->capability_ambient_set != 0) {
dd1f5bd0 5536 _cleanup_free_ char *str = NULL;
755d4b67 5537
dd1f5bd0
YW
5538 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5539 if (r >= 0)
5540 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5541 }
5542
5543 if (c->user)
f2d3769a 5544 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5545 if (c->group)
f2d3769a 5546 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5547
29206d46
LP
5548 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5549
ddc155b2 5550 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5551
5b6319dc 5552 if (c->pam_name)
f2d3769a 5553 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5554
ddc155b2
TM
5555 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5556 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5557 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5558 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5559 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
2e22afe9 5560
5b10116e
ZJS
5561 for (size_t i = 0; i < c->n_bind_mounts; i++)
5562 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5563 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5564 c->bind_mounts[i].ignore_enoent ? "-": "",
5565 c->bind_mounts[i].source,
5566 c->bind_mounts[i].destination,
5567 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5568
5b10116e
ZJS
5569 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5570 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5571
5b10116e
ZJS
5572 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5573 t->path,
5574 isempty(t->options) ? "" : ":",
5575 strempty(t->options));
5576 }
2abd4e38 5577
169c1bda
LP
5578 if (c->utmp_id)
5579 fprintf(f,
5580 "%sUtmpIdentifier: %s\n",
5581 prefix, c->utmp_id);
7b52a628
MS
5582
5583 if (c->selinux_context)
5584 fprintf(f,
5f8640fb
LP
5585 "%sSELinuxContext: %s%s\n",
5586 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5587
80c21aea
WC
5588 if (c->apparmor_profile)
5589 fprintf(f,
5590 "%sAppArmorProfile: %s%s\n",
5591 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5592
5593 if (c->smack_process_label)
5594 fprintf(f,
5595 "%sSmackProcessLabel: %s%s\n",
5596 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5597
050f7277 5598 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5599 fprintf(f,
5600 "%sPersonality: %s\n",
5601 prefix, strna(personality_to_string(c->personality)));
5602
78e864e5
TM
5603 fprintf(f,
5604 "%sLockPersonality: %s\n",
5605 prefix, yes_no(c->lock_personality));
5606
17df7223 5607 if (c->syscall_filter) {
349cc4a5 5608#if HAVE_SECCOMP
8cfa775f 5609 void *id, *val;
17df7223 5610 bool first = true;
351a19b1 5611#endif
17df7223
LP
5612
5613 fprintf(f,
57183d11 5614 "%sSystemCallFilter: ",
17df7223
LP
5615 prefix);
5616
6b000af4 5617 if (!c->syscall_allow_list)
17df7223
LP
5618 fputc('~', f);
5619
349cc4a5 5620#if HAVE_SECCOMP
90e74a66 5621 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5622 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5623 const char *errno_name = NULL;
5624 int num = PTR_TO_INT(val);
17df7223
LP
5625
5626 if (first)
5627 first = false;
5628 else
5629 fputc(' ', f);
5630
57183d11 5631 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5632 fputs(strna(name), f);
8cfa775f
YW
5633
5634 if (num >= 0) {
005bfaf1 5635 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5636 if (errno_name)
5637 fprintf(f, ":%s", errno_name);
5638 else
5639 fprintf(f, ":%d", num);
5640 }
17df7223 5641 }
351a19b1 5642#endif
17df7223
LP
5643
5644 fputc('\n', f);
5645 }
5646
57183d11 5647 if (c->syscall_archs) {
349cc4a5 5648#if HAVE_SECCOMP
57183d11
LP
5649 void *id;
5650#endif
5651
5652 fprintf(f,
5653 "%sSystemCallArchitectures:",
5654 prefix);
5655
349cc4a5 5656#if HAVE_SECCOMP
90e74a66 5657 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5658 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5659#endif
5660 fputc('\n', f);
5661 }
5662
add00535
LP
5663 if (exec_context_restrict_namespaces_set(c)) {
5664 _cleanup_free_ char *s = NULL;
5665
86c2a9f1 5666 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5667 if (r >= 0)
5668 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5669 prefix, strna(s));
add00535
LP
5670 }
5671
a8d08f39
LP
5672 if (c->network_namespace_path)
5673 fprintf(f,
5674 "%sNetworkNamespacePath: %s\n",
5675 prefix, c->network_namespace_path);
5676
3df90f24 5677 if (c->syscall_errno > 0) {
005bfaf1 5678#if HAVE_SECCOMP
3df90f24 5679 const char *errno_name;
005bfaf1 5680#endif
3df90f24
YW
5681
5682 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5683
005bfaf1
TM
5684#if HAVE_SECCOMP
5685 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5686 if (errno_name)
005bfaf1 5687 fputs(errno_name, f);
3df90f24 5688 else
005bfaf1
TM
5689 fprintf(f, "%d", c->syscall_errno);
5690#endif
5691 fputc('\n', f);
3df90f24 5692 }
b3d13314 5693
5b10116e 5694 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5695 MountOptions *o;
5696
79e20ceb 5697 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5698 c->mount_images[i].ignore_enoent ? "-": "",
5699 c->mount_images[i].source,
79e20ceb 5700 c->mount_images[i].destination);
427353f6 5701 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5702 fprintf(f, ":%s:%s",
427353f6 5703 partition_designator_to_string(o->partition_designator),
79e20ceb 5704 strempty(o->options));
427353f6
LB
5705 fprintf(f, "\n");
5706 }
93f59701
LB
5707
5708 for (size_t i = 0; i < c->n_extension_images; i++) {
5709 MountOptions *o;
5710
5711 fprintf(f, "%sExtensionImages: %s%s", prefix,
5712 c->extension_images[i].ignore_enoent ? "-": "",
5713 c->extension_images[i].source);
5714 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5715 fprintf(f, ":%s:%s",
5716 partition_designator_to_string(o->partition_designator),
5717 strempty(o->options));
5718 fprintf(f, "\n");
5719 }
5cb5a6ff
LP
5720}
5721
34cf6c43 5722bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
5723 assert(c);
5724
61233823 5725 /* Returns true if the process forked off would run under
a931ad47
LP
5726 * an unchanged UID or as root. */
5727
5728 if (!c->user)
5729 return true;
5730
5731 if (streq(c->user, "root") || streq(c->user, "0"))
5732 return true;
5733
5734 return false;
5735}
5736
34cf6c43 5737int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
5738 int p;
5739
5740 assert(c);
5741
5742 if (c->ioprio_set)
5743 return c->ioprio;
5744
5745 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5746 if (p < 0)
5747 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5748
5749 return p;
5750}
5751
5e98086d
ZJS
5752bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5753 assert(c);
5754
61198784 5755 /* Explicit setting wins */
5e98086d
ZJS
5756 if (c->mount_apivfs_set)
5757 return c->mount_apivfs;
5758
61198784 5759 /* Default to "yes" if root directory or image are specified */
74e12520 5760 if (exec_context_with_rootfs(c))
61198784
ZJS
5761 return true;
5762
5e98086d
ZJS
5763 return false;
5764}
5765
d3070fbd 5766void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
5767 assert(c);
5768
5b10116e 5769 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
5770 free(c->log_extra_fields[l].iov_base);
5771 c->log_extra_fields = mfree(c->log_extra_fields);
5772 c->n_log_extra_fields = 0;
5773}
5774
6f765baf 5775void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
5776 _cleanup_close_ int fd = -1;
5777 const char *path;
5778 struct stat st;
6f765baf
LP
5779 int r;
5780
5781 assert(c);
5782
5783 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5784 exec_context_tty_reset(c, NULL);
5785
5786 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5787 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5788 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
5789 if (!exec_context_may_touch_tty(c))
5790 return;
6f765baf 5791
0ba976e8
LP
5792 path = exec_context_tty_path(c);
5793 if (!path)
5794 return;
6f765baf 5795
0ba976e8
LP
5796 fd = open(path, O_PATH|O_CLOEXEC);
5797 if (fd < 0)
5798 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
5799 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
5800 path);
5801
5802 if (fstat(fd, &st) < 0)
5803 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
5804
5805 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
5806 * if things are a character device, since a proper check either means we'd have to open the TTY and
5807 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
5808 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
5809 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
5810 if (!S_ISCHR(st.st_mode))
5811 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
5812
5813 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
5814 if (r < 0)
5815 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
5816}
5817
4c2f5842
LP
5818int exec_context_get_clean_directories(
5819 ExecContext *c,
5820 char **prefix,
5821 ExecCleanMask mask,
5822 char ***ret) {
5823
5824 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
5825 int r;
5826
5827 assert(c);
5828 assert(prefix);
5829 assert(ret);
5830
5b10116e 5831 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
5832 char **i;
5833
5834 if (!FLAGS_SET(mask, 1U << t))
5835 continue;
5836
5837 if (!prefix[t])
5838 continue;
5839
5840 STRV_FOREACH(i, c->directories[t].paths) {
5841 char *j;
5842
5843 j = path_join(prefix[t], *i);
5844 if (!j)
5845 return -ENOMEM;
5846
5847 r = strv_consume(&l, j);
5848 if (r < 0)
5849 return r;
7f622a19
YW
5850
5851 /* Also remove private directories unconditionally. */
5852 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5853 j = path_join(prefix[t], "private", *i);
5854 if (!j)
5855 return -ENOMEM;
5856
5857 r = strv_consume(&l, j);
5858 if (r < 0)
5859 return r;
5860 }
4c2f5842
LP
5861 }
5862 }
5863
5864 *ret = TAKE_PTR(l);
5865 return 0;
5866}
5867
5868int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5869 ExecCleanMask mask = 0;
5870
5871 assert(c);
5872 assert(ret);
5873
5874 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5875 if (!strv_isempty(c->directories[t].paths))
5876 mask |= 1U << t;
5877
5878 *ret = mask;
5879 return 0;
5880}
5881
b58b4116 5882void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 5883 assert(s);
5cb5a6ff 5884
2ed26ed0
LP
5885 *s = (ExecStatus) {
5886 .pid = pid,
5887 };
5888
b58b4116
LP
5889 dual_timestamp_get(&s->start_timestamp);
5890}
5891
34cf6c43 5892void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
5893 assert(s);
5894
d46b79bb 5895 if (s->pid != pid)
2ed26ed0
LP
5896 *s = (ExecStatus) {
5897 .pid = pid,
5898 };
b58b4116 5899
63983207 5900 dual_timestamp_get(&s->exit_timestamp);
9fb86720 5901
034c6ed7
LP
5902 s->code = code;
5903 s->status = status;
169c1bda 5904
6f765baf
LP
5905 if (context && context->utmp_id)
5906 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
5907}
5908
6a1d4d9f
LP
5909void exec_status_reset(ExecStatus *s) {
5910 assert(s);
5911
5912 *s = (ExecStatus) {};
5913}
5914
34cf6c43 5915void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
5916 char buf[FORMAT_TIMESTAMP_MAX];
5917
5918 assert(s);
5919 assert(f);
5920
9fb86720
LP
5921 if (s->pid <= 0)
5922 return;
5923
4c940960
LP
5924 prefix = strempty(prefix);
5925
9fb86720 5926 fprintf(f,
ccd06097
ZJS
5927 "%sPID: "PID_FMT"\n",
5928 prefix, s->pid);
9fb86720 5929
af9d16e1 5930 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
5931 fprintf(f,
5932 "%sStart Timestamp: %s\n",
63983207 5933 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
9fb86720 5934
af9d16e1 5935 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
5936 fprintf(f,
5937 "%sExit Timestamp: %s\n"
5938 "%sExit Code: %s\n"
5939 "%sExit Status: %i\n",
63983207 5940 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
9fb86720
LP
5941 prefix, sigchld_code_to_string(s->code),
5942 prefix, s->status);
5cb5a6ff 5943}
44d8db9e 5944
34cf6c43 5945static char *exec_command_line(char **argv) {
44d8db9e
LP
5946 size_t k;
5947 char *n, *p, **a;
5948 bool first = true;
5949
9e2f7c11 5950 assert(argv);
44d8db9e 5951
9164977d 5952 k = 1;
9e2f7c11 5953 STRV_FOREACH(a, argv)
44d8db9e
LP
5954 k += strlen(*a)+3;
5955
5cd9cd35
LP
5956 n = new(char, k);
5957 if (!n)
44d8db9e
LP
5958 return NULL;
5959
5960 p = n;
9e2f7c11 5961 STRV_FOREACH(a, argv) {
44d8db9e
LP
5962
5963 if (!first)
5964 *(p++) = ' ';
5965 else
5966 first = false;
5967
5968 if (strpbrk(*a, WHITESPACE)) {
5969 *(p++) = '\'';
5970 p = stpcpy(p, *a);
5971 *(p++) = '\'';
5972 } else
5973 p = stpcpy(p, *a);
5974
5975 }
5976
9164977d
LP
5977 *p = 0;
5978
44d8db9e
LP
5979 /* FIXME: this doesn't really handle arguments that have
5980 * spaces and ticks in them */
5981
5982 return n;
5983}
5984
34cf6c43 5985static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 5986 _cleanup_free_ char *cmd = NULL;
4c940960 5987 const char *prefix2;
44d8db9e
LP
5988
5989 assert(c);
5990 assert(f);
5991
4c940960 5992 prefix = strempty(prefix);
63c372cb 5993 prefix2 = strjoina(prefix, "\t");
44d8db9e 5994
9e2f7c11 5995 cmd = exec_command_line(c->argv);
44d8db9e
LP
5996 fprintf(f,
5997 "%sCommand Line: %s\n",
4bbccb02 5998 prefix, cmd ? cmd : strerror_safe(ENOMEM));
44d8db9e 5999
9fb86720 6000 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6001}
6002
6003void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6004 assert(f);
6005
4c940960 6006 prefix = strempty(prefix);
44d8db9e
LP
6007
6008 LIST_FOREACH(command, c, c)
6009 exec_command_dump(c, f, prefix);
6010}
94f04347 6011
a6a80b4f
LP
6012void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6013 ExecCommand *end;
6014
6015 assert(l);
6016 assert(e);
6017
6018 if (*l) {
35b8ca3a 6019 /* It's kind of important, that we keep the order here */
71fda00f
LP
6020 LIST_FIND_TAIL(command, *l, end);
6021 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6022 } else
6023 *l = e;
6024}
6025
26fd040d
LP
6026int exec_command_set(ExecCommand *c, const char *path, ...) {
6027 va_list ap;
6028 char **l, *p;
6029
6030 assert(c);
6031 assert(path);
6032
6033 va_start(ap, path);
6034 l = strv_new_ap(path, ap);
6035 va_end(ap);
6036
6037 if (!l)
6038 return -ENOMEM;
6039
250a918d
LP
6040 p = strdup(path);
6041 if (!p) {
26fd040d
LP
6042 strv_free(l);
6043 return -ENOMEM;
6044 }
6045
6897dfe8 6046 free_and_replace(c->path, p);
26fd040d 6047
130d3d22 6048 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6049}
6050
86b23b07 6051int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6052 _cleanup_strv_free_ char **l = NULL;
86b23b07 6053 va_list ap;
86b23b07
JS
6054 int r;
6055
6056 assert(c);
6057 assert(path);
6058
6059 va_start(ap, path);
6060 l = strv_new_ap(path, ap);
6061 va_end(ap);
6062
6063 if (!l)
6064 return -ENOMEM;
6065
e287086b 6066 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6067 if (r < 0)
86b23b07 6068 return r;
86b23b07
JS
6069
6070 return 0;
6071}
6072
e8a565cb
YW
6073static void *remove_tmpdir_thread(void *p) {
6074 _cleanup_free_ char *path = p;
86b23b07 6075
e8a565cb
YW
6076 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6077 return NULL;
6078}
6079
6080static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6081 int r;
6082
6083 if (!rt)
6084 return NULL;
6085
6086 if (rt->manager)
6087 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6088
6089 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6090
6091 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6092 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6093
6094 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6095 if (r < 0)
e8a565cb 6096 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6097 else
6098 rt->tmp_dir = NULL;
e8a565cb 6099 }
613b411c 6100
56a13a49 6101 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6102 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6103
6104 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6105 if (r < 0)
e8a565cb 6106 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6107 else
6108 rt->var_tmp_dir = NULL;
e8a565cb
YW
6109 }
6110
6111 rt->id = mfree(rt->id);
6112 rt->tmp_dir = mfree(rt->tmp_dir);
6113 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6114 safe_close_pair(rt->netns_storage_socket);
a70581ff 6115 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6116 return mfree(rt);
6117}
6118
6119static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6120 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6121}
6122
56a13a49
ZJS
6123static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6124 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6125 ExecRuntime *n;
613b411c 6126
8e8009dc 6127 assert(ret);
613b411c 6128
56a13a49
ZJS
6129 id_copy = strdup(id);
6130 if (!id_copy)
6131 return -ENOMEM;
6132
8e8009dc
LP
6133 n = new(ExecRuntime, 1);
6134 if (!n)
613b411c
LP
6135 return -ENOMEM;
6136
8e8009dc 6137 *n = (ExecRuntime) {
56a13a49 6138 .id = TAKE_PTR(id_copy),
8e8009dc 6139 .netns_storage_socket = { -1, -1 },
a70581ff 6140 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6141 };
6142
6143 *ret = n;
613b411c
LP
6144 return 0;
6145}
6146
e8a565cb
YW
6147static int exec_runtime_add(
6148 Manager *m,
6149 const char *id,
56a13a49
ZJS
6150 char **tmp_dir,
6151 char **var_tmp_dir,
6152 int netns_storage_socket[2],
a70581ff 6153 int ipcns_storage_socket[2],
e8a565cb
YW
6154 ExecRuntime **ret) {
6155
6156 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6157 int r;
6158
e8a565cb 6159 assert(m);
613b411c
LP
6160 assert(id);
6161
a70581ff 6162 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6163
56a13a49 6164 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6165 if (r < 0)
6166 return r;
6167
63083706 6168 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6169 if (r < 0)
6170 return r;
e8a565cb 6171
56a13a49
ZJS
6172 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6173 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6174 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6175
6176 if (netns_storage_socket) {
56a13a49
ZJS
6177 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6178 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6179 }
6180
a70581ff
XR
6181 if (ipcns_storage_socket) {
6182 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6183 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6184 }
6185
e8a565cb
YW
6186 rt->manager = m;
6187
6188 if (ret)
6189 *ret = rt;
e8a565cb 6190 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6191 TAKE_PTR(rt);
e8a565cb
YW
6192 return 0;
6193}
6194
74aaf59b
LP
6195static int exec_runtime_make(
6196 Manager *m,
6197 const ExecContext *c,
6198 const char *id,
6199 ExecRuntime **ret) {
6200
56a13a49 6201 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6202 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6203 int r;
6204
6205 assert(m);
6206 assert(c);
6207 assert(id);
6208
6209 /* It is not necessary to create ExecRuntime object. */
a70581ff 6210 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6211 *ret = NULL;
e8a565cb 6212 return 0;
74aaf59b 6213 }
e8a565cb 6214
efa2f3a1
TM
6215 if (c->private_tmp &&
6216 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6217 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6218 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6219 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6220 if (r < 0)
6221 return r;
6222 }
6223
a8d08f39 6224 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6225 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6226 return -errno;
6227 }
6228
a70581ff
XR
6229 if (c->private_ipc || c->ipc_namespace_path) {
6230 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6231 return -errno;
6232 }
6233
6234 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6235 if (r < 0)
6236 return r;
6237
613b411c
LP
6238 return 1;
6239}
6240
e8a565cb
YW
6241int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6242 ExecRuntime *rt;
6243 int r;
613b411c 6244
e8a565cb
YW
6245 assert(m);
6246 assert(id);
6247 assert(ret);
6248
6249 rt = hashmap_get(m->exec_runtime_by_id, id);
6250 if (rt)
6251 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
6252 goto ref;
6253
74aaf59b
LP
6254 if (!create) {
6255 *ret = NULL;
e8a565cb 6256 return 0;
74aaf59b 6257 }
e8a565cb
YW
6258
6259 /* If not found, then create a new object. */
6260 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6261 if (r < 0)
e8a565cb 6262 return r;
74aaf59b
LP
6263 if (r == 0) {
6264 /* When r == 0, it is not necessary to create ExecRuntime object. */
6265 *ret = NULL;
6266 return 0;
6267 }
613b411c 6268
e8a565cb
YW
6269ref:
6270 /* increment reference counter. */
6271 rt->n_ref++;
6272 *ret = rt;
6273 return 1;
6274}
613b411c 6275
e8a565cb
YW
6276ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6277 if (!rt)
613b411c
LP
6278 return NULL;
6279
e8a565cb 6280 assert(rt->n_ref > 0);
613b411c 6281
e8a565cb
YW
6282 rt->n_ref--;
6283 if (rt->n_ref > 0)
f2341e0a
LP
6284 return NULL;
6285
e8a565cb 6286 return exec_runtime_free(rt, destroy);
613b411c
LP
6287}
6288
e8a565cb
YW
6289int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6290 ExecRuntime *rt;
e8a565cb
YW
6291
6292 assert(m);
613b411c
LP
6293 assert(f);
6294 assert(fds);
6295
90e74a66 6296 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6297 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6298
e8a565cb
YW
6299 if (rt->tmp_dir)
6300 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6301
e8a565cb
YW
6302 if (rt->var_tmp_dir)
6303 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6304
e8a565cb
YW
6305 if (rt->netns_storage_socket[0] >= 0) {
6306 int copy;
613b411c 6307
e8a565cb
YW
6308 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6309 if (copy < 0)
6310 return copy;
613b411c 6311
e8a565cb
YW
6312 fprintf(f, " netns-socket-0=%i", copy);
6313 }
613b411c 6314
e8a565cb
YW
6315 if (rt->netns_storage_socket[1] >= 0) {
6316 int copy;
613b411c 6317
e8a565cb
YW
6318 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6319 if (copy < 0)
6320 return copy;
613b411c 6321
e8a565cb
YW
6322 fprintf(f, " netns-socket-1=%i", copy);
6323 }
6324
a70581ff
XR
6325 if (rt->ipcns_storage_socket[0] >= 0) {
6326 int copy;
6327
6328 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6329 if (copy < 0)
6330 return copy;
6331
6332 fprintf(f, " ipcns-socket-0=%i", copy);
6333 }
6334
6335 if (rt->ipcns_storage_socket[1] >= 0) {
6336 int copy;
6337
6338 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6339 if (copy < 0)
6340 return copy;
6341
6342 fprintf(f, " ipcns-socket-1=%i", copy);
6343 }
6344
e8a565cb 6345 fputc('\n', f);
613b411c
LP
6346 }
6347
6348 return 0;
6349}
6350
e8a565cb
YW
6351int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6352 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6353 ExecRuntime *rt;
613b411c
LP
6354 int r;
6355
e8a565cb
YW
6356 /* This is for the migration from old (v237 or earlier) deserialization text.
6357 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6358 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6359 * so or not from the serialized text, then we always creates a new object owned by this. */
6360
6361 assert(u);
613b411c
LP
6362 assert(key);
6363 assert(value);
6364
e8a565cb
YW
6365 /* Manager manages ExecRuntime objects by the unit id.
6366 * So, we omit the serialized text when the unit does not have id (yet?)... */
6367 if (isempty(u->id)) {
6368 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6369 return 0;
6370 }
613b411c 6371
cbc165d1
ZJS
6372 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6373 return log_oom();
e8a565cb
YW
6374
6375 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6376 if (!rt) {
cbc165d1 6377 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6378 return log_oom();
613b411c 6379
e8a565cb
YW
6380 rt = rt_create;
6381 }
6382
6383 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6384 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6385 return -ENOMEM;
613b411c
LP
6386
6387 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6388 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6389 return -ENOMEM;
613b411c
LP
6390
6391 } else if (streq(key, "netns-socket-0")) {
6392 int fd;
6393
e8a565cb 6394 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6395 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6396 return 0;
613b411c 6397 }
e8a565cb
YW
6398
6399 safe_close(rt->netns_storage_socket[0]);
6400 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6401
613b411c
LP
6402 } else if (streq(key, "netns-socket-1")) {
6403 int fd;
6404
e8a565cb 6405 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6406 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6407 return 0;
613b411c 6408 }
e8a565cb
YW
6409
6410 safe_close(rt->netns_storage_socket[1]);
6411 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6412
613b411c
LP
6413 } else
6414 return 0;
6415
e8a565cb
YW
6416 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6417 if (rt_create) {
6418 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6419 if (r < 0) {
3fe91079 6420 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6421 return 0;
6422 }
613b411c 6423
e8a565cb 6424 rt_create->manager = u->manager;
613b411c 6425
e8a565cb 6426 /* Avoid cleanup */
56a13a49 6427 TAKE_PTR(rt_create);
e8a565cb 6428 }
98b47d54 6429
e8a565cb
YW
6430 return 1;
6431}
613b411c 6432
56a13a49
ZJS
6433int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6434 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6435 char *id = NULL;
a70581ff 6436 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6437 const char *p, *v = value;
6438 size_t n;
613b411c 6439
e8a565cb
YW
6440 assert(m);
6441 assert(value);
6442 assert(fds);
98b47d54 6443
e8a565cb
YW
6444 n = strcspn(v, " ");
6445 id = strndupa(v, n);
6446 if (v[n] != ' ')
6447 goto finalize;
6448 p = v + n + 1;
6449
6450 v = startswith(p, "tmp-dir=");
6451 if (v) {
6452 n = strcspn(v, " ");
56a13a49
ZJS
6453 tmp_dir = strndup(v, n);
6454 if (!tmp_dir)
6455 return log_oom();
e8a565cb
YW
6456 if (v[n] != ' ')
6457 goto finalize;
6458 p = v + n + 1;
6459 }
6460
6461 v = startswith(p, "var-tmp-dir=");
6462 if (v) {
6463 n = strcspn(v, " ");
56a13a49
ZJS
6464 var_tmp_dir = strndup(v, n);
6465 if (!var_tmp_dir)
6466 return log_oom();
e8a565cb
YW
6467 if (v[n] != ' ')
6468 goto finalize;
6469 p = v + n + 1;
6470 }
6471
6472 v = startswith(p, "netns-socket-0=");
6473 if (v) {
6474 char *buf;
6475
6476 n = strcspn(v, " ");
6477 buf = strndupa(v, n);
c413bb28 6478
a70581ff 6479 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6480 if (r < 0)
6481 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6482 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6483 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6484 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6485 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6486 if (v[n] != ' ')
6487 goto finalize;
6488 p = v + n + 1;
613b411c
LP
6489 }
6490
e8a565cb
YW
6491 v = startswith(p, "netns-socket-1=");
6492 if (v) {
6493 char *buf;
98b47d54 6494
e8a565cb
YW
6495 n = strcspn(v, " ");
6496 buf = strndupa(v, n);
a70581ff
XR
6497
6498 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6499 if (r < 0)
6500 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6501 if (!fdset_contains(fds, netns_fdpair[1]))
6502 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6503 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6504 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6505 if (v[n] != ' ')
6506 goto finalize;
6507 p = v + n + 1;
6508 }
6509
6510 v = startswith(p, "ipcns-socket-0=");
6511 if (v) {
6512 char *buf;
6513
6514 n = strcspn(v, " ");
6515 buf = strndupa(v, n);
6516
6517 r = safe_atoi(buf, &ipcns_fdpair[0]);
6518 if (r < 0)
6519 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6520 if (!fdset_contains(fds, ipcns_fdpair[0]))
6521 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6522 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6523 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6524 if (v[n] != ' ')
6525 goto finalize;
6526 p = v + n + 1;
6527 }
6528
6529 v = startswith(p, "ipcns-socket-1=");
6530 if (v) {
6531 char *buf;
6532
6533 n = strcspn(v, " ");
6534 buf = strndupa(v, n);
6535
6536 r = safe_atoi(buf, &ipcns_fdpair[1]);
6537 if (r < 0)
6538 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6539 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6540 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6541 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6542 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6543 }
98b47d54 6544
e8a565cb 6545finalize:
a70581ff 6546 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6547 if (r < 0)
56a13a49
ZJS
6548 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6549 return 0;
e8a565cb 6550}
613b411c 6551
e8a565cb
YW
6552void exec_runtime_vacuum(Manager *m) {
6553 ExecRuntime *rt;
e8a565cb
YW
6554
6555 assert(m);
6556
6557 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6558
90e74a66 6559 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6560 if (rt->n_ref > 0)
6561 continue;
6562
6563 (void) exec_runtime_free(rt, false);
6564 }
613b411c
LP
6565}
6566
b9c04eaf
YW
6567void exec_params_clear(ExecParameters *p) {
6568 if (!p)
6569 return;
6570
c3f8a065
LP
6571 p->environment = strv_free(p->environment);
6572 p->fd_names = strv_free(p->fd_names);
6573 p->fds = mfree(p->fds);
6574 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6575}
6576
bb0c0d6f
LP
6577ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6578 if (!sc)
6579 return NULL;
6580
6581 free(sc->id);
6582 free(sc->data);
6583 return mfree(sc);
6584}
6585
6586DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
6587
80876c20
LP
6588static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6589 [EXEC_INPUT_NULL] = "null",
6590 [EXEC_INPUT_TTY] = "tty",
6591 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6592 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6593 [EXEC_INPUT_SOCKET] = "socket",
6594 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6595 [EXEC_INPUT_DATA] = "data",
2038c3f5 6596 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6597};
6598
8a0867d6
LP
6599DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6600
94f04347 6601static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6602 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6603 [EXEC_OUTPUT_NULL] = "null",
80876c20 6604 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6605 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6606 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6607 [EXEC_OUTPUT_JOURNAL] = "journal",
6608 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6609 [EXEC_OUTPUT_SOCKET] = "socket",
6610 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6611 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6612 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6613 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6614};
6615
6616DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6617
6618static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6619 [EXEC_UTMP_INIT] = "init",
6620 [EXEC_UTMP_LOGIN] = "login",
6621 [EXEC_UTMP_USER] = "user",
6622};
6623
6624DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6625
6626static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6627 [EXEC_PRESERVE_NO] = "no",
6628 [EXEC_PRESERVE_YES] = "yes",
6629 [EXEC_PRESERVE_RESTART] = "restart",
6630};
6631
6632DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6633
6b7b2ed9 6634/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6635static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6636 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6637 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6638 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6639 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6640 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6641};
6642
6643DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6644
6b7b2ed9
LP
6645/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6646 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6647 * directories, specifically .timer units with their timestamp touch file. */
6648static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6649 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6650 [EXEC_DIRECTORY_STATE] = "state",
6651 [EXEC_DIRECTORY_CACHE] = "cache",
6652 [EXEC_DIRECTORY_LOGS] = "logs",
6653 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6654};
6655
6656DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6657
6658/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6659 * the service payload in. */
fb2042dd
YW
6660static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6661 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6662 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6663 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6664 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6665 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6666};
6667
6668DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6669
b1edf445
LP
6670static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6671 [EXEC_KEYRING_INHERIT] = "inherit",
6672 [EXEC_KEYRING_PRIVATE] = "private",
6673 [EXEC_KEYRING_SHARED] = "shared",
6674};
6675
6676DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);