]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
tree-wide: use FORMAT_TIMESTAMP()
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b 5#include <poll.h>
d251207d 6#include <sys/eventfd.h>
f5947a5e 7#include <sys/ioctl.h>
f3e43635 8#include <sys/mman.h>
bb0c0d6f 9#include <sys/mount.h>
8dd4c05b 10#include <sys/personality.h>
94f04347 11#include <sys/prctl.h>
d2ffa389 12#include <sys/shm.h>
d2ffa389 13#include <sys/types.h>
8dd4c05b
LP
14#include <sys/un.h>
15#include <unistd.h>
023a4f67 16#include <utmpx.h>
5cb5a6ff 17
349cc4a5 18#if HAVE_PAM
5b6319dc
LP
19#include <security/pam_appl.h>
20#endif
21
349cc4a5 22#if HAVE_SELINUX
7b52a628
MS
23#include <selinux/selinux.h>
24#endif
25
349cc4a5 26#if HAVE_SECCOMP
17df7223
LP
27#include <seccomp.h>
28#endif
29
349cc4a5 30#if HAVE_APPARMOR
eef65bf3
MS
31#include <sys/apparmor.h>
32#endif
33
24882e06 34#include "sd-messages.h"
8dd4c05b 35
bb0c0d6f 36#include "acl-util.h"
8dd4c05b 37#include "af-list.h"
b5efdb8a 38#include "alloc-util.h"
349cc4a5 39#if HAVE_APPARMOR
3ffd4af2
LP
40#include "apparmor-util.h"
41#endif
8dd4c05b
LP
42#include "async.h"
43#include "barrier.h"
8dd4c05b 44#include "cap-list.h"
430f0182 45#include "capability-util.h"
fdb3deca 46#include "cgroup-setup.h"
bb0c0d6f 47#include "chown-recursive.h"
da681e1b 48#include "cpu-set-util.h"
43144be4 49#include "creds-util.h"
6a818c3c 50#include "data-fd-util.h"
f6a6225e 51#include "def.h"
686d13b9 52#include "env-file.h"
4d1a6904 53#include "env-util.h"
17df7223 54#include "errno-list.h"
3ffd4af2 55#include "execute.h"
8dd4c05b 56#include "exit-status.h"
3ffd4af2 57#include "fd-util.h"
bb0c0d6f 58#include "fileio.h"
f97b34a6 59#include "format-util.h"
f4f15635 60#include "fs-util.h"
7d50b32a 61#include "glob-util.h"
0389f4fa 62#include "hexdecoct.h"
c004493c 63#include "io-util.h"
8dd4c05b 64#include "ioprio.h"
a1164ae3 65#include "label.h"
8dd4c05b
LP
66#include "log.h"
67#include "macro.h"
e8a565cb 68#include "manager.h"
2a341bb9 69#include "manager-dump.h"
0a970718 70#include "memory-util.h"
f5947a5e 71#include "missing_fs.h"
8dd4c05b 72#include "mkdir.h"
21935150 73#include "mount-util.h"
bb0c0d6f 74#include "mountpoint-util.h"
8dd4c05b 75#include "namespace.h"
6bedfcbb 76#include "parse-util.h"
8dd4c05b 77#include "path-util.h"
0b452006 78#include "process-util.h"
d3dcf4e3 79#include "random-util.h"
78f22b97 80#include "rlimit-util.h"
8dd4c05b 81#include "rm-rf.h"
349cc4a5 82#if HAVE_SECCOMP
3ffd4af2
LP
83#include "seccomp-util.h"
84#endif
07d46372 85#include "securebits-util.h"
8dd4c05b 86#include "selinux-util.h"
24882e06 87#include "signal-util.h"
8dd4c05b 88#include "smack-util.h"
57b7a260 89#include "socket-util.h"
fd63e712 90#include "special.h"
949befd3 91#include "stat-util.h"
8b43440b 92#include "string-table.h"
07630cea 93#include "string-util.h"
8dd4c05b 94#include "strv.h"
7ccbd1ae 95#include "syslog-util.h"
8dd4c05b 96#include "terminal-util.h"
bb0c0d6f 97#include "tmpfile-util.h"
566b7d23 98#include "umask-util.h"
2d3b784d 99#include "unit-serialize.h"
b1d4f8e1 100#include "user-util.h"
8dd4c05b 101#include "utmp-wtmp.h"
5cb5a6ff 102
/* Idle timeouts, expressed in microseconds. NOTE(review): their users are outside this part of the
 * file — presumably the idle-pipe wait logic; confirm before relying on this. */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested via fd_inc_sndbuf() for the journal stream socket. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
107
/* Relocates the passed file descriptors so that fds[i] ends up as fd number i + 3, i.e. packed
 * right behind stdin/stdout/stderr. The array is updated in place with the new fd numbers.
 *
 * Returns 0 on success, or a negative errno if duplicating an fd fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first = 0;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! (sorts it) */

        assert(fds);

        for (;;) {
                int retry_at = -1;

                for (int idx = first; idx < (int) n_fds; idx++) {
                        int target = idx + 3, copy;

                        /* Nothing to do if this one already sits in its slot */
                        if (fds[idx] == target)
                                continue;

                        /* F_DUPFD hands out the lowest free fd >= target */
                        copy = fcntl(fds[idx], F_DUPFD, target);
                        if (copy < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = copy;

                        /* The slot we wanted was still occupied? Remember the first such index, so
                         * that the next pass starts over from there. */
                        if (copy != target && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        return 0;

                first = retry_at;
        }
}
147
/* Adjusts the file flags of the fds we are about to pass on: the first n_socket_fds entries get
 * O_NONBLOCK set or cleared according to 'nonblock', and FD_CLOEXEC is cleared on all entries
 * (socket and storage fds alike).
 *
 * Returns 0 on success, or the negative error of the first failing helper call. */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total = n_socket_fds + n_storage_fds;
        int r;

        if (total <= 0)
                return 0;

        assert(fds);

        for (size_t k = 0; k < total; k++) {

                /* O_NONBLOCK is only relevant for socket activation fds, which come first */
                if (k < n_socket_fds) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC always has to go, as these fds are meant to reach our children */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
180
1e22b5cd 181static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
182 assert(context);
183
1e22b5cd
LP
184 if (context->stdio_as_fds)
185 return NULL;
186
80876c20
LP
187 if (context->tty_path)
188 return context->tty_path;
189
190 return "/dev/console";
191}
192
1e22b5cd
LP
193static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
194 const char *path;
195
6ea832a2
LP
196 assert(context);
197
1e22b5cd 198 path = exec_context_tty_path(context);
6ea832a2 199
1e22b5cd
LP
200 if (context->tty_vhangup) {
201 if (p && p->stdin_fd >= 0)
202 (void) terminal_vhangup_fd(p->stdin_fd);
203 else if (path)
204 (void) terminal_vhangup(path);
205 }
6ea832a2 206
1e22b5cd
LP
207 if (context->tty_reset) {
208 if (p && p->stdin_fd >= 0)
209 (void) reset_terminal_fd(p->stdin_fd, true);
210 else if (path)
211 (void) reset_terminal(path);
212 }
213
214 if (context->tty_vt_disallocate && path)
215 (void) vt_disallocate(path);
6ea832a2
LP
216}
217
6af760f3
LP
218static bool is_terminal_input(ExecInput i) {
219 return IN_SET(i,
220 EXEC_INPUT_TTY,
221 EXEC_INPUT_TTY_FORCE,
222 EXEC_INPUT_TTY_FAIL);
223}
224
3a1286b6 225static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
226 return IN_SET(o,
227 EXEC_OUTPUT_TTY,
6af760f3
LP
228 EXEC_OUTPUT_KMSG_AND_CONSOLE,
229 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
230}
231
aac8c0c3
LP
232static bool is_kmsg_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_KMSG,
235 EXEC_OUTPUT_KMSG_AND_CONSOLE);
236}
237
6af760f3
LP
238static bool exec_context_needs_term(const ExecContext *c) {
239 assert(c);
240
241 /* Return true if the execution context suggests we should set $TERM to something useful. */
242
243 if (is_terminal_input(c->std_input))
244 return true;
245
246 if (is_terminal_output(c->std_output))
247 return true;
248
249 if (is_terminal_output(c->std_error))
250 return true;
251
252 return !!c->tty_path;
3a1286b6
MS
253}
254
/* Opens /dev/null with the specified flags (plus O_NOCTTY) and moves the resulting fd to number
 * 'nfd' via move_fd(). Returns a negative errno if the open fails, otherwise move_fd()'s result. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
266
91dd5f7c
LP
267static int connect_journal_socket(
268 int fd,
269 const char *log_namespace,
270 uid_t uid,
271 gid_t gid) {
272
f36a9d59
ZJS
273 union sockaddr_union sa;
274 socklen_t sa_len;
524daa8c
ZJS
275 uid_t olduid = UID_INVALID;
276 gid_t oldgid = GID_INVALID;
91dd5f7c 277 const char *j;
524daa8c
ZJS
278 int r;
279
91dd5f7c
LP
280 j = log_namespace ?
281 strjoina("/run/systemd/journal.", log_namespace, "/stdout") :
282 "/run/systemd/journal/stdout";
283 r = sockaddr_un_set_path(&sa.un, j);
284 if (r < 0)
285 return r;
f36a9d59 286 sa_len = r;
91dd5f7c 287
cad93f29 288 if (gid_is_valid(gid)) {
524daa8c
ZJS
289 oldgid = getgid();
290
92a17af9 291 if (setegid(gid) < 0)
524daa8c
ZJS
292 return -errno;
293 }
294
cad93f29 295 if (uid_is_valid(uid)) {
524daa8c
ZJS
296 olduid = getuid();
297
92a17af9 298 if (seteuid(uid) < 0) {
524daa8c
ZJS
299 r = -errno;
300 goto restore_gid;
301 }
302 }
303
f36a9d59 304 r = connect(fd, &sa.sa, sa_len) < 0 ? -errno : 0;
524daa8c
ZJS
305
306 /* If we fail to restore the uid or gid, things will likely
307 fail later on. This should only happen if an LSM interferes. */
308
cad93f29 309 if (uid_is_valid(uid))
524daa8c
ZJS
310 (void) seteuid(olduid);
311
312 restore_gid:
cad93f29 313 if (gid_is_valid(gid))
524daa8c
ZJS
314 (void) setegid(oldgid);
315
316 return r;
317}
318
fd1f9c89 319static int connect_logger_as(
34cf6c43 320 const Unit *unit,
fd1f9c89 321 const ExecContext *context,
af635cf3 322 const ExecParameters *params,
fd1f9c89
LP
323 ExecOutput output,
324 const char *ident,
fd1f9c89
LP
325 int nfd,
326 uid_t uid,
327 gid_t gid) {
328
2ac1ff68
EV
329 _cleanup_close_ int fd = -1;
330 int r;
071830ff
LP
331
332 assert(context);
af635cf3 333 assert(params);
80876c20
LP
334 assert(output < _EXEC_OUTPUT_MAX);
335 assert(ident);
336 assert(nfd >= 0);
071830ff 337
54fe0cdb
LP
338 fd = socket(AF_UNIX, SOCK_STREAM, 0);
339 if (fd < 0)
80876c20 340 return -errno;
071830ff 341
91dd5f7c 342 r = connect_journal_socket(fd, context->log_namespace, uid, gid);
524daa8c
ZJS
343 if (r < 0)
344 return r;
071830ff 345
2ac1ff68 346 if (shutdown(fd, SHUT_RD) < 0)
80876c20 347 return -errno;
071830ff 348
fd1f9c89 349 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 350
2ac1ff68 351 if (dprintf(fd,
62bca2c6 352 "%s\n"
80876c20
LP
353 "%s\n"
354 "%i\n"
54fe0cdb
LP
355 "%i\n"
356 "%i\n"
357 "%i\n"
4f4a1dbf 358 "%i\n",
c867611e 359 context->syslog_identifier ?: ident,
af635cf3 360 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
361 context->syslog_priority,
362 !!context->syslog_level_prefix,
f3dc6af2 363 false,
aac8c0c3 364 is_kmsg_output(output),
2ac1ff68
EV
365 is_terminal_output(output)) < 0)
366 return -errno;
80876c20 367
2ac1ff68 368 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 369}
2ac1ff68 370
/* Opens the terminal at 'path' with the specified flags (plus O_NOCTTY) and moves the resulting
 * fd to number 'nfd'. Returns open_terminal()'s error on failure, otherwise move_fd()'s result. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 383
2038c3f5 384static int acquire_path(const char *path, int flags, mode_t mode) {
86fca584
ZJS
385 union sockaddr_union sa;
386 socklen_t sa_len;
15a3e96f 387 _cleanup_close_ int fd = -1;
86fca584 388 int r;
071830ff 389
80876c20 390 assert(path);
071830ff 391
2038c3f5
LP
392 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
393 flags |= O_CREAT;
394
395 fd = open(path, flags|O_NOCTTY, mode);
396 if (fd >= 0)
15a3e96f 397 return TAKE_FD(fd);
071830ff 398
2038c3f5
LP
399 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
400 return -errno;
2038c3f5
LP
401
402 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
403
86fca584
ZJS
404 r = sockaddr_un_set_path(&sa.un, path);
405 if (r < 0)
406 return r == -EINVAL ? -ENXIO : r;
407 sa_len = r;
408
2038c3f5
LP
409 fd = socket(AF_UNIX, SOCK_STREAM, 0);
410 if (fd < 0)
411 return -errno;
412
86fca584 413 if (connect(fd, &sa.sa, sa_len) < 0)
2038c3f5 414 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
e8607daf 415 * indication that this wasn't an AF_UNIX socket after all */
071830ff 416
2038c3f5
LP
417 if ((flags & O_ACCMODE) == O_RDONLY)
418 r = shutdown(fd, SHUT_WR);
419 else if ((flags & O_ACCMODE) == O_WRONLY)
420 r = shutdown(fd, SHUT_RD);
421 else
86fca584 422 r = 0;
15a3e96f 423 if (r < 0)
2038c3f5 424 return -errno;
2038c3f5 425
15a3e96f 426 return TAKE_FD(fd);
80876c20 427}
071830ff 428
08f3be7a
LP
429static int fixup_input(
430 const ExecContext *context,
431 int socket_fd,
432 bool apply_tty_stdin) {
433
434 ExecInput std_input;
435
436 assert(context);
437
438 std_input = context->std_input;
1e3ad081
LP
439
440 if (is_terminal_input(std_input) && !apply_tty_stdin)
441 return EXEC_INPUT_NULL;
071830ff 442
03fd9c49 443 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
444 return EXEC_INPUT_NULL;
445
08f3be7a
LP
446 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
447 return EXEC_INPUT_NULL;
448
03fd9c49 449 return std_input;
4f2d528d
LP
450}
451
7966a916 452static int fixup_output(ExecOutput output, int socket_fd) {
4f2d528d 453
7966a916 454 if (output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
455 return EXEC_OUTPUT_INHERIT;
456
7966a916 457 return output;
4f2d528d
LP
458}
459
a34ceba6
LP
460static int setup_input(
461 const ExecContext *context,
462 const ExecParameters *params,
52c239d7 463 int socket_fd,
2caa38e9 464 const int named_iofds[static 3]) {
a34ceba6 465
4f2d528d
LP
466 ExecInput i;
467
468 assert(context);
a34ceba6 469 assert(params);
2caa38e9 470 assert(named_iofds);
a34ceba6
LP
471
472 if (params->stdin_fd >= 0) {
473 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
474 return -errno;
475
476 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
477 if (isatty(STDIN_FILENO)) {
478 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
479 (void) reset_terminal_fd(STDIN_FILENO, true);
480 }
a34ceba6
LP
481
482 return STDIN_FILENO;
483 }
4f2d528d 484
08f3be7a 485 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
486
487 switch (i) {
071830ff 488
80876c20
LP
489 case EXEC_INPUT_NULL:
490 return open_null_as(O_RDONLY, STDIN_FILENO);
491
492 case EXEC_INPUT_TTY:
493 case EXEC_INPUT_TTY_FORCE:
494 case EXEC_INPUT_TTY_FAIL: {
046a82c1 495 int fd;
071830ff 496
1e22b5cd 497 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
498 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
499 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
500 ACQUIRE_TERMINAL_WAIT,
3a43da28 501 USEC_INFINITY);
970edce6 502 if (fd < 0)
80876c20
LP
503 return fd;
504
046a82c1 505 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
506 }
507
4f2d528d 508 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
509 assert(socket_fd >= 0);
510
4f2d528d
LP
511 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
512
52c239d7 513 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
514 assert(named_iofds[STDIN_FILENO] >= 0);
515
52c239d7
LB
516 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
517 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
518
08f3be7a
LP
519 case EXEC_INPUT_DATA: {
520 int fd;
521
522 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
523 if (fd < 0)
524 return fd;
525
526 return move_fd(fd, STDIN_FILENO, false);
527 }
528
2038c3f5
LP
529 case EXEC_INPUT_FILE: {
530 bool rw;
531 int fd;
532
533 assert(context->stdio_file[STDIN_FILENO]);
534
535 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
536 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
537
538 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
539 if (fd < 0)
540 return fd;
541
542 return move_fd(fd, STDIN_FILENO, false);
543 }
544
80876c20
LP
545 default:
546 assert_not_reached("Unknown input type");
547 }
548}
549
41fc585a
LP
550static bool can_inherit_stderr_from_stdout(
551 const ExecContext *context,
552 ExecOutput o,
553 ExecOutput e) {
554
555 assert(context);
556
557 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
558 * stderr fd */
559
560 if (e == EXEC_OUTPUT_INHERIT)
561 return true;
562 if (e != o)
563 return false;
564
565 if (e == EXEC_OUTPUT_NAMED_FD)
566 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
567
8d7dab1f 568 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND, EXEC_OUTPUT_FILE_TRUNCATE))
41fc585a
LP
569 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
570
571 return true;
572}
573
a34ceba6 574static int setup_output(
34cf6c43 575 const Unit *unit,
a34ceba6
LP
576 const ExecContext *context,
577 const ExecParameters *params,
578 int fileno,
579 int socket_fd,
2caa38e9 580 const int named_iofds[static 3],
a34ceba6 581 const char *ident,
7bce046b
LP
582 uid_t uid,
583 gid_t gid,
584 dev_t *journal_stream_dev,
585 ino_t *journal_stream_ino) {
a34ceba6 586
4f2d528d
LP
587 ExecOutput o;
588 ExecInput i;
47c1d80d 589 int r;
4f2d528d 590
f2341e0a 591 assert(unit);
80876c20 592 assert(context);
a34ceba6 593 assert(params);
80876c20 594 assert(ident);
7bce046b
LP
595 assert(journal_stream_dev);
596 assert(journal_stream_ino);
80876c20 597
a34ceba6
LP
598 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
599
600 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
601 return -errno;
602
603 return STDOUT_FILENO;
604 }
605
606 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
607 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
608 return -errno;
609
610 return STDERR_FILENO;
611 }
612
08f3be7a 613 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 614 o = fixup_output(context->std_output, socket_fd);
4f2d528d 615
eb17e935
MS
616 if (fileno == STDERR_FILENO) {
617 ExecOutput e;
618 e = fixup_output(context->std_error, socket_fd);
80876c20 619
eb17e935
MS
620 /* This expects the input and output are already set up */
621
622 /* Don't change the stderr file descriptor if we inherit all
623 * the way and are not on a tty */
624 if (e == EXEC_OUTPUT_INHERIT &&
625 o == EXEC_OUTPUT_INHERIT &&
626 i == EXEC_INPUT_NULL &&
627 !is_terminal_input(context->std_input) &&
7966a916 628 getppid() != 1)
eb17e935
MS
629 return fileno;
630
631 /* Duplicate from stdout if possible */
41fc585a 632 if (can_inherit_stderr_from_stdout(context, o, e))
eb17e935 633 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 634
eb17e935 635 o = e;
80876c20 636
eb17e935 637 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
638 /* If input got downgraded, inherit the original value */
639 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 640 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 641
08f3be7a
LP
642 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
643 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 644 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 645
acb591e4
LP
646 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
647 if (getppid() != 1)
eb17e935 648 return fileno;
94f04347 649
eb17e935
MS
650 /* We need to open /dev/null here anew, to get the right access mode. */
651 return open_null_as(O_WRONLY, fileno);
071830ff 652 }
94f04347 653
eb17e935 654 switch (o) {
80876c20
LP
655
656 case EXEC_OUTPUT_NULL:
eb17e935 657 return open_null_as(O_WRONLY, fileno);
80876c20
LP
658
659 case EXEC_OUTPUT_TTY:
4f2d528d 660 if (is_terminal_input(i))
eb17e935 661 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
662
663 /* We don't reset the terminal if this is just about output */
1e22b5cd 664 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20 665
9a6bca7a 666 case EXEC_OUTPUT_KMSG:
28dbc1e8 667 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
668 case EXEC_OUTPUT_JOURNAL:
669 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 670 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 671 if (r < 0) {
7966a916
ZJS
672 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m",
673 fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 674 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
675 } else {
676 struct stat st;
677
678 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
679 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
680 * services to detect whether they are connected to the journal or not.
681 *
682 * If both stdout and stderr are connected to a stream then let's make sure to store the data
683 * about STDERR as that's usually the best way to do logging. */
7bce046b 684
ab2116b1
LP
685 if (fstat(fileno, &st) >= 0 &&
686 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
687 *journal_stream_dev = st.st_dev;
688 *journal_stream_ino = st.st_ino;
689 }
47c1d80d
MS
690 }
691 return r;
4f2d528d
LP
692
693 case EXEC_OUTPUT_SOCKET:
694 assert(socket_fd >= 0);
e75a9ed1 695
eb17e935 696 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 697
52c239d7 698 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
699 assert(named_iofds[fileno] >= 0);
700
52c239d7
LB
701 (void) fd_nonblock(named_iofds[fileno], false);
702 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
703
566b7d23 704 case EXEC_OUTPUT_FILE:
8d7dab1f
LW
705 case EXEC_OUTPUT_FILE_APPEND:
706 case EXEC_OUTPUT_FILE_TRUNCATE: {
2038c3f5 707 bool rw;
566b7d23 708 int fd, flags;
2038c3f5
LP
709
710 assert(context->stdio_file[fileno]);
711
712 rw = context->std_input == EXEC_INPUT_FILE &&
713 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
714
715 if (rw)
716 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
717
566b7d23
ZD
718 flags = O_WRONLY;
719 if (o == EXEC_OUTPUT_FILE_APPEND)
720 flags |= O_APPEND;
8d7dab1f
LW
721 else if (o == EXEC_OUTPUT_FILE_TRUNCATE)
722 flags |= O_TRUNC;
566b7d23
ZD
723
724 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
725 if (fd < 0)
726 return fd;
727
566b7d23 728 return move_fd(fd, fileno, 0);
2038c3f5
LP
729 }
730
94f04347 731 default:
80876c20 732 assert_not_reached("Unknown error type");
94f04347 733 }
071830ff
LP
734}
735
02a51aba 736static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 737 int r;
02a51aba
LP
738
739 assert(fd >= 0);
02a51aba 740
1ff74fb6 741 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
742 if (isatty(fd) < 1) {
743 if (IN_SET(errno, EINVAL, ENOTTY))
744 return 0; /* not a tty */
1ff74fb6 745
02a51aba 746 return -errno;
4b3b5bc7 747 }
02a51aba 748
4b3b5bc7 749 /* This might fail. What matters are the results. */
f2df231f 750 r = fchmod_and_chown(fd, TTY_MODE, uid, GID_INVALID);
4b3b5bc7
LP
751 if (r < 0)
752 return r;
02a51aba 753
4b3b5bc7 754 return 1;
02a51aba
LP
755}
756
7d5ceb64 757static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
758 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
759 int r;
80876c20 760
80876c20
LP
761 assert(_saved_stdin);
762 assert(_saved_stdout);
763
af6da548
LP
764 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
765 if (saved_stdin < 0)
766 return -errno;
80876c20 767
af6da548 768 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
769 if (saved_stdout < 0)
770 return -errno;
80876c20 771
8854d795 772 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
773 if (fd < 0)
774 return fd;
80876c20 775
af6da548
LP
776 r = chown_terminal(fd, getuid());
777 if (r < 0)
3d18b167 778 return r;
02a51aba 779
3d18b167
LP
780 r = reset_terminal_fd(fd, true);
781 if (r < 0)
782 return r;
80876c20 783
2b33ab09 784 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 785 fd = -1;
2b33ab09
LP
786 if (r < 0)
787 return r;
80876c20
LP
788
789 *_saved_stdin = saved_stdin;
790 *_saved_stdout = saved_stdout;
791
3d18b167 792 saved_stdin = saved_stdout = -1;
80876c20 793
3d18b167 794 return 0;
80876c20
LP
795}
796
63d77c92 797static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
798 assert(err < 0);
799
800 if (err == -ETIMEDOUT)
63d77c92 801 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
802 else {
803 errno = -err;
63d77c92 804 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
805 }
806}
807
63d77c92 808static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 809 _cleanup_close_ int fd = -1;
80876c20 810
3b20f877 811 assert(vc);
80876c20 812
7d5ceb64 813 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 814 if (fd < 0)
3b20f877 815 return;
80876c20 816
63d77c92 817 write_confirm_error_fd(err, fd, u);
af6da548 818}
80876c20 819
/* Undoes setup_confirm_stdio(): releases the terminal, reinstalls the saved stdin/stdout (where
 * valid) and closes the saved fds, resetting the caller's variables to whatever safe_close()
 * returns.
 *
 * Returns 0 on success, or the negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
841
3b20f877
FB
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
847
eedf223a 848static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 849 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 850 _cleanup_free_ char *e = NULL;
3b20f877 851 char c;
af6da548 852
3b20f877 853 /* For any internal errors, assume a positive response. */
7d5ceb64 854 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 855 if (r < 0) {
63d77c92 856 write_confirm_error(r, vc, u);
3b20f877
FB
857 return CONFIRM_EXECUTE;
858 }
af6da548 859
b0eb2944
FB
860 /* confirm_spawn might have been disabled while we were sleeping. */
861 if (manager_is_confirm_spawn_disabled(u->manager)) {
862 r = 1;
863 goto restore_stdio;
864 }
af6da548 865
2bcd3c26
FB
866 e = ellipsize(cmdline, 60, 100);
867 if (!e) {
868 log_oom();
869 r = CONFIRM_EXECUTE;
870 goto restore_stdio;
871 }
af6da548 872
d172b175 873 for (;;) {
539622bd 874 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 875 if (r < 0) {
63d77c92 876 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
877 r = CONFIRM_EXECUTE;
878 goto restore_stdio;
879 }
af6da548 880
d172b175 881 switch (c) {
b0eb2944
FB
882 case 'c':
883 printf("Resuming normal execution.\n");
884 manager_disable_confirm_spawn();
885 r = 1;
886 break;
dd6f9ac0
FB
887 case 'D':
888 unit_dump(u, stdout, " ");
889 continue; /* ask again */
d172b175
FB
890 case 'f':
891 printf("Failing execution.\n");
892 r = CONFIRM_PRETEND_FAILURE;
893 break;
894 case 'h':
b0eb2944
FB
895 printf(" c - continue, proceed without asking anymore\n"
896 " D - dump, show the state of the unit\n"
dd6f9ac0 897 " f - fail, don't execute the command and pretend it failed\n"
d172b175 898 " h - help\n"
eedf223a 899 " i - info, show a short summary of the unit\n"
56fde33a 900 " j - jobs, show jobs that are in progress\n"
d172b175
FB
901 " s - skip, don't execute the command and pretend it succeeded\n"
902 " y - yes, execute the command\n");
dd6f9ac0 903 continue; /* ask again */
eedf223a
FB
904 case 'i':
905 printf(" Description: %s\n"
906 " Unit: %s\n"
907 " Command: %s\n",
908 u->id, u->description, cmdline);
909 continue; /* ask again */
56fde33a
FB
910 case 'j':
911 manager_dump_jobs(u->manager, stdout, " ");
912 continue; /* ask again */
539622bd
FB
913 case 'n':
914 /* 'n' was removed in favor of 'f'. */
915 printf("Didn't understand 'n', did you mean 'f'?\n");
916 continue; /* ask again */
d172b175
FB
917 case 's':
918 printf("Skipping execution.\n");
919 r = CONFIRM_PRETEND_SUCCESS;
920 break;
921 case 'y':
922 r = CONFIRM_EXECUTE;
923 break;
924 default:
925 assert_not_reached("Unhandled choice");
926 }
3b20f877 927 break;
3b20f877 928 }
af6da548 929
3b20f877 930restore_stdio:
af6da548 931 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 932 return r;
80876c20
LP
933}
934
4d885bd3
DH
935static int get_fixed_user(const ExecContext *c, const char **user,
936 uid_t *uid, gid_t *gid,
937 const char **home, const char **shell) {
81a2b7ce 938 int r;
4d885bd3 939 const char *name;
81a2b7ce 940
4d885bd3 941 assert(c);
81a2b7ce 942
23deef88
LP
943 if (!c->user)
944 return 0;
945
4d885bd3
DH
946 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
947 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 948
23deef88 949 name = c->user;
fafff8f1 950 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
951 if (r < 0)
952 return r;
81a2b7ce 953
4d885bd3
DH
954 *user = name;
955 return 0;
956}
957
958static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
959 int r;
960 const char *name;
961
962 assert(c);
963
964 if (!c->group)
965 return 0;
966
967 name = c->group;
fafff8f1 968 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
969 if (r < 0)
970 return r;
971
972 *group = name;
973 return 0;
974}
975
cdc5d5c5
DH
976static int get_supplementary_groups(const ExecContext *c, const char *user,
977 const char *group, gid_t gid,
978 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
979 char **i;
980 int r, k = 0;
981 int ngroups_max;
982 bool keep_groups = false;
983 gid_t *groups = NULL;
984 _cleanup_free_ gid_t *l_gids = NULL;
985
986 assert(c);
987
bbeea271
DH
988 /*
989 * If user is given, then lookup GID and supplementary groups list.
990 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
991 * here and as early as possible so we keep the list of supplementary
992 * groups of the caller.
bbeea271
DH
993 */
994 if (user && gid_is_valid(gid) && gid != 0) {
995 /* First step, initialize groups from /etc/groups */
996 if (initgroups(user, gid) < 0)
997 return -errno;
998
999 keep_groups = true;
1000 }
1001
ac6e8be6 1002 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
1003 return 0;
1004
366ddd25
DH
1005 /*
1006 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
1007 * be positive, otherwise fail.
1008 */
1009 errno = 0;
1010 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1011 if (ngroups_max <= 0)
1012 return errno_or_else(EOPNOTSUPP);
366ddd25 1013
4d885bd3
DH
1014 l_gids = new(gid_t, ngroups_max);
1015 if (!l_gids)
1016 return -ENOMEM;
81a2b7ce 1017
4d885bd3
DH
1018 if (keep_groups) {
1019 /*
1020 * Lookup the list of groups that the user belongs to, we
1021 * avoid NSS lookups here too for gid=0.
1022 */
1023 k = ngroups_max;
1024 if (getgrouplist(user, gid, l_gids, &k) < 0)
1025 return -EINVAL;
1026 } else
1027 k = 0;
81a2b7ce 1028
4d885bd3
DH
1029 STRV_FOREACH(i, c->supplementary_groups) {
1030 const char *g;
81a2b7ce 1031
4d885bd3
DH
1032 if (k >= ngroups_max)
1033 return -E2BIG;
81a2b7ce 1034
4d885bd3 1035 g = *i;
fafff8f1 1036 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1037 if (r < 0)
1038 return r;
81a2b7ce 1039
4d885bd3
DH
1040 k++;
1041 }
81a2b7ce 1042
4d885bd3
DH
1043 /*
1044 * Sets ngids to zero to drop all supplementary groups, happens
1045 * when we are under root and SupplementaryGroups= is empty.
1046 */
1047 if (k == 0) {
1048 *ngids = 0;
1049 return 0;
1050 }
81a2b7ce 1051
4d885bd3
DH
1052 /* Otherwise get the final list of supplementary groups */
1053 groups = memdup(l_gids, sizeof(gid_t) * k);
1054 if (!groups)
1055 return -ENOMEM;
1056
1057 *supplementary_gids = groups;
1058 *ngids = k;
1059
1060 groups = NULL;
1061
1062 return 0;
1063}
1064
/* Applies the group identity of the calling process: first the supplementary group list (only if one
 * was passed in), then the real, effective and saved GID (only if 'gid' is valid).
 *
 * Returns 0 on success, or a negative errno-style value if either step fails. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* Install the supplementary groups only when a non-empty list was handed to us */
        if (ngids > 0) {
                int q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* No primary group requested? Then there is nothing left to do */
        if (!gid_is_valid(gid))
                return 0;

        /* Switch real, effective and saved GID in one go */
        return setresgid(gid, gid, gid) < 0 ? -errno : 0;
}
1083
dbdc4098
TK
/* Updates the securebits of the calling process: every bit listed in 'mask' is cleared, then every
 * bit listed in 'bits' is set.
 *
 * Returns 0 if the resulting value equals the current one (nothing is written), 1 if a new value
 * was installed, and a negative errno-style value if reading or writing the securebits failed. */
static int set_securebits(int bits, int mask) {
        int before, after;

        before = prctl(PR_GET_SECUREBITS);
        if (before < 0)
                return -errno;

        /* Drop everything covered by the mask, then add the requested bits */
        after = (before & ~mask) | bits;

        if (after != before) {
                if (prctl(PR_SET_SECUREBITS, after) < 0)
                        return -errno;

                return 1;
        }

        return 0;
}
1097
81a2b7ce 1098static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce 1099 assert(context);
dbdc4098 1100 int r;
81a2b7ce 1101
4d885bd3
DH
1102 if (!uid_is_valid(uid))
1103 return 0;
1104
479050b3 1105 /* Sets (but doesn't look up) the uid and make sure we keep the
dbdc4098
TK
1106 * capabilities while doing so. For setting secure bits the capability CAP_SETPCAP is
1107 * required, so we also need keep-caps in this case.
1108 */
81a2b7ce 1109
dbdc4098 1110 if (context->capability_ambient_set != 0 || context->secure_bits != 0) {
81a2b7ce
LP
1111
1112 /* First step: If we need to keep capabilities but
1113 * drop privileges we need to make sure we keep our
cbb21cca 1114 * caps, while we drop privileges. */
693ced48 1115 if (uid != 0) {
dbdc4098
TK
1116 /* Add KEEP_CAPS to the securebits */
1117 r = set_securebits(1<<SECURE_KEEP_CAPS, 0);
1118 if (r < 0)
1119 return r;
693ced48 1120 }
81a2b7ce
LP
1121 }
1122
479050b3 1123 /* Second step: actually set the uids */
81a2b7ce
LP
1124 if (setresuid(uid, uid, uid) < 0)
1125 return -errno;
1126
1127 /* At this point we should have all necessary capabilities but
1128 are otherwise a normal user. However, the caps might got
1129 corrupted due to the setresuid() so we need clean them up
1130 later. This is done outside of this call. */
1131
1132 return 0;
1133}
1134
349cc4a5 1135#if HAVE_PAM
5b6319dc
LP
1136
1137static int null_conv(
1138 int num_msg,
1139 const struct pam_message **msg,
1140 struct pam_response **resp,
1141 void *appdata_ptr) {
1142
1143 /* We don't support conversations */
1144
1145 return PAM_CONV_ERR;
1146}
1147
cefc33ae
LP
1148#endif
1149
5b6319dc
LP
/* Opens a PAM session for the process about to be executed and forks off a helper process
 * ("(sd-pam)") that closes the session again later.
 *
 *   name       PAM service name, passed to pam_start()
 *   user       user name, passed to pam_start()
 *   uid, gid   identity the helper process switches to via setresgid()/setresuid()
 *   tty        TTY to register with PAM; if NULL the TTY name of stdin is tried instead
 *   env        in: variables pushed into PAM via pam_putenv(); out: replaced by the list returned
 *              by pam_getenvlist() on success
 *   fds/n_fds  file descriptors that are closed in the helper process
 *
 * On success the result of strv_free_and_replace() is returned. On failure a negative errno-style
 * value is returned; failures reported by PAM itself are mapped to -EPERM. If built without PAM
 * support this is a NOP returning 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                const int fds[], size_t n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless we are logging at debug level */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Make the environment we were given visible to the PAM modules */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* Failure to establish credentials is deliberately not fatal */
        pam_code = pam_setcred(handle, PAM_ESTABLISH_CRED | flags);
        if (pam_code != PAM_SUCCESS)
                log_debug("pam_setcred() failed, ignoring: %s", pam_strerror(handle, pam_code));

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child exists that would wait for SIGTERM, hence undo the blocking above before
                 * bailing out, so that we return with the signal mask we were called with. */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only
                 * those fds are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, restarting the wait if it gets interrupted */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                pam_code = pam_setcred(handle, PAM_DELETE_CRED | flags);
                if (pam_code != PAM_SUCCESS)
                        goto child_finish;

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment as seen by PAM back to the caller */
        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM;  /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1373
5d6b1584
LP
/* Renames the current process to "(<name>)", where <name> is taken from the last component of
 * 'path'. If that component is empty, "(...)" is used instead. */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *base, *tail;
        size_t len, n;

        /* The result has to fit into 10 characters (i.e. the length of "/sbin/init") so that it
         * looks pretty in /bin/ps: two parentheses plus at most eight characters of the name. */

        base = basename(path);
        if (isempty(base)) {
                rename_process("(...)");
                return;
        }

        /* If the name is too long keep its end: that is usually the more interesting part, since
         * the beginning might just be "systemd-". */
        len = strlen(base);
        n = len > 8 ? 8 : len;
        tail = base + (len - n);

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = '\0';

        rename_process(buf);
}
1404
469830d1
LP
1405static bool context_has_address_families(const ExecContext *c) {
1406 assert(c);
1407
6b000af4 1408 return c->address_families_allow_list ||
469830d1
LP
1409 !set_isempty(c->address_families);
1410}
1411
1412static bool context_has_syscall_filters(const ExecContext *c) {
1413 assert(c);
1414
6b000af4 1415 return c->syscall_allow_list ||
8cfa775f 1416 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1417}
1418
9df2cdd8
TM
1419static bool context_has_syscall_logs(const ExecContext *c) {
1420 assert(c);
1421
1422 return c->syscall_log_allow_list ||
1423 !hashmap_isempty(c->syscall_log);
1424}
1425
469830d1
LP
1426static bool context_has_no_new_privileges(const ExecContext *c) {
1427 assert(c);
1428
1429 if (c->no_new_privileges)
1430 return true;
1431
1432 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1433 return false;
1434
1435 /* We need NNP if we have any form of seccomp and are unprivileged */
0538d2a8 1436 return c->lock_personality ||
469830d1 1437 c->memory_deny_write_execute ||
0538d2a8 1438 c->private_devices ||
fc64760d 1439 c->protect_clock ||
0538d2a8 1440 c->protect_hostname ||
469830d1
LP
1441 c->protect_kernel_tunables ||
1442 c->protect_kernel_modules ||
84703040 1443 c->protect_kernel_logs ||
0538d2a8
YW
1444 context_has_address_families(c) ||
1445 exec_context_restrict_namespaces_set(c) ||
1446 c->restrict_realtime ||
1447 c->restrict_suid_sgid ||
78e864e5 1448 !set_isempty(c->syscall_archs) ||
0538d2a8
YW
1449 context_has_syscall_filters(c) ||
1450 context_has_syscall_logs(c);
469830d1
LP
1451}
1452
bb0c0d6f
LP
1453static bool exec_context_has_credentials(const ExecContext *context) {
1454
1455 assert(context);
1456
1457 return !hashmap_isempty(context->set_credentials) ||
43144be4 1458 !hashmap_isempty(context->load_credentials);
bb0c0d6f
LP
1459}
1460
349cc4a5 1461#if HAVE_SECCOMP
17df7223 1462
83f12b27 1463static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1464
1465 if (is_seccomp_available())
1466 return false;
1467
f673b62d 1468 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1469 return true;
83f12b27
FS
1470}
1471
165a31c0 1472static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1473 uint32_t negative_action, default_action, action;
165a31c0 1474 int r;
8351ceae 1475
469830d1 1476 assert(u);
c0467cf3 1477 assert(c);
8351ceae 1478
469830d1 1479 if (!context_has_syscall_filters(c))
83f12b27
FS
1480 return 0;
1481
469830d1
LP
1482 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1483 return 0;
e9642be2 1484
005bfaf1 1485 negative_action = c->syscall_errno == SECCOMP_ERROR_NUMBER_KILL ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1486
6b000af4 1487 if (c->syscall_allow_list) {
469830d1
LP
1488 default_action = negative_action;
1489 action = SCMP_ACT_ALLOW;
7c66bae2 1490 } else {
469830d1
LP
1491 default_action = SCMP_ACT_ALLOW;
1492 action = negative_action;
57183d11 1493 }
8351ceae 1494
165a31c0 1495 if (needs_ambient_hack) {
6b000af4 1496 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_allow_list, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
165a31c0
LP
1497 if (r < 0)
1498 return r;
1499 }
1500
b54f36c6 1501 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1502}
1503
9df2cdd8
TM
1504static int apply_syscall_log(const Unit* u, const ExecContext *c) {
1505#ifdef SCMP_ACT_LOG
1506 uint32_t default_action, action;
1507#endif
1508
1509 assert(u);
1510 assert(c);
1511
1512 if (!context_has_syscall_logs(c))
1513 return 0;
1514
1515#ifdef SCMP_ACT_LOG
1516 if (skip_seccomp_unavailable(u, "SystemCallLog="))
1517 return 0;
1518
1519 if (c->syscall_log_allow_list) {
1520 /* Log nothing but the ones listed */
1521 default_action = SCMP_ACT_ALLOW;
1522 action = SCMP_ACT_LOG;
1523 } else {
1524 /* Log everything but the ones listed */
1525 default_action = SCMP_ACT_LOG;
1526 action = SCMP_ACT_ALLOW;
1527 }
1528
1529 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_log, action, false);
1530#else
1531 /* old libseccomp */
1532 log_unit_debug(u, "SECCOMP feature SCMP_ACT_LOG not available, skipping SystemCallLog=");
1533 return 0;
1534#endif
1535}
1536
469830d1
LP
1537static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1538 assert(u);
4298d0b5
LP
1539 assert(c);
1540
469830d1 1541 if (set_isempty(c->syscall_archs))
83f12b27
FS
1542 return 0;
1543
469830d1
LP
1544 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1545 return 0;
4298d0b5 1546
469830d1
LP
1547 return seccomp_restrict_archs(c->syscall_archs);
1548}
4298d0b5 1549
469830d1
LP
1550static int apply_address_families(const Unit* u, const ExecContext *c) {
1551 assert(u);
1552 assert(c);
4298d0b5 1553
469830d1
LP
1554 if (!context_has_address_families(c))
1555 return 0;
4298d0b5 1556
469830d1
LP
1557 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1558 return 0;
4298d0b5 1559
6b000af4 1560 return seccomp_restrict_address_families(c->address_families, c->address_families_allow_list);
8351ceae 1561}
4298d0b5 1562
83f12b27 1563static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1564 assert(u);
f3e43635
TM
1565 assert(c);
1566
469830d1 1567 if (!c->memory_deny_write_execute)
83f12b27
FS
1568 return 0;
1569
469830d1
LP
1570 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1571 return 0;
f3e43635 1572
469830d1 1573 return seccomp_memory_deny_write_execute();
f3e43635
TM
1574}
1575
83f12b27 1576static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1577 assert(u);
f4170c67
LP
1578 assert(c);
1579
469830d1 1580 if (!c->restrict_realtime)
83f12b27
FS
1581 return 0;
1582
469830d1
LP
1583 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1584 return 0;
f4170c67 1585
469830d1 1586 return seccomp_restrict_realtime();
f4170c67
LP
1587}
1588
f69567cb
LP
1589static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1590 assert(u);
1591 assert(c);
1592
1593 if (!c->restrict_suid_sgid)
1594 return 0;
1595
1596 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1597 return 0;
1598
1599 return seccomp_restrict_suid_sgid();
1600}
1601
59e856c7 1602static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1603 assert(u);
59eeb84b
LP
1604 assert(c);
1605
1606 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1607 * let's protect even those systems where this is left on in the kernel. */
1608
469830d1 1609 if (!c->protect_kernel_tunables)
59eeb84b
LP
1610 return 0;
1611
469830d1
LP
1612 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1613 return 0;
59eeb84b 1614
469830d1 1615 return seccomp_protect_sysctl();
59eeb84b
LP
1616}
1617
59e856c7 1618static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1619 assert(u);
502d704e
DH
1620 assert(c);
1621
25a8d8a0 1622 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1623
469830d1
LP
1624 if (!c->protect_kernel_modules)
1625 return 0;
1626
502d704e
DH
1627 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1628 return 0;
1629
b54f36c6 1630 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1631}
1632
84703040
KK
1633static int apply_protect_kernel_logs(const Unit *u, const ExecContext *c) {
1634 assert(u);
1635 assert(c);
1636
1637 if (!c->protect_kernel_logs)
1638 return 0;
1639
1640 if (skip_seccomp_unavailable(u, "ProtectKernelLogs="))
1641 return 0;
1642
1643 return seccomp_protect_syslog();
1644}
1645
daf8f72b 1646static int apply_protect_clock(const Unit *u, const ExecContext *c) {
fc64760d
KK
1647 assert(u);
1648 assert(c);
1649
1650 if (!c->protect_clock)
1651 return 0;
1652
1653 if (skip_seccomp_unavailable(u, "ProtectClock="))
1654 return 0;
1655
1656 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_CLOCK, SCMP_ACT_ERRNO(EPERM), false);
1657}
1658
59e856c7 1659static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1660 assert(u);
ba128bb8
LP
1661 assert(c);
1662
8f81a5f6 1663 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1664
469830d1
LP
1665 if (!c->private_devices)
1666 return 0;
1667
ba128bb8
LP
1668 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1669 return 0;
1670
b54f36c6 1671 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1672}
1673
34cf6c43 1674static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1675 assert(u);
add00535
LP
1676 assert(c);
1677
1678 if (!exec_context_restrict_namespaces_set(c))
1679 return 0;
1680
1681 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1682 return 0;
1683
1684 return seccomp_restrict_namespaces(c->restrict_namespaces);
1685}
1686
78e864e5 1687static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1688 unsigned long personality;
1689 int r;
78e864e5
TM
1690
1691 assert(u);
1692 assert(c);
1693
1694 if (!c->lock_personality)
1695 return 0;
1696
1697 if (skip_seccomp_unavailable(u, "LockPersonality="))
1698 return 0;
1699
e8132d63
LP
1700 personality = c->personality;
1701
1702 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1703 if (personality == PERSONALITY_INVALID) {
1704
1705 r = opinionated_personality(&personality);
1706 if (r < 0)
1707 return r;
1708 }
78e864e5
TM
1709
1710 return seccomp_lock_personality(personality);
1711}
1712
c0467cf3 1713#endif
8351ceae 1714
daf8f72b 1715static int apply_protect_hostname(const Unit *u, const ExecContext *c, int *ret_exit_status) {
daf8f72b
LP
1716 assert(u);
1717 assert(c);
1718
1719 if (!c->protect_hostname)
1720 return 0;
1721
1722 if (ns_type_supported(NAMESPACE_UTS)) {
1723 if (unshare(CLONE_NEWUTS) < 0) {
1724 if (!ERRNO_IS_NOT_SUPPORTED(errno) && !ERRNO_IS_PRIVILEGE(errno)) {
1725 *ret_exit_status = EXIT_NAMESPACE;
1726 return log_unit_error_errno(u, errno, "Failed to set up UTS namespacing: %m");
1727 }
1728
1729 log_unit_warning(u, "ProtectHostname=yes is configured, but UTS namespace setup is prohibited (container manager?), ignoring namespace setup.");
1730 }
1731 } else
1732 log_unit_warning(u, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
1733
1734#if HAVE_SECCOMP
8f3e342f
ZJS
1735 int r;
1736
daf8f72b
LP
1737 if (skip_seccomp_unavailable(u, "ProtectHostname="))
1738 return 0;
1739
1740 r = seccomp_protect_hostname();
1741 if (r < 0) {
1742 *ret_exit_status = EXIT_SECCOMP;
1743 return log_unit_error_errno(u, r, "Failed to apply hostname restrictions: %m");
1744 }
1745#endif
1746
1747 return 0;
1748}
1749
3042bbeb 1750static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1751 assert(idle_pipe);
1752
54eb2300
LP
1753 idle_pipe[1] = safe_close(idle_pipe[1]);
1754 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1755
1756 if (idle_pipe[0] >= 0) {
1757 int r;
1758
1759 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1760
1761 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1762 ssize_t n;
1763
31a7eb86 1764 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1765 n = write(idle_pipe[3], "x", 1);
1766 if (n > 0)
cd972d69 1767 /* Wait for systemd to react to the signal above. */
54756dce 1768 (void) fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1769 }
1770
54eb2300 1771 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1772
1773 }
1774
54eb2300 1775 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1776}
1777
fb2042dd
YW
1778static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1779
7cae38c4 1780static int build_environment(
34cf6c43 1781 const Unit *u,
9fa95f85 1782 const ExecContext *c,
1e22b5cd 1783 const ExecParameters *p,
da6053d0 1784 size_t n_fds,
7cae38c4
LP
1785 const char *home,
1786 const char *username,
1787 const char *shell,
7bce046b
LP
1788 dev_t journal_stream_dev,
1789 ino_t journal_stream_ino,
7cae38c4
LP
1790 char ***ret) {
1791
1792 _cleanup_strv_free_ char **our_env = NULL;
da6053d0 1793 size_t n_env = 0;
7cae38c4
LP
1794 char *x;
1795
4b58153d 1796 assert(u);
7cae38c4 1797 assert(c);
7c1cb6f1 1798 assert(p);
7cae38c4
LP
1799 assert(ret);
1800
dc4e2940 1801#define N_ENV_VARS 17
8d5bb13d 1802 our_env = new0(char*, N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1803 if (!our_env)
1804 return -ENOMEM;
1805
1806 if (n_fds > 0) {
8dd4c05b
LP
1807 _cleanup_free_ char *joined = NULL;
1808
df0ff127 1809 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1810 return -ENOMEM;
1811 our_env[n_env++] = x;
1812
da6053d0 1813 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1814 return -ENOMEM;
1815 our_env[n_env++] = x;
8dd4c05b 1816
1e22b5cd 1817 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1818 if (!joined)
1819 return -ENOMEM;
1820
605405c6 1821 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1822 if (!x)
1823 return -ENOMEM;
1824 our_env[n_env++] = x;
7cae38c4
LP
1825 }
1826
b08af3b1 1827 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1828 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1829 return -ENOMEM;
1830 our_env[n_env++] = x;
1831
1e22b5cd 1832 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1833 return -ENOMEM;
1834 our_env[n_env++] = x;
1835 }
1836
fd63e712
LP
1837 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1838 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1839 * check the database directly. */
ac647978 1840 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1841 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1842 if (!x)
1843 return -ENOMEM;
1844 our_env[n_env++] = x;
1845 }
1846
7cae38c4 1847 if (home) {
b910cc72 1848 x = strjoin("HOME=", home);
7cae38c4
LP
1849 if (!x)
1850 return -ENOMEM;
7bbead1d 1851
4ff361cc 1852 path_simplify(x + 5);
7cae38c4
LP
1853 our_env[n_env++] = x;
1854 }
1855
1856 if (username) {
b910cc72 1857 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1858 if (!x)
1859 return -ENOMEM;
1860 our_env[n_env++] = x;
1861
b910cc72 1862 x = strjoin("USER=", username);
7cae38c4
LP
1863 if (!x)
1864 return -ENOMEM;
1865 our_env[n_env++] = x;
1866 }
1867
1868 if (shell) {
b910cc72 1869 x = strjoin("SHELL=", shell);
7cae38c4
LP
1870 if (!x)
1871 return -ENOMEM;
7bbead1d 1872
4ff361cc 1873 path_simplify(x + 6);
7cae38c4
LP
1874 our_env[n_env++] = x;
1875 }
1876
4b58153d
LP
1877 if (!sd_id128_is_null(u->invocation_id)) {
1878 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1879 return -ENOMEM;
1880
1881 our_env[n_env++] = x;
1882 }
1883
6af760f3
LP
1884 if (exec_context_needs_term(c)) {
1885 const char *tty_path, *term = NULL;
1886
1887 tty_path = exec_context_tty_path(c);
1888
e8cf09b2
LP
1889 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try
1890 * to inherit the $TERM set for PID 1. This is useful for containers so that the $TERM the
1891 * container manager passes to PID 1 ends up all the way in the console login shown. */
6af760f3 1892
e8cf09b2 1893 if (path_equal_ptr(tty_path, "/dev/console") && getppid() == 1)
6af760f3 1894 term = getenv("TERM");
e8cf09b2 1895
6af760f3
LP
1896 if (!term)
1897 term = default_term_for_tty(tty_path);
7cae38c4 1898
b910cc72 1899 x = strjoin("TERM=", term);
7cae38c4
LP
1900 if (!x)
1901 return -ENOMEM;
1902 our_env[n_env++] = x;
1903 }
1904
7bce046b
LP
1905 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1906 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1907 return -ENOMEM;
1908
1909 our_env[n_env++] = x;
1910 }
1911
91dd5f7c
LP
1912 if (c->log_namespace) {
1913 x = strjoin("LOG_NAMESPACE=", c->log_namespace);
1914 if (!x)
1915 return -ENOMEM;
1916
1917 our_env[n_env++] = x;
1918 }
1919
5b10116e 1920 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
fb2042dd
YW
1921 _cleanup_free_ char *pre = NULL, *joined = NULL;
1922 const char *n;
1923
1924 if (!p->prefix[t])
1925 continue;
1926
1927 if (strv_isempty(c->directories[t].paths))
1928 continue;
1929
1930 n = exec_directory_env_name_to_string(t);
1931 if (!n)
1932 continue;
1933
1934 pre = strjoin(p->prefix[t], "/");
1935 if (!pre)
1936 return -ENOMEM;
1937
48904c8b 1938 joined = strv_join_full(c->directories[t].paths, ":", pre, true);
fb2042dd
YW
1939 if (!joined)
1940 return -ENOMEM;
1941
1942 x = strjoin(n, "=", joined);
1943 if (!x)
1944 return -ENOMEM;
1945
1946 our_env[n_env++] = x;
1947 }
1948
bb0c0d6f
LP
1949 if (exec_context_has_credentials(c) && p->prefix[EXEC_DIRECTORY_RUNTIME]) {
1950 x = strjoin("CREDENTIALS_DIRECTORY=", p->prefix[EXEC_DIRECTORY_RUNTIME], "/credentials/", u->id);
1951 if (!x)
1952 return -ENOMEM;
1953
1954 our_env[n_env++] = x;
1955 }
1956
dc4e2940
YW
1957 if (asprintf(&x, "SYSTEMD_EXEC_PID=" PID_FMT, getpid_cached()) < 0)
1958 return -ENOMEM;
1959
1960 our_env[n_env++] = x;
1961
7cae38c4 1962 our_env[n_env++] = NULL;
8d5bb13d
LP
1963 assert(n_env <= N_ENV_VARS + _EXEC_DIRECTORY_TYPE_MAX);
1964#undef N_ENV_VARS
7cae38c4 1965
ae2a15bc 1966 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1967
1968 return 0;
1969}
1970
b4c14404
FB
1971static int build_pass_environment(const ExecContext *c, char ***ret) {
1972 _cleanup_strv_free_ char **pass_env = NULL;
319a4f4b 1973 size_t n_env = 0;
b4c14404
FB
1974 char **i;
1975
1976 STRV_FOREACH(i, c->pass_environment) {
1977 _cleanup_free_ char *x = NULL;
1978 char *v;
1979
1980 v = getenv(*i);
1981 if (!v)
1982 continue;
605405c6 1983 x = strjoin(*i, "=", v);
b4c14404
FB
1984 if (!x)
1985 return -ENOMEM;
00819cc1 1986
319a4f4b 1987 if (!GREEDY_REALLOC(pass_env, n_env + 2))
b4c14404 1988 return -ENOMEM;
00819cc1 1989
1cc6c93a 1990 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 1991 pass_env[n_env] = NULL;
b4c14404
FB
1992 }
1993
ae2a15bc 1994 *ret = TAKE_PTR(pass_env);
b4c14404
FB
1995
1996 return 0;
1997}
1998
5e8deb94 1999bool exec_needs_mount_namespace(
8b44a3d2
LP
2000 const ExecContext *context,
2001 const ExecParameters *params,
4657abb5 2002 const ExecRuntime *runtime) {
8b44a3d2
LP
2003
2004 assert(context);
8b44a3d2 2005
915e6d16
LP
2006 if (context->root_image)
2007 return true;
2008
2a624c36
AP
2009 if (!strv_isempty(context->read_write_paths) ||
2010 !strv_isempty(context->read_only_paths) ||
ddc155b2
TM
2011 !strv_isempty(context->inaccessible_paths) ||
2012 !strv_isempty(context->exec_paths) ||
2013 !strv_isempty(context->no_exec_paths))
8b44a3d2
LP
2014 return true;
2015
42b1d8e0 2016 if (context->n_bind_mounts > 0)
d2d6c096
LP
2017 return true;
2018
2abd4e38
YW
2019 if (context->n_temporary_filesystems > 0)
2020 return true;
2021
b3d13314
LB
2022 if (context->n_mount_images > 0)
2023 return true;
2024
93f59701
LB
2025 if (context->n_extension_images > 0)
2026 return true;
2027
37ed15d7 2028 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
2029 return true;
2030
2031 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
2032 return true;
2033
8b44a3d2 2034 if (context->private_devices ||
228af36f 2035 context->private_mounts ||
8b44a3d2 2036 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
2037 context->protect_home != PROTECT_HOME_NO ||
2038 context->protect_kernel_tunables ||
c575770b 2039 context->protect_kernel_modules ||
94a7b275 2040 context->protect_kernel_logs ||
4e399953
LP
2041 context->protect_control_groups ||
2042 context->protect_proc != PROTECT_PROC_DEFAULT ||
80271a44
XR
2043 context->proc_subset != PROC_SUBSET_ALL ||
2044 context->private_ipc ||
2045 context->ipc_namespace_path)
8b44a3d2
LP
2046 return true;
2047
37c56f89 2048 if (context->root_directory) {
5e98086d 2049 if (exec_context_get_effective_mount_apivfs(context))
37c56f89
YW
2050 return true;
2051
5b10116e 2052 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
5e8deb94 2053 if (params && !params->prefix[t])
37c56f89
YW
2054 continue;
2055
2056 if (!strv_isempty(context->directories[t].paths))
2057 return true;
2058 }
2059 }
5d997827 2060
42b1d8e0 2061 if (context->dynamic_user &&
b43ee82f 2062 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
42b1d8e0
YW
2063 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
2064 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
2065 return true;
2066
91dd5f7c
LP
2067 if (context->log_namespace)
2068 return true;
2069
8b44a3d2
LP
2070 return false;
2071}
2072
5749f855 2073static int setup_private_users(uid_t ouid, gid_t ogid, uid_t uid, gid_t gid) {
d251207d
LP
2074 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
2075 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
2076 _cleanup_close_ int unshare_ready_fd = -1;
2077 _cleanup_(sigkill_waitp) pid_t pid = 0;
2078 uint64_t c = 1;
d251207d
LP
2079 ssize_t n;
2080 int r;
2081
5749f855
AZ
2082 /* Set up a user namespace and map the original UID/GID (IDs from before any user or group changes, i.e.
2083 * the IDs from the user or system manager(s)) to itself, the selected UID/GID to itself, and everything else to
d251207d
LP
2084 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
2085 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
2086 * which waits for the parent to create the new user namespace while staying in the original namespace. The
2087 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
5749f855
AZ
2088 * continues execution normally.
2089 * For unprivileged users (i.e. without capabilities), the root to root mapping is excluded. As such, it
2090 * does not need CAP_SETUID to write the single line mapping to itself. */
d251207d 2091
5749f855
AZ
2092 /* Can only set up multiple mappings with CAP_SETUID. */
2093 if (have_effective_cap(CAP_SETUID) && uid != ouid && uid_is_valid(uid))
587ab01b 2094 r = asprintf(&uid_map,
5749f855 2095 UID_FMT " " UID_FMT " 1\n" /* Map $OUID → $OUID */
587ab01b 2096 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
5749f855
AZ
2097 ouid, ouid, uid, uid);
2098 else
2099 r = asprintf(&uid_map,
2100 UID_FMT " " UID_FMT " 1\n", /* Map $OUID → $OUID */
2101 ouid, ouid);
d251207d 2102
5749f855
AZ
2103 if (r < 0)
2104 return -ENOMEM;
2105
2106 /* Can only set up multiple mappings with CAP_SETGID. */
2107 if (have_effective_cap(CAP_SETGID) && gid != ogid && gid_is_valid(gid))
587ab01b 2108 r = asprintf(&gid_map,
5749f855 2109 GID_FMT " " GID_FMT " 1\n" /* Map $OGID → $OGID */
587ab01b 2110 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
5749f855
AZ
2111 ogid, ogid, gid, gid);
2112 else
2113 r = asprintf(&gid_map,
2114 GID_FMT " " GID_FMT " 1\n", /* Map $OGID -> $OGID */
2115 ogid, ogid);
2116
2117 if (r < 0)
2118 return -ENOMEM;
d251207d
LP
2119
2120 /* Create a communication channel so that the parent can tell the child when it finished creating the user
2121 * namespace. */
2122 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
2123 if (unshare_ready_fd < 0)
2124 return -errno;
2125
2126 /* Create a communication channel so that the child can tell the parent a proper error code in case it
2127 * failed. */
2128 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
2129 return -errno;
2130
4c253ed1
LP
2131 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
2132 if (r < 0)
2133 return r;
2134 if (r == 0) {
d251207d
LP
2135 _cleanup_close_ int fd = -1;
2136 const char *a;
2137 pid_t ppid;
2138
2139 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
2140 * here, after the parent opened its own user namespace. */
2141
2142 ppid = getppid();
2143 errno_pipe[0] = safe_close(errno_pipe[0]);
2144
2145 /* Wait until the parent unshared the user namespace */
2146 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
2147 r = -errno;
2148 goto child_fail;
2149 }
2150
2151 /* Disable the setgroups() system call in the child user namespace, for good. */
2152 a = procfs_file_alloca(ppid, "setgroups");
2153 fd = open(a, O_WRONLY|O_CLOEXEC);
2154 if (fd < 0) {
2155 if (errno != ENOENT) {
2156 r = -errno;
2157 goto child_fail;
2158 }
2159
2160 /* If the file is missing the kernel is too old, let's continue anyway. */
2161 } else {
2162 if (write(fd, "deny\n", 5) < 0) {
2163 r = -errno;
2164 goto child_fail;
2165 }
2166
2167 fd = safe_close(fd);
2168 }
2169
2170 /* First write the GID map */
2171 a = procfs_file_alloca(ppid, "gid_map");
2172 fd = open(a, O_WRONLY|O_CLOEXEC);
2173 if (fd < 0) {
2174 r = -errno;
2175 goto child_fail;
2176 }
2177 if (write(fd, gid_map, strlen(gid_map)) < 0) {
2178 r = -errno;
2179 goto child_fail;
2180 }
2181 fd = safe_close(fd);
2182
2183 /* The write the UID map */
2184 a = procfs_file_alloca(ppid, "uid_map");
2185 fd = open(a, O_WRONLY|O_CLOEXEC);
2186 if (fd < 0) {
2187 r = -errno;
2188 goto child_fail;
2189 }
2190 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2191 r = -errno;
2192 goto child_fail;
2193 }
2194
2195 _exit(EXIT_SUCCESS);
2196
2197 child_fail:
2198 (void) write(errno_pipe[1], &r, sizeof(r));
2199 _exit(EXIT_FAILURE);
2200 }
2201
2202 errno_pipe[1] = safe_close(errno_pipe[1]);
2203
2204 if (unshare(CLONE_NEWUSER) < 0)
2205 return -errno;
2206
2207 /* Let the child know that the namespace is ready now */
2208 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2209 return -errno;
2210
2211 /* Try to read an error code from the child */
2212 n = read(errno_pipe[0], &r, sizeof(r));
2213 if (n < 0)
2214 return -errno;
2215 if (n == sizeof(r)) { /* an error code was sent to us */
2216 if (r < 0)
2217 return r;
2218 return -EIO;
2219 }
2220 if (n != 0) /* on success we should have read 0 bytes */
2221 return -EIO;
2222
2e87a1fd
LP
2223 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2224 pid = 0;
d251207d
LP
2225 if (r < 0)
2226 return r;
2e87a1fd 2227 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2228 return -EIO;
2229
2230 return 0;
2231}
2232
494d0247
YW
2233static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2234 if (!context->dynamic_user)
2235 return false;
2236
2237 if (type == EXEC_DIRECTORY_CONFIGURATION)
2238 return false;
2239
2240 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2241 return false;
2242
2243 return true;
2244}
2245
3536f49e 2246static int setup_exec_directory(
07689d5d
LP
2247 const ExecContext *context,
2248 const ExecParameters *params,
2249 uid_t uid,
3536f49e 2250 gid_t gid,
3536f49e
YW
2251 ExecDirectoryType type,
2252 int *exit_status) {
07689d5d 2253
72fd1768 2254 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2255 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2256 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2257 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2258 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2259 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2260 };
07689d5d
LP
2261 char **rt;
2262 int r;
2263
2264 assert(context);
2265 assert(params);
72fd1768 2266 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2267 assert(exit_status);
07689d5d 2268
3536f49e
YW
2269 if (!params->prefix[type])
2270 return 0;
2271
8679efde 2272 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2273 if (!uid_is_valid(uid))
2274 uid = 0;
2275 if (!gid_is_valid(gid))
2276 gid = 0;
2277 }
2278
2279 STRV_FOREACH(rt, context->directories[type].paths) {
6c47cd7d 2280 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2281
edbfeb12 2282 p = path_join(params->prefix[type], *rt);
3536f49e
YW
2283 if (!p) {
2284 r = -ENOMEM;
2285 goto fail;
2286 }
07689d5d 2287
23a7448e
YW
2288 r = mkdir_parents_label(p, 0755);
2289 if (r < 0)
3536f49e 2290 goto fail;
23a7448e 2291
494d0247 2292 if (exec_directory_is_private(context, type)) {
3f5b1508
LP
2293 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2294 * case we want to avoid leaving a directory around fully accessible that is owned by
2295 * a dynamic user whose UID is later on reused. To lock this down we use the same
2296 * trick used by container managers to prohibit host users to get access to files of
2297 * the same UID in containers: we place everything inside a directory that has an
2298 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2299 * for unprivileged host code. We then use fs namespacing to make this directory
2300 * permeable for the service itself.
6c47cd7d 2301 *
3f5b1508
LP
2302 * Specifically: for a service which wants a special directory "foo/" we first create
2303 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2304 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2305 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2306 * unprivileged host users can't look into it. Inside of the namespace of the unit
2307 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2308 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2309 * for the service and making sure it only gets access to the dirs it needs but no
2310 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2311 *
3f5b1508
LP
2312 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2313 * to be owned by the service itself.
2314 *
2315 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2316 * for sharing files or sockets with other services. */
6c47cd7d 2317
4ede9802
LP
2318 pp = path_join(params->prefix[type], "private");
2319 if (!pp) {
6c47cd7d
LP
2320 r = -ENOMEM;
2321 goto fail;
2322 }
2323
2324 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
4ede9802 2325 r = mkdir_safe_label(pp, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2326 if (r < 0)
2327 goto fail;
2328
4ede9802 2329 if (!path_extend(&pp, *rt)) {
6c47cd7d
LP
2330 r = -ENOMEM;
2331 goto fail;
2332 }
2333
2334 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2335 r = mkdir_parents_label(pp, 0755);
2336 if (r < 0)
2337 goto fail;
2338
949befd3
LP
2339 if (is_dir(p, false) > 0 &&
2340 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2341
2342 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2343 * it over. Most likely the service has been upgraded from one that didn't use
2344 * DynamicUser=1, to one that does. */
2345
cf52c45d
LP
2346 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2347 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2348 exec_directory_type_to_string(type), p, pp);
2349
949befd3
LP
2350 if (rename(p, pp) < 0) {
2351 r = -errno;
2352 goto fail;
2353 }
2354 } else {
2355 /* Otherwise, create the actual directory for the service */
2356
2357 r = mkdir_label(pp, context->directories[type].mode);
2358 if (r < 0 && r != -EEXIST)
2359 goto fail;
2360 }
6c47cd7d 2361
6c47cd7d 2362 /* And link it up from the original place */
6c9c51e5 2363 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2364 if (r < 0)
2365 goto fail;
2366
6c47cd7d 2367 } else {
5c6d40d1
LP
2368 _cleanup_free_ char *target = NULL;
2369
2370 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2371 readlink_and_make_absolute(p, &target) >= 0) {
578dc69f 2372 _cleanup_free_ char *q = NULL, *q_resolved = NULL, *target_resolved = NULL;
5c6d40d1
LP
2373
2374 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2375 * by DynamicUser=1 (see above)?
2376 *
2377 * We do this for all directory types except for ConfigurationDirectory=,
2378 * since they all support the private/ symlink logic at least in some
2379 * configurations, see above. */
5c6d40d1 2380
578dc69f
YW
2381 r = chase_symlinks(target, NULL, 0, &target_resolved, NULL);
2382 if (r < 0)
2383 goto fail;
2384
5c6d40d1
LP
2385 q = path_join(params->prefix[type], "private", *rt);
2386 if (!q) {
2387 r = -ENOMEM;
2388 goto fail;
2389 }
2390
578dc69f
YW
2391 /* /var/lib or friends may be symlinks. So, let's chase them also. */
2392 r = chase_symlinks(q, NULL, CHASE_NONEXISTENT, &q_resolved, NULL);
2393 if (r < 0)
2394 goto fail;
2395
2396 if (path_equal(q_resolved, target_resolved)) {
5c6d40d1
LP
2397
2398 /* Hmm, apparently DynamicUser= was once turned on for this service,
2399 * but is no longer. Let's move the directory back up. */
2400
cf52c45d
LP
2401 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2402 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2403 exec_directory_type_to_string(type), q, p);
2404
5c6d40d1
LP
2405 if (unlink(p) < 0) {
2406 r = -errno;
2407 goto fail;
2408 }
2409
2410 if (rename(q, p) < 0) {
2411 r = -errno;
2412 goto fail;
2413 }
2414 }
2415 }
2416
6c47cd7d 2417 r = mkdir_label(p, context->directories[type].mode);
d484580c 2418 if (r < 0) {
d484580c
LP
2419 if (r != -EEXIST)
2420 goto fail;
2421
206e9864
LP
2422 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2423 struct stat st;
2424
2425 /* Don't change the owner/access mode of the configuration directory,
2426 * as in the common case it is not written to by a service, and shall
2427 * not be writable. */
2428
2429 if (stat(p, &st) < 0) {
2430 r = -errno;
2431 goto fail;
2432 }
2433
2434 /* Still complain if the access mode doesn't match */
2435 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2436 log_warning("%s \'%s\' already exists but the mode is different. "
2437 "(File system: %o %sMode: %o)",
2438 exec_directory_type_to_string(type), *rt,
2439 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2440
6cff72eb 2441 continue;
206e9864 2442 }
6cff72eb 2443 }
a1164ae3 2444 }
07689d5d 2445
206e9864 2446 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2447 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2448 * current UID/GID ownership.) */
2449 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2450 if (r < 0)
2451 goto fail;
c71b2eb7 2452
607b358e
LP
2453 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2454 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
7802194a 2455 * assignments to exist. */
607b358e 2456 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2457 if (r < 0)
3536f49e 2458 goto fail;
07689d5d
LP
2459 }
2460
2461 return 0;
3536f49e
YW
2462
2463fail:
2464 *exit_status = exit_status_table[type];
3536f49e 2465 return r;
07689d5d
LP
2466}
2467
bb0c0d6f
LP
2468static int write_credential(
2469 int dfd,
2470 const char *id,
2471 const void *data,
2472 size_t size,
2473 uid_t uid,
2474 bool ownership_ok) {
2475
2476 _cleanup_(unlink_and_freep) char *tmp = NULL;
2477 _cleanup_close_ int fd = -1;
2478 int r;
2479
2480 r = tempfn_random_child("", "cred", &tmp);
2481 if (r < 0)
2482 return r;
2483
2484 fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
2485 if (fd < 0) {
2486 tmp = mfree(tmp);
2487 return -errno;
2488 }
2489
43144be4 2490 r = loop_write(fd, data, size, /* do_poll = */ false);
bb0c0d6f
LP
2491 if (r < 0)
2492 return r;
2493
2494 if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
2495 return -errno;
2496
2497 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2498 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
bb0c0d6f
LP
2499 if (r < 0) {
2500 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2501 return r;
2502
2503 if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
2504 * to express: that the user gets read access and nothing
2505 * else. But if the backing fs can't support that (e.g. ramfs)
2506 * then we can use file ownership instead. But that's only safe if
2507 * we can then re-mount the whole thing read-only, so that the
2508 * user can no longer chmod() the file to gain write access. */
2509 return r;
2510
f5fbe71d 2511 if (fchown(fd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2512 return -errno;
2513 }
2514 }
2515
2516 if (renameat(dfd, tmp, dfd, id) < 0)
2517 return -errno;
2518
2519 tmp = mfree(tmp);
2520 return 0;
2521}
2522
bb0c0d6f
LP
2523static int acquire_credentials(
2524 const ExecContext *context,
2525 const ExecParameters *params,
d3dcf4e3 2526 const char *unit,
bb0c0d6f
LP
2527 const char *p,
2528 uid_t uid,
2529 bool ownership_ok) {
2530
43144be4 2531 uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
bb0c0d6f 2532 _cleanup_close_ int dfd = -1;
43144be4 2533 ExecLoadCredential *lc;
bb0c0d6f 2534 ExecSetCredential *sc;
bb0c0d6f
LP
2535 int r;
2536
2537 assert(context);
2538 assert(p);
2539
2540 dfd = open(p, O_DIRECTORY|O_CLOEXEC);
2541 if (dfd < 0)
2542 return -errno;
2543
43144be4
LP
2544 /* First, load credentials off disk (or acquire via AF_UNIX socket) */
2545 HASHMAP_FOREACH(lc, context->load_credentials) {
2546 ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
bb0c0d6f 2547 _cleanup_(erase_and_freep) char *data = NULL;
d3dcf4e3 2548 _cleanup_free_ char *j = NULL, *bindname = NULL;
fc682be2 2549 bool missing_ok = true;
bb0c0d6f
LP
2550 const char *source;
2551 size_t size, add;
2552
43144be4 2553 if (path_is_absolute(lc->path)) {
bb0c0d6f 2554 /* If this is an absolute path, read the data directly from it, and support AF_UNIX sockets */
43144be4 2555 source = lc->path;
bb0c0d6f 2556 flags |= READ_FULL_FILE_CONNECT_SOCKET;
d3dcf4e3
LP
2557
2558 /* Pass some minimal info about the unit and the credential name we are looking to acquire
2559 * via the source socket address in case we read off an AF_UNIX socket. */
43144be4 2560 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, lc->id) < 0)
d3dcf4e3
LP
2561 return -ENOMEM;
2562
fc682be2
LP
2563 missing_ok = false;
2564
bb0c0d6f
LP
2565 } else if (params->received_credentials) {
2566 /* If this is a relative path, take it relative to the credentials we received
2567 * ourselves. We don't support the AF_UNIX stuff in this mode, since we are operating
2568 * on a credential store, i.e. this is guaranteed to be regular files. */
43144be4 2569 j = path_join(params->received_credentials, lc->path);
bb0c0d6f
LP
2570 if (!j)
2571 return -ENOMEM;
2572
2573 source = j;
2574 } else
2575 source = NULL;
2576
2577 if (source)
43144be4
LP
2578 r = read_full_file_full(
2579 AT_FDCWD, source,
2580 UINT64_MAX,
2581 lc->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
2582 flags | (lc->encrypted ? READ_FULL_FILE_UNBASE64 : 0),
2583 bindname,
2584 &data, &size);
bb0c0d6f
LP
2585 else
2586 r = -ENOENT;
43144be4 2587 if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, lc->id))) {
fc682be2
LP
2588 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
2589 * will get clear errors if we don't pass such a missing credential on as they
2590 * themselves will get ENOENT when trying to read them, which should not be much
2591 * worse than when we handle the error here and make it fatal.
2592 *
43144be4
LP
2593 * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
2594 * we are fine, too. */
2595 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", lc->path);
bb0c0d6f 2596 continue;
fc682be2 2597 }
bb0c0d6f 2598 if (r < 0)
43144be4
LP
2599 return log_debug_errno(r, "Failed to read credential '%s': %m", lc->path);
2600
2601 if (lc->encrypted) {
2602 _cleanup_free_ void *plaintext = NULL;
2603 size_t plaintext_size = 0;
2604
2605 r = decrypt_credential_and_warn(lc->id, now(CLOCK_REALTIME), NULL, data, size, &plaintext, &plaintext_size);
2606 if (r < 0)
2607 return r;
bb0c0d6f 2608
43144be4
LP
2609 free_and_replace(data, plaintext);
2610 size = plaintext_size;
2611 }
2612
2613 add = strlen(lc->id) + size;
bb0c0d6f
LP
2614 if (add > left)
2615 return -E2BIG;
2616
43144be4 2617 r = write_credential(dfd, lc->id, data, size, uid, ownership_ok);
bb0c0d6f
LP
2618 if (r < 0)
2619 return r;
2620
2621 left -= add;
2622 }
2623
43144be4
LP
2624 /* First we use the literally specified credentials. Note that they might be overridden again below,
2625 * and thus act as a "default" if the same credential is specified multiple times */
2626 HASHMAP_FOREACH(sc, context->set_credentials) {
2627 _cleanup_(erase_and_freep) void *plaintext = NULL;
2628 const char *data;
2629 size_t size, add;
2630
2631 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
2632 continue;
2633 if (errno != ENOENT)
2634 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
2635
2636 if (sc->encrypted) {
2637 r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, sc->data, sc->size, &plaintext, &size);
2638 if (r < 0)
2639 return r;
2640
2641 data = plaintext;
2642 } else {
2643 data = sc->data;
2644 size = sc->size;
2645 }
2646
2647 add = strlen(sc->id) + size;
2648 if (add > left)
2649 return -E2BIG;
2650
2651 r = write_credential(dfd, sc->id, data, size, uid, ownership_ok);
2652 if (r < 0)
2653 return r;
2654
2655
2656 left -= add;
2657 }
2658
bb0c0d6f
LP
2659 if (fchmod(dfd, 0500) < 0) /* Now take away the "w" bit */
2660 return -errno;
2661
2662 /* After we created all keys with the right perms, also make sure the credential store as a whole is
2663 * accessible */
2664
2665 if (uid_is_valid(uid) && uid != getuid()) {
567aeb58 2666 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
bb0c0d6f
LP
2667 if (r < 0) {
2668 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2669 return r;
2670
2671 if (!ownership_ok)
2672 return r;
2673
f5fbe71d 2674 if (fchown(dfd, uid, GID_INVALID) < 0)
bb0c0d6f
LP
2675 return -errno;
2676 }
2677 }
2678
2679 return 0;
2680}
2681
2682static int setup_credentials_internal(
2683 const ExecContext *context,
2684 const ExecParameters *params,
d3dcf4e3 2685 const char *unit,
bb0c0d6f
LP
2686 const char *final, /* This is where the credential store shall eventually end up at */
2687 const char *workspace, /* This is where we can prepare it before moving it to the final place */
2688 bool reuse_workspace, /* Whether to reuse any existing workspace mount if it already is a mount */
2689 bool must_mount, /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
2690 uid_t uid) {
2691
2692 int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
2693 * if we mounted something; false if we definitely can't mount anything */
2694 bool final_mounted;
2695 const char *where;
2696
2697 assert(context);
2698 assert(final);
2699 assert(workspace);
2700
2701 if (reuse_workspace) {
2702 r = path_is_mount_point(workspace, NULL, 0);
2703 if (r < 0)
2704 return r;
2705 if (r > 0)
2706 workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse it, let's keep this in mind */
2707 else
2708 workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
2709 } else
2710 workspace_mounted = -1; /* ditto */
2711
2712 r = path_is_mount_point(final, NULL, 0);
2713 if (r < 0)
2714 return r;
2715 if (r > 0) {
2716 /* If the final place already has something mounted, we use that. If the workspace also has
2717 * something mounted we assume it's actually the same mount (but with MS_RDONLY
2718 * different). */
2719 final_mounted = true;
2720
2721 if (workspace_mounted < 0) {
2722 /* If the final place is mounted, but the workspace we isn't, then let's bind mount
2723 * the final version to the workspace, and make it writable, so that we can make
2724 * changes */
2725
21935150
LP
2726 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2727 if (r < 0)
2728 return r;
bb0c0d6f 2729
21935150
LP
2730 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2731 if (r < 0)
2732 return r;
bb0c0d6f
LP
2733
2734 workspace_mounted = true;
2735 }
2736 } else
2737 final_mounted = false;
2738
2739 if (workspace_mounted < 0) {
2740 /* Nothing is mounted on the workspace yet, let's try to mount something now */
2741 for (int try = 0;; try++) {
2742
2743 if (try == 0) {
2744 /* Try "ramfs" first, since it's not swap backed */
21935150
LP
2745 r = mount_nofollow_verbose(LOG_DEBUG, "ramfs", workspace, "ramfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, "mode=0700");
2746 if (r >= 0) {
bb0c0d6f
LP
2747 workspace_mounted = true;
2748 break;
2749 }
2750
2751 } else if (try == 1) {
2752 _cleanup_free_ char *opts = NULL;
2753
43144be4 2754 if (asprintf(&opts, "mode=0700,nr_inodes=1024,size=%zu", (size_t) CREDENTIALS_TOTAL_SIZE_MAX) < 0)
bb0c0d6f
LP
2755 return -ENOMEM;
2756
2757 /* Fall back to "tmpfs" otherwise */
21935150
LP
2758 r = mount_nofollow_verbose(LOG_DEBUG, "tmpfs", workspace, "tmpfs", MS_NODEV|MS_NOEXEC|MS_NOSUID, opts);
2759 if (r >= 0) {
bb0c0d6f
LP
2760 workspace_mounted = true;
2761 break;
2762 }
2763
2764 } else {
2765 /* If that didn't work, try to make a bind mount from the final to the workspace, so that we can make it writable there. */
21935150
LP
2766 r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
2767 if (r < 0) {
2768 if (!ERRNO_IS_PRIVILEGE(r)) /* Propagate anything that isn't a permission problem */
2769 return r;
bb0c0d6f
LP
2770
2771 if (must_mount) /* If we it's not OK to use the plain directory
2772 * fallback, propagate all errors too */
21935150 2773 return r;
bb0c0d6f
LP
2774
2775 /* If we lack privileges to bind mount stuff, then let's gracefully
2776 * proceed for compat with container envs, and just use the final dir
2777 * as is. */
2778
2779 workspace_mounted = false;
2780 break;
2781 }
2782
2783 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
21935150
LP
2784 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2785 if (r < 0)
2786 return r;
bb0c0d6f
LP
2787
2788 workspace_mounted = true;
2789 break;
2790 }
2791 }
2792 }
2793
2794 assert(!must_mount || workspace_mounted > 0);
2795 where = workspace_mounted ? workspace : final;
2796
d3dcf4e3 2797 r = acquire_credentials(context, params, unit, where, uid, workspace_mounted);
bb0c0d6f
LP
2798 if (r < 0)
2799 return r;
2800
2801 if (workspace_mounted) {
2802 /* Make workspace read-only now, so that any bind mount we make from it defaults to read-only too */
21935150
LP
2803 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|MS_RDONLY|MS_NODEV|MS_NOEXEC|MS_NOSUID, NULL);
2804 if (r < 0)
2805 return r;
bb0c0d6f
LP
2806
2807 /* And mount it to the final place, read-only */
21935150
LP
2808 if (final_mounted)
2809 r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
2810 else
2811 r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
2812 if (r < 0)
2813 return r;
bb0c0d6f
LP
2814 } else {
2815 _cleanup_free_ char *parent = NULL;
2816
2817 /* If we do not have our own mount put used the plain directory fallback, then we need to
2818 * open access to the top-level credential directory and the per-service directory now */
2819
2820 parent = dirname_malloc(final);
2821 if (!parent)
2822 return -ENOMEM;
2823 if (chmod(parent, 0755) < 0)
2824 return -errno;
2825 }
2826
2827 return 0;
2828}
2829
2830static int setup_credentials(
2831 const ExecContext *context,
2832 const ExecParameters *params,
2833 const char *unit,
2834 uid_t uid) {
2835
2836 _cleanup_free_ char *p = NULL, *q = NULL;
2837 const char *i;
2838 int r;
2839
2840 assert(context);
2841 assert(params);
2842
2843 if (!exec_context_has_credentials(context))
2844 return 0;
2845
2846 if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
2847 return -EINVAL;
2848
2849 /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
2850 * and the subdir we mount over with a read-only file system readable by the service's user */
2851 q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
2852 if (!q)
2853 return -ENOMEM;
2854
2855 r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
2856 if (r < 0 && r != -EEXIST)
2857 return r;
2858
2859 p = path_join(q, unit);
2860 if (!p)
2861 return -ENOMEM;
2862
2863 r = mkdir_label(p, 0700); /* per-unit dir: private to user */
2864 if (r < 0 && r != -EEXIST)
2865 return r;
2866
2867 r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
2868 if (r < 0) {
2869 _cleanup_free_ char *t = NULL, *u = NULL;
2870
2871 /* If this is not a privilege or support issue then propagate the error */
2872 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
2873 return r;
2874
2875 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
2876 * it into place, so that users can't access half-initialized credential stores. */
2877 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
2878 if (!t)
2879 return -ENOMEM;
2880
2881 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
2882 * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
2883 * after it is fully set up */
2884 u = path_join(t, unit);
2885 if (!u)
2886 return -ENOMEM;
2887
2888 FOREACH_STRING(i, t, u) {
2889 r = mkdir_label(i, 0700);
2890 if (r < 0 && r != -EEXIST)
2891 return r;
2892 }
2893
2894 r = setup_credentials_internal(
2895 context,
2896 params,
d3dcf4e3 2897 unit,
bb0c0d6f
LP
2898 p, /* final mount point */
2899 u, /* temporary workspace to overmount */
2900 true, /* reuse the workspace if it is already a mount */
2901 false, /* it's OK to fall back to a plain directory if we can't mount anything */
2902 uid);
2903
2904 (void) rmdir(u); /* remove the workspace again if we can. */
2905
2906 if (r < 0)
2907 return r;
2908
2909 } else if (r == 0) {
2910
2911 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
2912 * we can use the same directory for all cases, after turning off propagation. Question
2913 * though is: where do we turn off propagation exactly, and where do we place the workspace
2914 * directory? We need some place that is guaranteed to be a mount point in the host, and
2915 * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
2916 * since we ultimately want to move the resulting file system there, i.e. we need propagation
2917 * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
2918 * would be visible in the host mount table all the time, which we want to avoid. Hence, what
2919 * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
2920 * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
2921 * propagation on the former, and then overmount the latter.
2922 *
2923 * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
2924 * for this purpose, but there are few other candidates that work equally well for us, and
2925 * given that the we do this in a privately namespaced short-lived single-threaded process
7802194a 2926 * that no one else sees this should be OK to do. */
bb0c0d6f 2927
21935150
LP
2928 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL); /* Turn off propagation from our namespace to host */
2929 if (r < 0)
bb0c0d6f
LP
2930 goto child_fail;
2931
2932 r = setup_credentials_internal(
2933 context,
2934 params,
d3dcf4e3 2935 unit,
bb0c0d6f
LP
2936 p, /* final mount point */
2937 "/dev/shm", /* temporary workspace to overmount */
2938 false, /* do not reuse /dev/shm if it is already a mount, under no circumstances */
2939 true, /* insist that something is mounted, do not allow fallback to plain directory */
2940 uid);
2941 if (r < 0)
2942 goto child_fail;
2943
2944 _exit(EXIT_SUCCESS);
2945
2946 child_fail:
2947 _exit(EXIT_FAILURE);
2948 }
2949
2950 return 0;
2951}
2952
#if ENABLE_SMACK
/* Applies a SMACK label via mac_smack_apply_pid() for PID 0.
 *
 * If the context carries an explicit smack_process_label, that one is applied. Otherwise, and only if
 * SMACK_DEFAULT_PROCESS_LABEL is defined at build time, the SMACK_ATTR_EXEC attribute is read from
 * executable_fd and applied, falling back to SMACK_DEFAULT_PROCESS_LABEL if no such label was read.
 *
 * Returns 0 on success, or the negative error of the failing helper. */
static int setup_smack(
                const ExecContext *context,
                int executable_fd) {

        int r;

        assert(context);
        assert(executable_fd >= 0);

        if (context->smack_process_label) {
                /* An explicitly configured label wins, the executable is not consulted at all. */
                r = mac_smack_apply_pid(0, context->smack_process_label);
                return r < 0 ? r : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *label_from_binary = NULL;

                /* A missing attribute or missing xattr support is not an error: we then simply use
                 * the default label below. */
                r = mac_smack_read_fd(executable_fd, SMACK_ATTR_EXEC, &label_from_binary);
                if (r < 0 && r != -ENODATA && r != -EOPNOTSUPP)
                        return r;

                r = mac_smack_apply_pid(0, label_from_binary ? label_from_binary : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
cefc33ae 2984
6c47cd7d
LP
2985static int compile_bind_mounts(
2986 const ExecContext *context,
2987 const ExecParameters *params,
2988 BindMount **ret_bind_mounts,
da6053d0 2989 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2990 char ***ret_empty_directories) {
2991
2992 _cleanup_strv_free_ char **empty_directories = NULL;
2993 BindMount *bind_mounts;
5b10116e 2994 size_t n, h = 0;
6c47cd7d
LP
2995 int r;
2996
2997 assert(context);
2998 assert(params);
2999 assert(ret_bind_mounts);
3000 assert(ret_n_bind_mounts);
3001 assert(ret_empty_directories);
3002
3003 n = context->n_bind_mounts;
5b10116e 3004 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3005 if (!params->prefix[t])
3006 continue;
3007
3008 n += strv_length(context->directories[t].paths);
3009 }
3010
3011 if (n <= 0) {
3012 *ret_bind_mounts = NULL;
3013 *ret_n_bind_mounts = 0;
3014 *ret_empty_directories = NULL;
3015 return 0;
3016 }
3017
3018 bind_mounts = new(BindMount, n);
3019 if (!bind_mounts)
3020 return -ENOMEM;
3021
5b10116e 3022 for (size_t i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
3023 BindMount *item = context->bind_mounts + i;
3024 char *s, *d;
3025
3026 s = strdup(item->source);
3027 if (!s) {
3028 r = -ENOMEM;
3029 goto finish;
3030 }
3031
3032 d = strdup(item->destination);
3033 if (!d) {
3034 free(s);
3035 r = -ENOMEM;
3036 goto finish;
3037 }
3038
3039 bind_mounts[h++] = (BindMount) {
3040 .source = s,
3041 .destination = d,
3042 .read_only = item->read_only,
3043 .recursive = item->recursive,
3044 .ignore_enoent = item->ignore_enoent,
3045 };
3046 }
3047
5b10116e 3048 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
6c47cd7d
LP
3049 char **suffix;
3050
3051 if (!params->prefix[t])
3052 continue;
3053
3054 if (strv_isempty(context->directories[t].paths))
3055 continue;
3056
494d0247 3057 if (exec_directory_is_private(context, t) &&
74e12520 3058 !exec_context_with_rootfs(context)) {
6c47cd7d
LP
3059 char *private_root;
3060
3061 /* So this is for a dynamic user, and we need to make sure the process can access its own
3062 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
3063 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
3064
657ee2d8 3065 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
3066 if (!private_root) {
3067 r = -ENOMEM;
3068 goto finish;
3069 }
3070
3071 r = strv_consume(&empty_directories, private_root);
a635a7ae 3072 if (r < 0)
6c47cd7d 3073 goto finish;
6c47cd7d
LP
3074 }
3075
3076 STRV_FOREACH(suffix, context->directories[t].paths) {
3077 char *s, *d;
3078
494d0247 3079 if (exec_directory_is_private(context, t))
657ee2d8 3080 s = path_join(params->prefix[t], "private", *suffix);
6c47cd7d 3081 else
657ee2d8 3082 s = path_join(params->prefix[t], *suffix);
6c47cd7d
LP
3083 if (!s) {
3084 r = -ENOMEM;
3085 goto finish;
3086 }
3087
494d0247 3088 if (exec_directory_is_private(context, t) &&
74e12520 3089 exec_context_with_rootfs(context))
5609f688
YW
3090 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
3091 * directory is not created on the root directory. So, let's bind-mount the directory
3092 * on the 'non-private' place. */
657ee2d8 3093 d = path_join(params->prefix[t], *suffix);
5609f688
YW
3094 else
3095 d = strdup(s);
6c47cd7d
LP
3096 if (!d) {
3097 free(s);
3098 r = -ENOMEM;
3099 goto finish;
3100 }
3101
3102 bind_mounts[h++] = (BindMount) {
3103 .source = s,
3104 .destination = d,
3105 .read_only = false,
9ce4e4b0 3106 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
3107 .recursive = true,
3108 .ignore_enoent = false,
3109 };
3110 }
3111 }
3112
3113 assert(h == n);
3114
3115 *ret_bind_mounts = bind_mounts;
3116 *ret_n_bind_mounts = n;
ae2a15bc 3117 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
3118
3119 return (int) n;
3120
3121finish:
3122 bind_mount_free_many(bind_mounts, h);
3123 return r;
3124}
3125
4e677599
LP
3126static bool insist_on_sandboxing(
3127 const ExecContext *context,
3128 const char *root_dir,
3129 const char *root_image,
3130 const BindMount *bind_mounts,
3131 size_t n_bind_mounts) {
3132
4e677599
LP
3133 assert(context);
3134 assert(n_bind_mounts == 0 || bind_mounts);
3135
3136 /* Checks whether we need to insist on fs namespacing. i.e. whether we have settings configured that
86b52a39 3137 * would alter the view on the file system beyond making things read-only or invisible, i.e. would
4e677599
LP
3138 * rearrange stuff in a way we cannot ignore gracefully. */
3139
3140 if (context->n_temporary_filesystems > 0)
3141 return true;
3142
3143 if (root_dir || root_image)
3144 return true;
3145
b3d13314
LB
3146 if (context->n_mount_images > 0)
3147 return true;
3148
4e677599
LP
3149 if (context->dynamic_user)
3150 return true;
3151
3152 /* If there are any bind mounts set that don't map back onto themselves, fs namespacing becomes
3153 * essential. */
5b10116e 3154 for (size_t i = 0; i < n_bind_mounts; i++)
4e677599
LP
3155 if (!path_equal(bind_mounts[i].source, bind_mounts[i].destination))
3156 return true;
3157
91dd5f7c
LP
3158 if (context->log_namespace)
3159 return true;
3160
4e677599
LP
3161 return false;
3162}
3163
6818c54c 3164static int apply_mount_namespace(
34cf6c43 3165 const Unit *u,
9f71ba8d 3166 ExecCommandFlags command_flags,
6818c54c
LP
3167 const ExecContext *context,
3168 const ExecParameters *params,
7cc5ef5f
ZJS
3169 const ExecRuntime *runtime,
3170 char **error_path) {
6818c54c 3171
7bcef4ef 3172 _cleanup_strv_free_ char **empty_directories = NULL;
56a13a49 3173 const char *tmp_dir = NULL, *var_tmp_dir = NULL;
915e6d16 3174 const char *root_dir = NULL, *root_image = NULL;
5e8deb94 3175 _cleanup_free_ char *creds_path = NULL, *incoming_dir = NULL, *propagate_dir = NULL;
228af36f 3176 NamespaceInfo ns_info;
165a31c0 3177 bool needs_sandboxing;
6c47cd7d 3178 BindMount *bind_mounts = NULL;
da6053d0 3179 size_t n_bind_mounts = 0;
6818c54c 3180 int r;
93c6bb51 3181
2b3c1b9e
DH
3182 assert(context);
3183
915e6d16
LP
3184 if (params->flags & EXEC_APPLY_CHROOT) {
3185 root_image = context->root_image;
3186
3187 if (!root_image)
3188 root_dir = context->root_directory;
3189 }
93c6bb51 3190
6c47cd7d
LP
3191 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
3192 if (r < 0)
3193 return r;
3194
9f71ba8d 3195 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command_flags & EXEC_COMMAND_FULLY_PRIVILEGED);
ecf63c91
NJ
3196 if (needs_sandboxing) {
3197 /* The runtime struct only contains the parent of the private /tmp,
3198 * which is non-accessible to world users. Inside of it there's a /tmp
56a13a49
ZJS
3199 * that is sticky, and that's the one we want to use here.
3200 * This does not apply when we are using /run/systemd/empty as fallback. */
ecf63c91
NJ
3201
3202 if (context->private_tmp && runtime) {
56a13a49
ZJS
3203 if (streq_ptr(runtime->tmp_dir, RUN_SYSTEMD_EMPTY))
3204 tmp_dir = runtime->tmp_dir;
3205 else if (runtime->tmp_dir)
3206 tmp_dir = strjoina(runtime->tmp_dir, "/tmp");
3207
3208 if (streq_ptr(runtime->var_tmp_dir, RUN_SYSTEMD_EMPTY))
3209 var_tmp_dir = runtime->var_tmp_dir;
f63ef937 3210 else if (runtime->var_tmp_dir)
56a13a49 3211 var_tmp_dir = strjoina(runtime->var_tmp_dir, "/tmp");
ecf63c91
NJ
3212 }
3213
b5a33299
YW
3214 ns_info = (NamespaceInfo) {
3215 .ignore_protect_paths = false,
3216 .private_dev = context->private_devices,
3217 .protect_control_groups = context->protect_control_groups,
3218 .protect_kernel_tunables = context->protect_kernel_tunables,
3219 .protect_kernel_modules = context->protect_kernel_modules,
94a7b275 3220 .protect_kernel_logs = context->protect_kernel_logs,
aecd5ac6 3221 .protect_hostname = context->protect_hostname,
5e98086d 3222 .mount_apivfs = exec_context_get_effective_mount_apivfs(context),
228af36f 3223 .private_mounts = context->private_mounts,
52b3d652
LP
3224 .protect_home = context->protect_home,
3225 .protect_system = context->protect_system,
4e399953
LP
3226 .protect_proc = context->protect_proc,
3227 .proc_subset = context->proc_subset,
80271a44 3228 .private_ipc = context->private_ipc || context->ipc_namespace_path,
6720e356 3229 /* If NNP is on, we can turn on MS_NOSUID, since it won't have any effect anymore. */
5181630f 3230 .mount_nosuid = context->no_new_privileges && !mac_selinux_use(),
b5a33299 3231 };
ecf63c91 3232 } else if (!context->dynamic_user && root_dir)
228af36f
LP
3233 /*
3234 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
3235 * sandbox info, otherwise enforce it, don't ignore protected paths and
3236 * fail if we are enable to apply the sandbox inside the mount namespace.
3237 */
3238 ns_info = (NamespaceInfo) {
3239 .ignore_protect_paths = true,
3240 };
3241 else
3242 ns_info = (NamespaceInfo) {};
b5a33299 3243
37ed15d7
FB
3244 if (context->mount_flags == MS_SHARED)
3245 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
3246
a631cbfa
LP
3247 if (exec_context_has_credentials(context) &&
3248 params->prefix[EXEC_DIRECTORY_RUNTIME] &&
3249 FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
bbb4e7f3 3250 creds_path = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials", u->id);
8062e643
YW
3251 if (!creds_path) {
3252 r = -ENOMEM;
3253 goto finalize;
3254 }
bbb4e7f3
LP
3255 }
3256
5e8deb94
LB
3257 if (MANAGER_IS_SYSTEM(u->manager)) {
3258 propagate_dir = path_join("/run/systemd/propagate/", u->id);
f2550b98
LP
3259 if (!propagate_dir) {
3260 r = -ENOMEM;
3261 goto finalize;
3262 }
3263
5e8deb94 3264 incoming_dir = strdup("/run/systemd/incoming");
f2550b98
LP
3265 if (!incoming_dir) {
3266 r = -ENOMEM;
3267 goto finalize;
3268 }
5e8deb94
LB
3269 }
3270
18d73705 3271 r = setup_namespace(root_dir, root_image, context->root_image_options,
7bcef4ef 3272 &ns_info, context->read_write_paths,
165a31c0
LP
3273 needs_sandboxing ? context->read_only_paths : NULL,
3274 needs_sandboxing ? context->inaccessible_paths : NULL,
ddc155b2
TM
3275 needs_sandboxing ? context->exec_paths : NULL,
3276 needs_sandboxing ? context->no_exec_paths : NULL,
6c47cd7d
LP
3277 empty_directories,
3278 bind_mounts,
3279 n_bind_mounts,
2abd4e38
YW
3280 context->temporary_filesystems,
3281 context->n_temporary_filesystems,
b3d13314
LB
3282 context->mount_images,
3283 context->n_mount_images,
56a13a49
ZJS
3284 tmp_dir,
3285 var_tmp_dir,
bbb4e7f3 3286 creds_path,
91dd5f7c 3287 context->log_namespace,
915e6d16 3288 context->mount_flags,
d4d55b0d
LB
3289 context->root_hash, context->root_hash_size, context->root_hash_path,
3290 context->root_hash_sig, context->root_hash_sig_size, context->root_hash_sig_path,
3291 context->root_verity,
93f59701
LB
3292 context->extension_images,
3293 context->n_extension_images,
5e8deb94
LB
3294 propagate_dir,
3295 incoming_dir,
3bdc25a4 3296 root_dir || root_image ? params->notify_socket : NULL,
7cc5ef5f 3297 error_path);
93c6bb51 3298
1beab8b0 3299 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 3300 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
3301 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
3302 * completely different execution environment. */
aca835ed 3303 if (r == -ENOANO) {
4e677599
LP
3304 if (insist_on_sandboxing(
3305 context,
3306 root_dir, root_image,
3307 bind_mounts,
3308 n_bind_mounts)) {
3309 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
3310 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
3311 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
3312
3313 r = -EOPNOTSUPP;
3314 } else {
aca835ed 3315 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
4e677599 3316 r = 0;
aca835ed 3317 }
93c6bb51
DH
3318 }
3319
8062e643 3320finalize:
4e677599 3321 bind_mount_free_many(bind_mounts, n_bind_mounts);
93c6bb51
DH
3322 return r;
3323}
3324
915e6d16
LP
3325static int apply_working_directory(
3326 const ExecContext *context,
3327 const ExecParameters *params,
3328 const char *home,
376fecf6 3329 int *exit_status) {
915e6d16 3330
6732edab 3331 const char *d, *wd;
2b3c1b9e
DH
3332
3333 assert(context);
376fecf6 3334 assert(exit_status);
2b3c1b9e 3335
6732edab
LP
3336 if (context->working_directory_home) {
3337
376fecf6
LP
3338 if (!home) {
3339 *exit_status = EXIT_CHDIR;
6732edab 3340 return -ENXIO;
376fecf6 3341 }
6732edab 3342
2b3c1b9e 3343 wd = home;
6732edab 3344
14eb3285
LP
3345 } else
3346 wd = empty_to_root(context->working_directory);
e7f1e7c6 3347
fa97f630 3348 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 3349 d = wd;
fa97f630 3350 else
3b0e5bb5 3351 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 3352
376fecf6
LP
3353 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
3354 *exit_status = EXIT_CHDIR;
2b3c1b9e 3355 return -errno;
376fecf6 3356 }
e7f1e7c6
DH
3357
3358 return 0;
3359}
3360
fa97f630
JB
3361static int apply_root_directory(
3362 const ExecContext *context,
3363 const ExecParameters *params,
3364 const bool needs_mount_ns,
3365 int *exit_status) {
3366
3367 assert(context);
3368 assert(exit_status);
3369
5b10116e 3370 if (params->flags & EXEC_APPLY_CHROOT)
fa97f630
JB
3371 if (!needs_mount_ns && context->root_directory)
3372 if (chroot(context->root_directory) < 0) {
3373 *exit_status = EXIT_CHROOT;
3374 return -errno;
3375 }
fa97f630
JB
3376
3377 return 0;
3378}
3379
b1edf445 3380static int setup_keyring(
34cf6c43 3381 const Unit *u,
b1edf445
LP
3382 const ExecContext *context,
3383 const ExecParameters *p,
3384 uid_t uid, gid_t gid) {
3385
74dd6b51 3386 key_serial_t keyring;
e64c2d0b
DJL
3387 int r = 0;
3388 uid_t saved_uid;
3389 gid_t saved_gid;
74dd6b51
LP
3390
3391 assert(u);
b1edf445 3392 assert(context);
74dd6b51
LP
3393 assert(p);
3394
3395 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
3396 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
3397 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
3398 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
3399 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
3400 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
3401
b1edf445
LP
3402 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
3403 return 0;
3404
e64c2d0b
DJL
3405 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
3406 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
3407 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
3408 * & group is just as nasty as acquiring a reference to the user keyring. */
3409
3410 saved_uid = getuid();
3411 saved_gid = getgid();
3412
3413 if (gid_is_valid(gid) && gid != saved_gid) {
3414 if (setregid(gid, -1) < 0)
3415 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
3416 }
3417
3418 if (uid_is_valid(uid) && uid != saved_uid) {
3419 if (setreuid(uid, -1) < 0) {
3420 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
3421 goto out;
3422 }
3423 }
3424
74dd6b51
LP
3425 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
3426 if (keyring == -1) {
3427 if (errno == ENOSYS)
8002fb97 3428 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
065b4774 3429 else if (ERRNO_IS_PRIVILEGE(errno))
8002fb97 3430 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 3431 else if (errno == EDQUOT)
8002fb97 3432 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 3433 else
e64c2d0b 3434 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 3435
e64c2d0b 3436 goto out;
74dd6b51
LP
3437 }
3438
e64c2d0b
DJL
3439 /* When requested link the user keyring into the session keyring. */
3440 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
3441
3442 if (keyctl(KEYCTL_LINK,
3443 KEY_SPEC_USER_KEYRING,
3444 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
3445 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
3446 goto out;
3447 }
3448 }
3449
3450 /* Restore uid/gid back */
3451 if (uid_is_valid(uid) && uid != saved_uid) {
3452 if (setreuid(saved_uid, -1) < 0) {
3453 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
3454 goto out;
3455 }
3456 }
3457
3458 if (gid_is_valid(gid) && gid != saved_gid) {
3459 if (setregid(saved_gid, -1) < 0)
3460 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
3461 }
3462
3463 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
3464 if (!sd_id128_is_null(u->invocation_id)) {
3465 key_serial_t key;
3466
3467 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
3468 if (key == -1)
8002fb97 3469 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
3470 else {
3471 if (keyctl(KEYCTL_SETPERM, key,
3472 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
3473 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 3474 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
3475 }
3476 }
3477
e64c2d0b 3478out:
37b22b3b 3479 /* Revert back uid & gid for the last time, and exit */
e64c2d0b
DJL
3480 /* no extra logging, as only the first already reported error matters */
3481 if (getuid() != saved_uid)
3482 (void) setreuid(saved_uid, -1);
b1edf445 3483
e64c2d0b
DJL
3484 if (getgid() != saved_gid)
3485 (void) setregid(saved_gid, -1);
b1edf445 3486
e64c2d0b 3487 return r;
74dd6b51
LP
3488}
3489
/* Appends each non-negative fd of the two-element 'pair' to 'array', in order, advancing the fill
 * counter *n for every fd stored. Negative entries are skipped. The caller must make sure the array has
 * room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        for (size_t i = 0; i < 2; i++) {
                if (pair[i] < 0)
                        continue;

                array[*n] = pair[i];
                (*n)++;
        }
}
3500
a34ceba6
LP
3501static int close_remaining_fds(
3502 const ExecParameters *params,
34cf6c43
YW
3503 const ExecRuntime *runtime,
3504 const DynamicCreds *dcreds,
00d9ef85 3505 int user_lookup_fd,
a34ceba6 3506 int socket_fd,
5b8d1f6b 3507 const int *fds, size_t n_fds) {
a34ceba6 3508
da6053d0 3509 size_t n_dont_close = 0;
00d9ef85 3510 int dont_close[n_fds + 12];
a34ceba6
LP
3511
3512 assert(params);
3513
3514 if (params->stdin_fd >= 0)
3515 dont_close[n_dont_close++] = params->stdin_fd;
3516 if (params->stdout_fd >= 0)
3517 dont_close[n_dont_close++] = params->stdout_fd;
3518 if (params->stderr_fd >= 0)
3519 dont_close[n_dont_close++] = params->stderr_fd;
3520
3521 if (socket_fd >= 0)
3522 dont_close[n_dont_close++] = socket_fd;
3523 if (n_fds > 0) {
3524 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
3525 n_dont_close += n_fds;
3526 }
3527
a70581ff 3528 if (runtime) {
29206d46 3529 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
a70581ff
XR
3530 append_socket_pair(dont_close, &n_dont_close, runtime->ipcns_storage_socket);
3531 }
29206d46
LP
3532
3533 if (dcreds) {
3534 if (dcreds->user)
3535 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
3536 if (dcreds->group)
3537 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
3538 }
3539
00d9ef85
LP
3540 if (user_lookup_fd >= 0)
3541 dont_close[n_dont_close++] = user_lookup_fd;
3542
a34ceba6
LP
3543 return close_all_fds(dont_close, n_dont_close);
3544}
3545
00d9ef85
LP
3546static int send_user_lookup(
3547 Unit *unit,
3548 int user_lookup_fd,
3549 uid_t uid,
3550 gid_t gid) {
3551
3552 assert(unit);
3553
3554 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
3555 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
3556 * specified. */
3557
3558 if (user_lookup_fd < 0)
3559 return 0;
3560
3561 if (!uid_is_valid(uid) && !gid_is_valid(gid))
3562 return 0;
3563
3564 if (writev(user_lookup_fd,
3565 (struct iovec[]) {
e6a7ec4b
LP
3566 IOVEC_INIT(&uid, sizeof(uid)),
3567 IOVEC_INIT(&gid, sizeof(gid)),
3568 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
3569 return -errno;
3570
3571 return 0;
3572}
3573
6732edab
LP
3574static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
3575 int r;
3576
3577 assert(c);
3578 assert(home);
3579 assert(buf);
3580
3581 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
3582
3583 if (*home)
3584 return 0;
3585
3586 if (!c->working_directory_home)
3587 return 0;
3588
6732edab
LP
3589 r = get_home_dir(buf);
3590 if (r < 0)
3591 return r;
3592
3593 *home = *buf;
3594 return 1;
3595}
3596
da50b85a
LP
3597static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
3598 _cleanup_strv_free_ char ** list = NULL;
da50b85a
LP
3599 int r;
3600
3601 assert(c);
3602 assert(p);
3603 assert(ret);
3604
3605 assert(c->dynamic_user);
3606
3607 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
3608 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
3609 * directories. */
3610
5b10116e 3611 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
da50b85a
LP
3612 char **i;
3613
3614 if (t == EXEC_DIRECTORY_CONFIGURATION)
3615 continue;
3616
3617 if (!p->prefix[t])
3618 continue;
3619
3620 STRV_FOREACH(i, c->directories[t].paths) {
3621 char *e;
3622
494d0247 3623 if (exec_directory_is_private(c, t))
657ee2d8 3624 e = path_join(p->prefix[t], "private", *i);
494d0247
YW
3625 else
3626 e = path_join(p->prefix[t], *i);
da50b85a
LP
3627 if (!e)
3628 return -ENOMEM;
3629
3630 r = strv_consume(&list, e);
3631 if (r < 0)
3632 return r;
3633 }
3634 }
3635
ae2a15bc 3636 *ret = TAKE_PTR(list);
da50b85a
LP
3637
3638 return 0;
3639}
3640
34cf6c43
YW
3641static char *exec_command_line(char **argv);
3642
78f93209
LP
3643static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
3644 bool using_subcgroup;
3645 char *p;
3646
3647 assert(params);
3648 assert(ret);
3649
3650 if (!params->cgroup_path)
3651 return -EINVAL;
3652
3653 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
3654 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
3655 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
3656 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
3657 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
3658 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
3659 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
3660 * flag, which is only passed for the former statements, not for the latter. */
3661
3662 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
3663 if (using_subcgroup)
657ee2d8 3664 p = path_join(params->cgroup_path, ".control");
78f93209
LP
3665 else
3666 p = strdup(params->cgroup_path);
3667 if (!p)
3668 return -ENOMEM;
3669
3670 *ret = p;
3671 return using_subcgroup;
3672}
3673
e2b2fb7f
MS
3674static int exec_context_cpu_affinity_from_numa(const ExecContext *c, CPUSet *ret) {
3675 _cleanup_(cpu_set_reset) CPUSet s = {};
3676 int r;
3677
3678 assert(c);
3679 assert(ret);
3680
3681 if (!c->numa_policy.nodes.set) {
3682 log_debug("Can't derive CPU affinity mask from NUMA mask because NUMA mask is not set, ignoring");
3683 return 0;
3684 }
3685
3686 r = numa_to_cpu_set(&c->numa_policy, &s);
3687 if (r < 0)
3688 return r;
3689
3690 cpu_set_reset(ret);
3691
3692 return cpu_set_add_all(ret, &s);
3693}
3694
3695bool exec_context_get_cpu_affinity_from_numa(const ExecContext *c) {
3696 assert(c);
3697
3698 return c->cpu_affinity_from_numa;
3699}
3700
1da37e58
ZJS
3701static int add_shifted_fd(int *fds, size_t fds_size, size_t *n_fds, int fd, int *ret_fd) {
3702 int r;
3703
3704 assert(fds);
3705 assert(n_fds);
3706 assert(*n_fds < fds_size);
3707 assert(ret_fd);
3708
3709 if (fd < 0) {
3710 *ret_fd = -1;
3711 return 0;
3712 }
3713
3714 if (fd < 3 + (int) *n_fds) {
3715 /* Let's move the fd up, so that it's outside of the fd range we will use to store
3716 * the fds we pass to the process (or which are closed only during execve). */
3717
3718 r = fcntl(fd, F_DUPFD_CLOEXEC, 3 + (int) *n_fds);
3719 if (r < 0)
3720 return -errno;
3721
3722 CLOSE_AND_REPLACE(fd, r);
3723 }
3724
3725 *ret_fd = fds[*n_fds] = fd;
3726 (*n_fds) ++;
3727 return 1;
3728}
3729
ff0af2a1 3730static int exec_child(
f2341e0a 3731 Unit *unit,
34cf6c43 3732 const ExecCommand *command,
ff0af2a1
LP
3733 const ExecContext *context,
3734 const ExecParameters *params,
3735 ExecRuntime *runtime,
29206d46 3736 DynamicCreds *dcreds,
ff0af2a1 3737 int socket_fd,
2caa38e9 3738 const int named_iofds[static 3],
4c47affc 3739 int *fds,
da6053d0 3740 size_t n_socket_fds,
25b583d7 3741 size_t n_storage_fds,
ff0af2a1 3742 char **files_env,
00d9ef85 3743 int user_lookup_fd,
12145637 3744 int *exit_status) {
d35fbf6b 3745
7ca69792 3746 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
1da37e58 3747 int r, ngids = 0, exec_fd;
4d885bd3
DH
3748 _cleanup_free_ gid_t *supplementary_gids = NULL;
3749 const char *username = NULL, *groupname = NULL;
5686391b 3750 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 3751 const char *home = NULL, *shell = NULL;
7ca69792 3752 char **final_argv = NULL;
7bce046b
LP
3753 dev_t journal_stream_dev = 0;
3754 ino_t journal_stream_ino = 0;
5749f855 3755 bool userns_set_up = false;
165a31c0
LP
3756 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
3757 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
3758 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
3759 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 3760#if HAVE_SELINUX
7f59dd35 3761 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 3762 bool use_selinux = false;
ecfbc84f 3763#endif
f9fa32f0 3764#if ENABLE_SMACK
43b1f709 3765 bool use_smack = false;
ecfbc84f 3766#endif
349cc4a5 3767#if HAVE_APPARMOR
43b1f709 3768 bool use_apparmor = false;
ecfbc84f 3769#endif
5749f855
AZ
3770 uid_t saved_uid = getuid();
3771 gid_t saved_gid = getgid();
fed1e721
LP
3772 uid_t uid = UID_INVALID;
3773 gid_t gid = GID_INVALID;
1da37e58
ZJS
3774 size_t n_fds = n_socket_fds + n_storage_fds, /* fds to pass to the child */
3775 n_keep_fds; /* total number of fds not to close */
165a31c0 3776 int secure_bits;
afb11bf1
DG
3777 _cleanup_free_ gid_t *gids_after_pam = NULL;
3778 int ngids_after_pam = 0;
034c6ed7 3779
f2341e0a 3780 assert(unit);
5cb5a6ff
LP
3781 assert(command);
3782 assert(context);
d35fbf6b 3783 assert(params);
ff0af2a1 3784 assert(exit_status);
d35fbf6b
DM
3785
3786 rename_process_from_path(command->path);
3787
9c274488
LP
3788 /* We reset exactly these signals, since they are the only ones we set to SIG_IGN in the main
3789 * daemon. All others we leave untouched because we set them to SIG_DFL or a valid handler initially,
3790 * both of which will be demoted to SIG_DFL. */
ce30c8dc 3791 (void) default_signals(SIGNALS_CRASH_HANDLER,
9c274488 3792 SIGNALS_IGNORE);
d35fbf6b
DM
3793
3794 if (context->ignore_sigpipe)
9c274488 3795 (void) ignore_signals(SIGPIPE);
d35fbf6b 3796
ff0af2a1
LP
3797 r = reset_signal_mask();
3798 if (r < 0) {
3799 *exit_status = EXIT_SIGNAL_MASK;
12145637 3800 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 3801 }
034c6ed7 3802
d35fbf6b
DM
3803 if (params->idle_pipe)
3804 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 3805
2c027c62
LP
3806 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
3807 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
3808 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
3809 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 3810
d35fbf6b 3811 log_forget_fds();
2c027c62 3812 log_set_open_when_needed(true);
4f2d528d 3813
40a80078
LP
3814 /* In case anything used libc syslog(), close this here, too */
3815 closelog();
3816
b83d5050 3817 int keep_fds[n_fds + 2];
1da37e58
ZJS
3818 memcpy_safe(keep_fds, fds, n_fds * sizeof(int));
3819 n_keep_fds = n_fds;
3820
3821 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, params->exec_fd, &exec_fd);
3822 if (r < 0) {
3823 *exit_status = EXIT_FDS;
3824 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
3825 }
3826
3827 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, keep_fds, n_keep_fds);
ff0af2a1
LP
3828 if (r < 0) {
3829 *exit_status = EXIT_FDS;
12145637 3830 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3831 }
3832
0af07108
ZJS
3833 if (!context->same_pgrp &&
3834 setsid() < 0) {
3835 *exit_status = EXIT_SETSID;
3836 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
3837 }
9e2f7c11 3838
1e22b5cd 3839 exec_context_tty_reset(context, params);
d35fbf6b 3840
c891efaf 3841 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 3842 const char *vc = params->confirm_spawn;
3b20f877
FB
3843 _cleanup_free_ char *cmdline = NULL;
3844
ee39ca20 3845 cmdline = exec_command_line(command->argv);
3b20f877 3846 if (!cmdline) {
0460aa5c 3847 *exit_status = EXIT_MEMORY;
12145637 3848 return log_oom();
3b20f877 3849 }
d35fbf6b 3850
eedf223a 3851 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
3852 if (r != CONFIRM_EXECUTE) {
3853 if (r == CONFIRM_PRETEND_SUCCESS) {
3854 *exit_status = EXIT_SUCCESS;
3855 return 0;
3856 }
ff0af2a1 3857 *exit_status = EXIT_CONFIRM;
0af07108
ZJS
3858 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ECANCELED),
3859 "Execution cancelled by the user");
d35fbf6b
DM
3860 }
3861 }
1a63a750 3862
d521916d
LP
3863 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3864 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3865 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3866 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3867 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3868 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3869 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3870 *exit_status = EXIT_MEMORY;
3871 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3872 }
3873
29206d46 3874 if (context->dynamic_user && dcreds) {
da50b85a 3875 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 3876
d521916d 3877 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
7802194a 3878 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here. */
409093fe
LP
3879 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3880 *exit_status = EXIT_USER;
12145637 3881 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
3882 }
3883
da50b85a
LP
3884 r = compile_suggested_paths(context, params, &suggested_paths);
3885 if (r < 0) {
3886 *exit_status = EXIT_MEMORY;
3887 return log_oom();
3888 }
3889
3890 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
3891 if (r < 0) {
3892 *exit_status = EXIT_USER;
d85ff944
YW
3893 if (r == -EILSEQ)
3894 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
3895 "Failed to update dynamic user credentials: User or group with specified name already exists.");
12145637 3896 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 3897 }
524daa8c 3898
70dd455c 3899 if (!uid_is_valid(uid)) {
29206d46 3900 *exit_status = EXIT_USER;
d85ff944 3901 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
3902 }
3903
3904 if (!gid_is_valid(gid)) {
3905 *exit_status = EXIT_USER;
d85ff944 3906 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(ESRCH), "GID validation failed for \""GID_FMT"\"", gid);
29206d46 3907 }
5bc7452b 3908
29206d46
LP
3909 if (dcreds->user)
3910 username = dcreds->user->name;
3911
3912 } else {
4d885bd3
DH
3913 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3914 if (r < 0) {
3915 *exit_status = EXIT_USER;
12145637 3916 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 3917 }
5bc7452b 3918
4d885bd3
DH
3919 r = get_fixed_group(context, &groupname, &gid);
3920 if (r < 0) {
3921 *exit_status = EXIT_GROUP;
12145637 3922 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 3923 }
cdc5d5c5 3924 }
29206d46 3925
cdc5d5c5
DH
3926 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3927 r = get_supplementary_groups(context, username, groupname, gid,
3928 &supplementary_gids, &ngids);
3929 if (r < 0) {
3930 *exit_status = EXIT_GROUP;
12145637 3931 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 3932 }
5bc7452b 3933
00d9ef85
LP
3934 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3935 if (r < 0) {
3936 *exit_status = EXIT_USER;
12145637 3937 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
3938 }
3939
3940 user_lookup_fd = safe_close(user_lookup_fd);
3941
6732edab
LP
3942 r = acquire_home(context, uid, &home, &home_buffer);
3943 if (r < 0) {
3944 *exit_status = EXIT_CHDIR;
12145637 3945 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
3946 }
3947
d35fbf6b
DM
3948 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3949 * must make sure to drop O_NONBLOCK */
3950 if (socket_fd >= 0)
a34ceba6 3951 (void) fd_nonblock(socket_fd, false);
acbb0225 3952
4c70a4a7
MS
3953 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3954 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3955 if (params->cgroup_path) {
3956 _cleanup_free_ char *p = NULL;
3957
3958 r = exec_parameters_get_cgroup_path(params, &p);
3959 if (r < 0) {
3960 *exit_status = EXIT_CGROUP;
3961 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3962 }
3963
3964 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3965 if (r < 0) {
3966 *exit_status = EXIT_CGROUP;
3967 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3968 }
3969 }
3970
a8d08f39 3971 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
54c2459d 3972 r = open_shareable_ns_path(runtime->netns_storage_socket, context->network_namespace_path, CLONE_NEWNET);
a8d08f39
LP
3973 if (r < 0) {
3974 *exit_status = EXIT_NETWORK;
3975 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3976 }
3977 }
3978
a70581ff
XR
3979 if (context->ipc_namespace_path && runtime && runtime->ipcns_storage_socket[0] >= 0) {
3980 r = open_shareable_ns_path(runtime->ipcns_storage_socket, context->ipc_namespace_path, CLONE_NEWIPC);
3981 if (r < 0) {
3982 *exit_status = EXIT_NAMESPACE;
3983 return log_unit_error_errno(unit, r, "Failed to open IPC namespace path %s: %m", context->ipc_namespace_path);
3984 }
3985 }
3986
52c239d7 3987 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
3988 if (r < 0) {
3989 *exit_status = EXIT_STDIN;
12145637 3990 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 3991 }
034c6ed7 3992
52c239d7 3993 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3994 if (r < 0) {
3995 *exit_status = EXIT_STDOUT;
12145637 3996 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
3997 }
3998
52c239d7 3999 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
4000 if (r < 0) {
4001 *exit_status = EXIT_STDERR;
12145637 4002 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
4003 }
4004
d35fbf6b 4005 if (context->oom_score_adjust_set) {
9f8168eb
LP
4006 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
4007 * prohibit write access to this file, and we shouldn't trip up over that. */
4008 r = set_oom_score_adjust(context->oom_score_adjust);
065b4774 4009 if (ERRNO_IS_PRIVILEGE(r))
f2341e0a 4010 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 4011 else if (r < 0) {
ff0af2a1 4012 *exit_status = EXIT_OOM_ADJUST;
12145637 4013 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 4014 }
d35fbf6b
DM
4015 }
4016
ad21e542
ZJS
4017 if (context->coredump_filter_set) {
4018 r = set_coredump_filter(context->coredump_filter);
4019 if (ERRNO_IS_PRIVILEGE(r))
4020 log_unit_debug_errno(unit, r, "Failed to adjust coredump_filter, ignoring: %m");
4021 else if (r < 0)
4022 return log_unit_error_errno(unit, r, "Failed to adjust coredump_filter: %m");
4023 }
4024
39090201
DJL
4025 if (context->nice_set) {
4026 r = setpriority_closest(context->nice);
4027 if (r < 0)
4028 return log_unit_error_errno(unit, r, "Failed to set up process scheduling priority (nice level): %m");
4029 }
613b411c 4030
d35fbf6b
DM
4031 if (context->cpu_sched_set) {
4032 struct sched_param param = {
4033 .sched_priority = context->cpu_sched_priority,
4034 };
4035
ff0af2a1
LP
4036 r = sched_setscheduler(0,
4037 context->cpu_sched_policy |
4038 (context->cpu_sched_reset_on_fork ?
4039 SCHED_RESET_ON_FORK : 0),
4040 &param);
4041 if (r < 0) {
4042 *exit_status = EXIT_SETSCHEDULER;
12145637 4043 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 4044 }
d35fbf6b 4045 }
fc9b2a84 4046
e2b2fb7f
MS
4047 if (context->cpu_affinity_from_numa || context->cpu_set.set) {
4048 _cleanup_(cpu_set_reset) CPUSet converted_cpu_set = {};
4049 const CPUSet *cpu_set;
4050
4051 if (context->cpu_affinity_from_numa) {
4052 r = exec_context_cpu_affinity_from_numa(context, &converted_cpu_set);
4053 if (r < 0) {
4054 *exit_status = EXIT_CPUAFFINITY;
4055 return log_unit_error_errno(unit, r, "Failed to derive CPU affinity mask from NUMA mask: %m");
4056 }
4057
4058 cpu_set = &converted_cpu_set;
4059 } else
4060 cpu_set = &context->cpu_set;
4061
4062 if (sched_setaffinity(0, cpu_set->allocated, cpu_set->set) < 0) {
ff0af2a1 4063 *exit_status = EXIT_CPUAFFINITY;
12145637 4064 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7 4065 }
e2b2fb7f 4066 }
034c6ed7 4067
b070c7c0
MS
4068 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
4069 r = apply_numa_policy(&context->numa_policy);
4070 if (r == -EOPNOTSUPP)
33fe9e3f 4071 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
4072 else if (r < 0) {
4073 *exit_status = EXIT_NUMA_POLICY;
4074 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
4075 }
4076 }
4077
d35fbf6b
DM
4078 if (context->ioprio_set)
4079 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 4080 *exit_status = EXIT_IOPRIO;
12145637 4081 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 4082 }
da726a4d 4083
d35fbf6b
DM
4084 if (context->timer_slack_nsec != NSEC_INFINITY)
4085 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 4086 *exit_status = EXIT_TIMERSLACK;
12145637 4087 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 4088 }
9eba9da4 4089
21022b9d
LP
4090 if (context->personality != PERSONALITY_INVALID) {
4091 r = safe_personality(context->personality);
4092 if (r < 0) {
ff0af2a1 4093 *exit_status = EXIT_PERSONALITY;
12145637 4094 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 4095 }
21022b9d 4096 }
94f04347 4097
d35fbf6b 4098 if (context->utmp_id)
df0ff127 4099 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
6a93917d 4100 context->tty_path,
023a4f67
LP
4101 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
4102 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
4103 USER_PROCESS,
6a93917d 4104 username);
d35fbf6b 4105
08f67696 4106 if (uid_is_valid(uid)) {
ff0af2a1
LP
4107 r = chown_terminal(STDIN_FILENO, uid);
4108 if (r < 0) {
4109 *exit_status = EXIT_STDIN;
12145637 4110 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 4111 }
d35fbf6b 4112 }
8e274523 4113
4e1dfa45 4114 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 4115 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 4116 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 4117 * touch a single hierarchy too. */
584b8688 4118 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 4119 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
4120 if (r < 0) {
4121 *exit_status = EXIT_CGROUP;
12145637 4122 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 4123 }
d35fbf6b 4124 }
034c6ed7 4125
5b10116e 4126 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
8679efde 4127 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
12145637
LP
4128 if (r < 0)
4129 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 4130 }
94f04347 4131
bb0c0d6f
LP
4132 if (FLAGS_SET(params->flags, EXEC_WRITE_CREDENTIALS)) {
4133 r = setup_credentials(context, params, unit->id, uid);
4134 if (r < 0) {
4135 *exit_status = EXIT_CREDENTIALS;
4136 return log_unit_error_errno(unit, r, "Failed to set up credentials: %m");
4137 }
4138 }
4139
7bce046b 4140 r = build_environment(
fd63e712 4141 unit,
7bce046b
LP
4142 context,
4143 params,
4144 n_fds,
4145 home,
4146 username,
4147 shell,
4148 journal_stream_dev,
4149 journal_stream_ino,
4150 &our_env);
2065ca69
JW
4151 if (r < 0) {
4152 *exit_status = EXIT_MEMORY;
12145637 4153 return log_oom();
2065ca69
JW
4154 }
4155
4156 r = build_pass_environment(context, &pass_env);
4157 if (r < 0) {
4158 *exit_status = EXIT_MEMORY;
12145637 4159 return log_oom();
2065ca69
JW
4160 }
4161
4162 accum_env = strv_env_merge(5,
4163 params->environment,
4164 our_env,
4165 pass_env,
4166 context->environment,
44e5d006 4167 files_env);
2065ca69
JW
4168 if (!accum_env) {
4169 *exit_status = EXIT_MEMORY;
12145637 4170 return log_oom();
2065ca69 4171 }
1280503b 4172 accum_env = strv_env_clean(accum_env);
2065ca69 4173
096424d1 4174 (void) umask(context->umask);
b213e1c1 4175
b1edf445 4176 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
4177 if (r < 0) {
4178 *exit_status = EXIT_KEYRING;
12145637 4179 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
4180 }
4181
165a31c0 4182 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 4183 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 4184
165a31c0
LP
4185 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
4186 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 4187
165a31c0
LP
4188 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
4189 if (needs_ambient_hack)
4190 needs_setuid = false;
4191 else
4192 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
4193
4194 if (needs_sandboxing) {
7f18ef0a
FK
4195 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
4196 * present. The actual MAC context application will happen later, as late as possible, to avoid
4197 * impacting our own code paths. */
4198
349cc4a5 4199#if HAVE_SELINUX
43b1f709 4200 use_selinux = mac_selinux_use();
7f18ef0a 4201#endif
f9fa32f0 4202#if ENABLE_SMACK
43b1f709 4203 use_smack = mac_smack_use();
7f18ef0a 4204#endif
349cc4a5 4205#if HAVE_APPARMOR
43b1f709 4206 use_apparmor = mac_apparmor_use();
7f18ef0a 4207#endif
165a31c0 4208 }
7f18ef0a 4209
ce932d2d
LP
4210 if (needs_sandboxing) {
4211 int which_failed;
4212
4213 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
4214 * is set here. (See below.) */
4215
4216 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
4217 if (r < 0) {
4218 *exit_status = EXIT_LIMITS;
4219 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
4220 }
4221 }
4222
0af07108 4223 if (needs_setuid && context->pam_name && username) {
ce932d2d
LP
4224 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
4225 * wins here. (See above.) */
4226
1da37e58 4227 /* All fds passed in the fds array will be closed in the pam child process. */
0af07108
ZJS
4228 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
4229 if (r < 0) {
4230 *exit_status = EXIT_PAM;
4231 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0 4232 }
ac45f971 4233
0af07108
ZJS
4234 ngids_after_pam = getgroups_alloc(&gids_after_pam);
4235 if (ngids_after_pam < 0) {
4236 *exit_status = EXIT_MEMORY;
4237 return log_unit_error_errno(unit, ngids_after_pam, "Failed to obtain groups after setting up PAM: %m");
5749f855 4238 }
b213e1c1 4239 }
5749f855 4240
0af07108 4241 if (needs_sandboxing && context->private_users && !have_effective_cap(CAP_SYS_ADMIN)) {
5749f855
AZ
4242 /* If we're unprivileged, set up the user namespace first to enable use of the other namespaces.
4243 * Users with CAP_SYS_ADMIN can set up user namespaces last because they will be able to
4244 * set up all of the other namespaces (i.e. network, mount, UTS) without a user namespace. */
0af07108
ZJS
4245
4246 userns_set_up = true;
4247 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4248 if (r < 0) {
4249 *exit_status = EXIT_USER;
4250 return log_unit_error_errno(unit, r, "Failed to set up user namespacing for unprivileged user: %m");
5749f855
AZ
4251 }
4252 }
4253
a8d08f39
LP
4254 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
4255
6e2d7c4f 4256 if (ns_type_supported(NAMESPACE_NET)) {
54c2459d 4257 r = setup_shareable_ns(runtime->netns_storage_socket, CLONE_NEWNET);
ee00d1e9
ZJS
4258 if (r == -EPERM)
4259 log_unit_warning_errno(unit, r,
4260 "PrivateNetwork=yes is configured, but network namespace setup failed, ignoring: %m");
4261 else if (r < 0) {
6e2d7c4f
MS
4262 *exit_status = EXIT_NETWORK;
4263 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
4264 }
a8d08f39
LP
4265 } else if (context->network_namespace_path) {
4266 *exit_status = EXIT_NETWORK;
ee00d1e9
ZJS
4267 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4268 "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
4269 } else
4270 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 4271 }
169c1bda 4272
a70581ff
XR
4273 if ((context->private_ipc || context->ipc_namespace_path) && runtime && runtime->ipcns_storage_socket[0] >= 0) {
4274
4275 if (ns_type_supported(NAMESPACE_IPC)) {
4276 r = setup_shareable_ns(runtime->ipcns_storage_socket, CLONE_NEWIPC);
4277 if (r == -EPERM)
4278 log_unit_warning_errno(unit, r,
4279 "PrivateIPC=yes is configured, but IPC namespace setup failed, ignoring: %m");
4280 else if (r < 0) {
4281 *exit_status = EXIT_NAMESPACE;
4282 return log_unit_error_errno(unit, r, "Failed to set up IPC namespacing: %m");
4283 }
4284 } else if (context->ipc_namespace_path) {
4285 *exit_status = EXIT_NAMESPACE;
4286 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP),
4287 "IPCNamespacePath= is not supported, refusing.");
4288 } else
4289 log_unit_warning(unit, "PrivateIPC=yes is configured, but the kernel does not support IPC namespaces, ignoring.");
4290 }
4291
ee818b89 4292 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
ee818b89 4293 if (needs_mount_namespace) {
7cc5ef5f
ZJS
4294 _cleanup_free_ char *error_path = NULL;
4295
9f71ba8d 4296 r = apply_mount_namespace(unit, command->flags, context, params, runtime, &error_path);
3fbe8dbe
LP
4297 if (r < 0) {
4298 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
4299 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
4300 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 4301 }
d35fbf6b 4302 }
81a2b7ce 4303
daf8f72b
LP
4304 if (needs_sandboxing) {
4305 r = apply_protect_hostname(unit, context, exit_status);
4306 if (r < 0)
4307 return r;
aecd5ac6
TM
4308 }
4309
5749f855
AZ
4310 /* Drop groups as early as possible.
4311 * This needs to be done after PrivateDevices=y setup as device nodes should be owned by the host's root.
4312 * For non-root in a userns, devices will be owned by the user/group before the group change, and nobody. */
165a31c0 4313 if (needs_setuid) {
afb11bf1
DG
4314 _cleanup_free_ gid_t *gids_to_enforce = NULL;
4315 int ngids_to_enforce = 0;
4316
4317 ngids_to_enforce = merge_gid_lists(supplementary_gids,
4318 ngids,
4319 gids_after_pam,
4320 ngids_after_pam,
4321 &gids_to_enforce);
4322 if (ngids_to_enforce < 0) {
4323 *exit_status = EXIT_MEMORY;
4324 return log_unit_error_errno(unit,
4325 ngids_to_enforce,
4326 "Failed to merge group lists. Group membership might be incorrect: %m");
4327 }
4328
4329 r = enforce_groups(gid, gids_to_enforce, ngids_to_enforce);
096424d1
LP
4330 if (r < 0) {
4331 *exit_status = EXIT_GROUP;
12145637 4332 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 4333 }
165a31c0 4334 }
096424d1 4335
5749f855
AZ
4336 /* If the user namespace was not set up above, try to do it now.
4337 * It's preferred to set up the user namespace later (after all other namespaces) so as not to be
4338 * restricted by rules pertaining to combining user namespaces with other namespaces (e.g. in the
4339 * case of mount namespaces being less privileged when the mount point list is copied from a
4340 * different user namespace). */
9008e1ac 4341
5749f855
AZ
4342 if (needs_sandboxing && context->private_users && !userns_set_up) {
4343 r = setup_private_users(saved_uid, saved_gid, uid, gid);
4344 if (r < 0) {
4345 *exit_status = EXIT_USER;
4346 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
d251207d
LP
4347 }
4348 }
4349
9f71ba8d
ZJS
4350 /* Now that the mount namespace has been set up and privileges adjusted, let's look for the thing we
4351 * shall execute. */
4352
4353 _cleanup_free_ char *executable = NULL;
b83d5050
ZJS
4354 _cleanup_close_ int executable_fd = -1;
4355 r = find_executable_full(command->path, false, &executable, &executable_fd);
9f71ba8d
ZJS
4356 if (r < 0) {
4357 if (r != -ENOMEM && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
c2503e35
RH
4358 log_unit_struct_errno(unit, LOG_INFO, r,
4359 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4360 LOG_UNIT_INVOCATION_ID(unit),
4361 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
4362 command->path),
4363 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4364 return 0;
4365 }
4366
4367 *exit_status = EXIT_EXEC;
c2503e35
RH
4368
4369 return log_unit_struct_errno(unit, LOG_INFO, r,
4370 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4371 LOG_UNIT_INVOCATION_ID(unit),
4372 LOG_UNIT_MESSAGE(unit, "Failed to locate executable %s: %m",
4373 command->path),
4374 "EXECUTABLE=%s", command->path);
9f71ba8d
ZJS
4375 }
4376
b83d5050
ZJS
4377 r = add_shifted_fd(keep_fds, ELEMENTSOF(keep_fds), &n_keep_fds, executable_fd, &executable_fd);
4378 if (r < 0) {
4379 *exit_status = EXIT_FDS;
4380 return log_unit_error_errno(unit, r, "Failed to shift fd and set FD_CLOEXEC: %m");
4381 }
4382
9f71ba8d 4383#if HAVE_SELINUX
49590d67
MS
4384 if (needs_sandboxing && use_selinux && params->selinux_context_net) {
4385 int fd = -1;
4386
4387 if (socket_fd >= 0)
4388 fd = socket_fd;
4389 else if (params->n_socket_fds == 1)
4390 /* If stdin is not connected to a socket but we are triggered by exactly one socket unit then we
4391 * use context from that fd to compute the label. */
4392 fd = params->fds[0];
4393
4394 if (fd >= 0) {
4395 r = mac_selinux_get_child_mls_label(fd, executable, context->selinux_context, &mac_selinux_context_net);
4396 if (r < 0) {
4397 *exit_status = EXIT_SELINUX_CONTEXT;
4398 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
4399 }
9f71ba8d
ZJS
4400 }
4401 }
4402#endif
4403
165a31c0 4404 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
a70581ff 4405 * more aggressive this time since socket_fd and the netns and ipcns fds we don't need anymore. We do keep the exec_fd
5686391b
LP
4406 * however if we have it as we want to keep it open until the final execve(). */
4407
1da37e58 4408 r = close_all_fds(keep_fds, n_keep_fds);
ff0af2a1
LP
4409 if (r >= 0)
4410 r = shift_fds(fds, n_fds);
4411 if (r >= 0)
25b583d7 4412 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
4413 if (r < 0) {
4414 *exit_status = EXIT_FDS;
12145637 4415 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 4416 }
e66cf1a3 4417
5686391b
LP
4418 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
4419 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
4420 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
4421 * came this far. */
4422
165a31c0 4423 secure_bits = context->secure_bits;
e66cf1a3 4424
165a31c0
LP
4425 if (needs_sandboxing) {
4426 uint64_t bset;
e66cf1a3 4427
ce932d2d
LP
4428 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
4429 * requested. (Note this is placed after the general resource limit initialization, see
4430 * above, in order to take precedence.) */
f4170c67
LP
4431 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
4432 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
4433 *exit_status = EXIT_LIMITS;
12145637 4434 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
4435 }
4436 }
4437
37ac2744
JB
4438#if ENABLE_SMACK
4439 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
4440 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
4441 if (use_smack) {
b83d5050 4442 r = setup_smack(context, executable_fd);
37ac2744
JB
4443 if (r < 0) {
4444 *exit_status = EXIT_SMACK_PROCESS_LABEL;
4445 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
4446 }
4447 }
4448#endif
4449
165a31c0
LP
4450 bset = context->capability_bounding_set;
4451 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
4452 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
4453 * instead of us doing that */
4454 if (needs_ambient_hack)
4455 bset |= (UINT64_C(1) << CAP_SETPCAP) |
4456 (UINT64_C(1) << CAP_SETUID) |
4457 (UINT64_C(1) << CAP_SETGID);
4458
4459 if (!cap_test_all(bset)) {
4460 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
4461 if (r < 0) {
4462 *exit_status = EXIT_CAPABILITIES;
12145637 4463 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 4464 }
4c2630eb 4465 }
3b8bddde 4466
16fcb191
TK
4467 /* Ambient capabilities are cleared during setresuid() (in enforce_user()) even with
4468 * keep-caps set.
4469 * To be able to raise the ambient capabilities after setresuid() they have to be
4470 * added to the inherited set and keep caps has to be set (done in enforce_user()).
4471 * After setresuid() the ambient capabilities can be raised as they are present in
4472 * the permitted and inheritable set. However it is possible that someone wants to
4473 * set ambient capabilities without changing the user, so we also set the ambient
4474 * capabilities here.
4475 * The requested ambient capabilities are raised in the inheritable set if the
4476 * second argument is true. */
943800f4 4477 if (!needs_ambient_hack) {
755d4b67
IP
4478 r = capability_ambient_set_apply(context->capability_ambient_set, true);
4479 if (r < 0) {
4480 *exit_status = EXIT_CAPABILITIES;
12145637 4481 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 4482 }
755d4b67 4483 }
165a31c0 4484 }
755d4b67 4485
fa97f630
JB
4486 /* chroot to root directory first, before we lose the ability to chroot */
4487 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
4488 if (r < 0)
4489 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
4490
165a31c0 4491 if (needs_setuid) {
08f67696 4492 if (uid_is_valid(uid)) {
ff0af2a1
LP
4493 r = enforce_user(context, uid);
4494 if (r < 0) {
4495 *exit_status = EXIT_USER;
12145637 4496 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 4497 }
165a31c0
LP
4498
4499 if (!needs_ambient_hack &&
4500 context->capability_ambient_set != 0) {
755d4b67 4501
16fcb191 4502 /* Raise the ambient capabilities after user change. */
755d4b67
IP
4503 r = capability_ambient_set_apply(context->capability_ambient_set, false);
4504 if (r < 0) {
4505 *exit_status = EXIT_CAPABILITIES;
12145637 4506 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67 4507 }
755d4b67 4508 }
5b6319dc 4509 }
165a31c0 4510 }
d35fbf6b 4511
56ef8db9
JB
4512 /* Apply working directory here, because the working directory might be on NFS and only the user running
4513 * this service might have the correct privilege to change to the working directory */
fa97f630 4514 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
4515 if (r < 0)
4516 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
4517
165a31c0 4518 if (needs_sandboxing) {
37ac2744 4519 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
4520 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
4521 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
4522 * are restricted. */
4523
349cc4a5 4524#if HAVE_SELINUX
43b1f709 4525 if (use_selinux) {
5cd9cd35
LP
4526 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
4527
4528 if (exec_context) {
4529 r = setexeccon(exec_context);
4530 if (r < 0) {
4531 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 4532 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
4533 }
4534 }
4535 }
4536#endif
4537
349cc4a5 4538#if HAVE_APPARMOR
43b1f709 4539 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
4540 r = aa_change_onexec(context->apparmor_profile);
4541 if (r < 0 && !context->apparmor_profile_ignore) {
4542 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 4543 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
4544 }
4545 }
4546#endif
4547
165a31c0 4548 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
dbdc4098
TK
4549 * we'll try not to call PR_SET_SECUREBITS unless necessary. Setting securebits requires
4550 * CAP_SETPCAP. */
4551 if (prctl(PR_GET_SECUREBITS) != secure_bits) {
69e3234d 4552 /* CAP_SETPCAP is required to set securebits. This capability is raised into the
dbdc4098
TK
4553 * effective set here.
4554 * The effective set is overwritten during execve with the following values:
4555 * - ambient set (for non-root processes)
4556 * - (inheritable | bounding) set (for root processes)
4557 *
4558 * Hence there is no security impact to raise it in the effective set before execve
4559 */
4560 r = capability_gain_cap_setpcap(NULL);
4561 if (r < 0) {
4562 *exit_status = EXIT_CAPABILITIES;
4563 return log_unit_error_errno(unit, r, "Failed to gain CAP_SETPCAP for setting secure bits");
4564 }
755d4b67 4565 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 4566 *exit_status = EXIT_SECUREBITS;
12145637 4567 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 4568 }
dbdc4098 4569 }
5b6319dc 4570
59eeb84b 4571 if (context_has_no_new_privileges(context))
d35fbf6b 4572 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 4573 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 4574 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
4575 }
4576
349cc4a5 4577#if HAVE_SECCOMP
469830d1
LP
4578 r = apply_address_families(unit, context);
4579 if (r < 0) {
4580 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 4581 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 4582 }
04aa0cb9 4583
469830d1
LP
4584 r = apply_memory_deny_write_execute(unit, context);
4585 if (r < 0) {
4586 *exit_status = EXIT_SECCOMP;
12145637 4587 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 4588 }
f4170c67 4589
469830d1
LP
4590 r = apply_restrict_realtime(unit, context);
4591 if (r < 0) {
4592 *exit_status = EXIT_SECCOMP;
12145637 4593 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
4594 }
4595
f69567cb
LP
4596 r = apply_restrict_suid_sgid(unit, context);
4597 if (r < 0) {
4598 *exit_status = EXIT_SECCOMP;
4599 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
4600 }
4601
add00535
LP
4602 r = apply_restrict_namespaces(unit, context);
4603 if (r < 0) {
4604 *exit_status = EXIT_SECCOMP;
12145637 4605 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
4606 }
4607
469830d1
LP
4608 r = apply_protect_sysctl(unit, context);
4609 if (r < 0) {
4610 *exit_status = EXIT_SECCOMP;
12145637 4611 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
4612 }
4613
469830d1
LP
4614 r = apply_protect_kernel_modules(unit, context);
4615 if (r < 0) {
4616 *exit_status = EXIT_SECCOMP;
12145637 4617 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
4618 }
4619
84703040
KK
4620 r = apply_protect_kernel_logs(unit, context);
4621 if (r < 0) {
4622 *exit_status = EXIT_SECCOMP;
4623 return log_unit_error_errno(unit, r, "Failed to apply kernel log restrictions: %m");
4624 }
4625
fc64760d
KK
4626 r = apply_protect_clock(unit, context);
4627 if (r < 0) {
4628 *exit_status = EXIT_SECCOMP;
4629 return log_unit_error_errno(unit, r, "Failed to apply clock restrictions: %m");
4630 }
4631
469830d1
LP
4632 r = apply_private_devices(unit, context);
4633 if (r < 0) {
4634 *exit_status = EXIT_SECCOMP;
12145637 4635 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
4636 }
4637
4638 r = apply_syscall_archs(unit, context);
4639 if (r < 0) {
4640 *exit_status = EXIT_SECCOMP;
12145637 4641 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
4642 }
4643
78e864e5
TM
4644 r = apply_lock_personality(unit, context);
4645 if (r < 0) {
4646 *exit_status = EXIT_SECCOMP;
12145637 4647 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
4648 }
4649
9df2cdd8
TM
4650 r = apply_syscall_log(unit, context);
4651 if (r < 0) {
4652 *exit_status = EXIT_SECCOMP;
4653 return log_unit_error_errno(unit, r, "Failed to apply system call log filters: %m");
4654 }
4655
5cd9cd35
LP
4656 /* This really should remain the last step before the execve(), to make sure our own code is affected
4657 * by the filter as little as possible. */
165a31c0 4658 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
4659 if (r < 0) {
4660 *exit_status = EXIT_SECCOMP;
12145637 4661 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
4662 }
4663#endif
d35fbf6b 4664 }
034c6ed7 4665
00819cc1
LP
4666 if (!strv_isempty(context->unset_environment)) {
4667 char **ee = NULL;
4668
4669 ee = strv_env_delete(accum_env, 1, context->unset_environment);
4670 if (!ee) {
4671 *exit_status = EXIT_MEMORY;
12145637 4672 return log_oom();
00819cc1
LP
4673 }
4674
130d3d22 4675 strv_free_and_replace(accum_env, ee);
00819cc1
LP
4676 }
4677
7ca69792
AZ
4678 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
4679 replaced_argv = replace_env_argv(command->argv, accum_env);
4680 if (!replaced_argv) {
4681 *exit_status = EXIT_MEMORY;
4682 return log_oom();
4683 }
4684 final_argv = replaced_argv;
4685 } else
4686 final_argv = command->argv;
034c6ed7 4687
f1d34068 4688 if (DEBUG_LOGGING) {
c2b2df60 4689 _cleanup_free_ char *line = NULL;
81a2b7ce 4690
d35fbf6b 4691 line = exec_command_line(final_argv);
a1230ff9 4692 if (line)
c2503e35
RH
4693 log_unit_struct(unit, LOG_DEBUG,
4694 "EXECUTABLE=%s", executable,
4695 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
4696 LOG_UNIT_INVOCATION_ID(unit));
d35fbf6b 4697 }
dd305ec9 4698
5686391b
LP
4699 if (exec_fd >= 0) {
4700 uint8_t hot = 1;
4701
4702 /* We have finished with all our initializations. Let's now let the manager know that. From this point
4703 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
4704
4705 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4706 *exit_status = EXIT_EXEC;
4707 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
4708 }
4709 }
4710
a6d9111c 4711 r = fexecve_or_execve(executable_fd, executable, final_argv, accum_env);
5686391b
LP
4712
4713 if (exec_fd >= 0) {
4714 uint8_t hot = 0;
4715
4716 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
4717 * that POLLHUP on it no longer means execve() succeeded. */
4718
4719 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
4720 *exit_status = EXIT_EXEC;
4721 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
4722 }
4723 }
12145637 4724
ff0af2a1 4725 *exit_status = EXIT_EXEC;
9f71ba8d 4726 return log_unit_error_errno(unit, r, "Failed to execute %s: %m", executable);
d35fbf6b 4727}
81a2b7ce 4728
34cf6c43 4729static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 4730static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 4731
f2341e0a
LP
4732int exec_spawn(Unit *unit,
4733 ExecCommand *command,
d35fbf6b
DM
4734 const ExecContext *context,
4735 const ExecParameters *params,
4736 ExecRuntime *runtime,
29206d46 4737 DynamicCreds *dcreds,
d35fbf6b 4738 pid_t *ret) {
8351ceae 4739
ee39ca20 4740 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 4741 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 4742 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 4743 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 4744 _cleanup_free_ char *line = NULL;
d35fbf6b 4745 pid_t pid;
8351ceae 4746
f2341e0a 4747 assert(unit);
d35fbf6b
DM
4748 assert(command);
4749 assert(context);
4750 assert(ret);
4751 assert(params);
25b583d7 4752 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 4753
d35fbf6b
DM
4754 if (context->std_input == EXEC_INPUT_SOCKET ||
4755 context->std_output == EXEC_OUTPUT_SOCKET ||
4756 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 4757
d85ff944
YW
4758 if (params->n_socket_fds > 1)
4759 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got more than one socket.");
eef65bf3 4760
d85ff944
YW
4761 if (params->n_socket_fds == 0)
4762 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EINVAL), "Got no socket.");
488ab41c 4763
d35fbf6b
DM
4764 socket_fd = params->fds[0];
4765 } else {
4766 socket_fd = -1;
4767 fds = params->fds;
9b141911 4768 n_socket_fds = params->n_socket_fds;
25b583d7 4769 n_storage_fds = params->n_storage_fds;
d35fbf6b 4770 }
94f04347 4771
34cf6c43 4772 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
4773 if (r < 0)
4774 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
4775
f2341e0a 4776 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 4777 if (r < 0)
f2341e0a 4778 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 4779
ee39ca20 4780 line = exec_command_line(command->argv);
d35fbf6b
DM
4781 if (!line)
4782 return log_oom();
fab56fc5 4783
9f71ba8d
ZJS
4784 /* Fork with up-to-date SELinux label database, so the child inherits the up-to-date db
4785 and, until the next SELinux policy changes, we save further reloads in future children. */
2df2152c
CG
4786 mac_selinux_maybe_reload();
4787
c2503e35
RH
4788 log_unit_struct(unit, LOG_DEBUG,
4789 LOG_UNIT_MESSAGE(unit, "About to execute %s", line),
4790 "EXECUTABLE=%s", command->path, /* We won't know the real executable path until we create
4791 the mount namespace in the child, but we want to log
4792 from the parent, so we need to use the (possibly
4793 inaccurate) path here. */
4794 LOG_UNIT_INVOCATION_ID(unit));
12145637 4795
78f93209
LP
4796 if (params->cgroup_path) {
4797 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
4798 if (r < 0)
4799 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
4800 if (r > 0) { /* We are using a child cgroup */
4801 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
4802 if (r < 0)
4803 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
4e806bfa
AZ
4804
4805 /* Normally we would not propagate the oomd xattrs to children but since we created this
4806 * sub-cgroup internally we should do it. */
4807 cgroup_oomd_xattr_apply(unit, subcgroup_path);
78f93209
LP
4808 }
4809 }
4810
d35fbf6b
DM
4811 pid = fork();
4812 if (pid < 0)
74129a12 4813 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
4814
4815 if (pid == 0) {
12145637 4816 int exit_status = EXIT_SUCCESS;
ff0af2a1 4817
f2341e0a
LP
4818 r = exec_child(unit,
4819 command,
ff0af2a1
LP
4820 context,
4821 params,
4822 runtime,
29206d46 4823 dcreds,
ff0af2a1 4824 socket_fd,
52c239d7 4825 named_iofds,
4c47affc 4826 fds,
9b141911 4827 n_socket_fds,
25b583d7 4828 n_storage_fds,
ff0af2a1 4829 files_env,
00d9ef85 4830 unit->manager->user_lookup_fds[1],
12145637
LP
4831 &exit_status);
4832
e1714f02
ZJS
4833 if (r < 0) {
4834 const char *status =
4835 exit_status_to_string(exit_status,
e04ed6db 4836 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 4837
c2503e35
RH
4838 log_unit_struct_errno(unit, LOG_ERR, r,
4839 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
4840 LOG_UNIT_INVOCATION_ID(unit),
4841 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
4842 status, command->path),
4843 "EXECUTABLE=%s", command->path);
e1714f02 4844 }
4c2630eb 4845
ff0af2a1 4846 _exit(exit_status);
034c6ed7
LP
4847 }
4848
f2341e0a 4849 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 4850
78f93209
LP
4851 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
4852 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
4853 * process will be killed too). */
4854 if (subcgroup_path)
4855 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 4856
b58b4116 4857 exec_status_start(&command->exec_status, pid);
9fb86720 4858
034c6ed7 4859 *ret = pid;
5cb5a6ff
LP
4860 return 0;
4861}
4862
034c6ed7
LP
4863void exec_context_init(ExecContext *c) {
4864 assert(c);
4865
4c12626c 4866 c->umask = 0022;
9eba9da4 4867 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
94f04347 4868 c->cpu_sched_policy = SCHED_OTHER;
071830ff 4869 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 4870 c->syslog_level_prefix = true;
353e12c2 4871 c->ignore_sigpipe = true;
3a43da28 4872 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 4873 c->personality = PERSONALITY_INVALID;
5b10116e
ZJS
4874 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4875 c->directories[t].mode = 0755;
12213aed 4876 c->timeout_clean_usec = USEC_INFINITY;
a103496c 4877 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
4878 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
4879 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 4880 c->log_level_max = -1;
005bfaf1
TM
4881#if HAVE_SECCOMP
4882 c->syscall_errno = SECCOMP_ERROR_NUMBER_KILL;
4883#endif
b070c7c0 4884 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
4885}
4886
613b411c 4887void exec_context_done(ExecContext *c) {
5cb5a6ff
LP
4888 assert(c);
4889
6796073e
LP
4890 c->environment = strv_free(c->environment);
4891 c->environment_files = strv_free(c->environment_files);
b4c14404 4892 c->pass_environment = strv_free(c->pass_environment);
00819cc1 4893 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 4894
31ce987c 4895 rlimit_free_all(c->rlimit);
034c6ed7 4896
5b10116e 4897 for (size_t l = 0; l < 3; l++) {
52c239d7 4898 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
4899 c->stdio_file[l] = mfree(c->stdio_file[l]);
4900 }
52c239d7 4901
a1e58e8e
LP
4902 c->working_directory = mfree(c->working_directory);
4903 c->root_directory = mfree(c->root_directory);
915e6d16 4904 c->root_image = mfree(c->root_image);
18d73705 4905 c->root_image_options = mount_options_free_all(c->root_image_options);
0389f4fa
LB
4906 c->root_hash = mfree(c->root_hash);
4907 c->root_hash_size = 0;
4908 c->root_hash_path = mfree(c->root_hash_path);
d4d55b0d
LB
4909 c->root_hash_sig = mfree(c->root_hash_sig);
4910 c->root_hash_sig_size = 0;
4911 c->root_hash_sig_path = mfree(c->root_hash_sig_path);
0389f4fa 4912 c->root_verity = mfree(c->root_verity);
93f59701 4913 c->extension_images = mount_image_free_many(c->extension_images, &c->n_extension_images);
a1e58e8e
LP
4914 c->tty_path = mfree(c->tty_path);
4915 c->syslog_identifier = mfree(c->syslog_identifier);
4916 c->user = mfree(c->user);
4917 c->group = mfree(c->group);
034c6ed7 4918
6796073e 4919 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 4920
a1e58e8e 4921 c->pam_name = mfree(c->pam_name);
5b6319dc 4922
2a624c36
AP
4923 c->read_only_paths = strv_free(c->read_only_paths);
4924 c->read_write_paths = strv_free(c->read_write_paths);
4925 c->inaccessible_paths = strv_free(c->inaccessible_paths);
ddc155b2
TM
4926 c->exec_paths = strv_free(c->exec_paths);
4927 c->no_exec_paths = strv_free(c->no_exec_paths);
82c121a4 4928
d2d6c096 4929 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
4930 c->bind_mounts = NULL;
4931 c->n_bind_mounts = 0;
2abd4e38
YW
4932 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
4933 c->temporary_filesystems = NULL;
4934 c->n_temporary_filesystems = 0;
b3d13314 4935 c->mount_images = mount_image_free_many(c->mount_images, &c->n_mount_images);
d2d6c096 4936
0985c7c4 4937 cpu_set_reset(&c->cpu_set);
b070c7c0 4938 numa_policy_reset(&c->numa_policy);
86a3475b 4939
a1e58e8e
LP
4940 c->utmp_id = mfree(c->utmp_id);
4941 c->selinux_context = mfree(c->selinux_context);
4942 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 4943 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 4944
8cfa775f 4945 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
4946 c->syscall_archs = set_free(c->syscall_archs);
4947 c->address_families = set_free(c->address_families);
e66cf1a3 4948
5b10116e
ZJS
4949 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4950 c->directories[t].paths = strv_free(c->directories[t].paths);
d3070fbd
LP
4951
4952 c->log_level_max = -1;
4953
4954 exec_context_free_log_extra_fields(c);
08f3be7a 4955
5ac1530e
ZJS
4956 c->log_ratelimit_interval_usec = 0;
4957 c->log_ratelimit_burst = 0;
90fc172e 4958
08f3be7a
LP
4959 c->stdin_data = mfree(c->stdin_data);
4960 c->stdin_data_size = 0;
a8d08f39
LP
4961
4962 c->network_namespace_path = mfree(c->network_namespace_path);
71d1e583 4963 c->ipc_namespace_path = mfree(c->ipc_namespace_path);
91dd5f7c
LP
4964
4965 c->log_namespace = mfree(c->log_namespace);
bb0c0d6f 4966
43144be4 4967 c->load_credentials = hashmap_free(c->load_credentials);
bb0c0d6f 4968 c->set_credentials = hashmap_free(c->set_credentials);
e66cf1a3
LP
4969}
4970
34cf6c43 4971int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
4972 char **i;
4973
4974 assert(c);
4975
4976 if (!runtime_prefix)
4977 return 0;
4978
3536f49e 4979 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
c2b2df60 4980 _cleanup_free_ char *p = NULL;
e66cf1a3 4981
494d0247
YW
4982 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4983 p = path_join(runtime_prefix, "private", *i);
4984 else
4985 p = path_join(runtime_prefix, *i);
e66cf1a3
LP
4986 if (!p)
4987 return -ENOMEM;
4988
7bc4bf4a
LP
4989 /* We execute this synchronously, since we need to be sure this is gone when we start the
4990 * service next. */
c6878637 4991 (void) rm_rf(p, REMOVE_ROOT);
e66cf1a3
LP
4992 }
4993
4994 return 0;
5cb5a6ff
LP
4995}
4996
bb0c0d6f
LP
4997int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
4998 _cleanup_free_ char *p = NULL;
4999
5000 assert(c);
5001
5002 if (!runtime_prefix || !unit)
5003 return 0;
5004
5005 p = path_join(runtime_prefix, "credentials", unit);
5006 if (!p)
5007 return -ENOMEM;
5008
5009 /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
5010 * unmount it, and afterwards remove the mount point */
5011 (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
5012 (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
5013
5014 return 0;
5015}
5016
34cf6c43 5017static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
5018 assert(c);
5019
a1e58e8e 5020 c->path = mfree(c->path);
6796073e 5021 c->argv = strv_free(c->argv);
43d0fcbd
LP
5022}
5023
da6053d0 5024void exec_command_done_array(ExecCommand *c, size_t n) {
fe96c0f8 5025 for (size_t i = 0; i < n; i++)
43d0fcbd
LP
5026 exec_command_done(c+i);
5027}
5028
f1acf85a 5029ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
5030 ExecCommand *i;
5031
5032 while ((i = c)) {
71fda00f 5033 LIST_REMOVE(command, c, i);
43d0fcbd 5034 exec_command_done(i);
5cb5a6ff
LP
5035 free(i);
5036 }
f1acf85a
ZJS
5037
5038 return NULL;
5cb5a6ff
LP
5039}
5040
da6053d0 5041void exec_command_free_array(ExecCommand **c, size_t n) {
5b10116e 5042 for (size_t i = 0; i < n; i++)
f1acf85a 5043 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
5044}
5045
6a1d4d9f 5046void exec_command_reset_status_array(ExecCommand *c, size_t n) {
5b10116e 5047 for (size_t i = 0; i < n; i++)
6a1d4d9f
LP
5048 exec_status_reset(&c[i].exec_status);
5049}
5050
5051void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
5b10116e 5052 for (size_t i = 0; i < n; i++) {
6a1d4d9f
LP
5053 ExecCommand *z;
5054
5055 LIST_FOREACH(command, z, c[i])
5056 exec_status_reset(&z->exec_status);
5057 }
5058}
5059
039f0e70 5060typedef struct InvalidEnvInfo {
34cf6c43 5061 const Unit *unit;
039f0e70
LP
5062 const char *path;
5063} InvalidEnvInfo;
5064
5065static void invalid_env(const char *p, void *userdata) {
5066 InvalidEnvInfo *info = userdata;
5067
f2341e0a 5068 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
5069}
5070
52c239d7
LB
5071const char* exec_context_fdname(const ExecContext *c, int fd_index) {
5072 assert(c);
5073
5074 switch (fd_index) {
5073ff6b 5075
52c239d7
LB
5076 case STDIN_FILENO:
5077 if (c->std_input != EXEC_INPUT_NAMED_FD)
5078 return NULL;
5073ff6b 5079
52c239d7 5080 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 5081
52c239d7
LB
5082 case STDOUT_FILENO:
5083 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
5084 return NULL;
5073ff6b 5085
52c239d7 5086 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 5087
52c239d7
LB
5088 case STDERR_FILENO:
5089 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
5090 return NULL;
5073ff6b 5091
52c239d7 5092 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 5093
52c239d7
LB
5094 default:
5095 return NULL;
5096 }
5097}
5098
2caa38e9
LP
5099static int exec_context_named_iofds(
5100 const ExecContext *c,
5101 const ExecParameters *p,
5102 int named_iofds[static 3]) {
5103
5b10116e 5104 size_t targets;
56fbd561 5105 const char* stdio_fdname[3];
da6053d0 5106 size_t n_fds;
52c239d7
LB
5107
5108 assert(c);
5109 assert(p);
2caa38e9 5110 assert(named_iofds);
52c239d7
LB
5111
5112 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
5113 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
5114 (c->std_error == EXEC_OUTPUT_NAMED_FD);
5115
5b10116e 5116 for (size_t i = 0; i < 3; i++)
52c239d7
LB
5117 stdio_fdname[i] = exec_context_fdname(c, i);
5118
4c47affc
FB
5119 n_fds = p->n_storage_fds + p->n_socket_fds;
5120
5b10116e 5121 for (size_t i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
5122 if (named_iofds[STDIN_FILENO] < 0 &&
5123 c->std_input == EXEC_INPUT_NAMED_FD &&
5124 stdio_fdname[STDIN_FILENO] &&
5125 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
5126
52c239d7
LB
5127 named_iofds[STDIN_FILENO] = p->fds[i];
5128 targets--;
56fbd561
ZJS
5129
5130 } else if (named_iofds[STDOUT_FILENO] < 0 &&
5131 c->std_output == EXEC_OUTPUT_NAMED_FD &&
5132 stdio_fdname[STDOUT_FILENO] &&
5133 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
5134
52c239d7
LB
5135 named_iofds[STDOUT_FILENO] = p->fds[i];
5136 targets--;
56fbd561
ZJS
5137
5138 } else if (named_iofds[STDERR_FILENO] < 0 &&
5139 c->std_error == EXEC_OUTPUT_NAMED_FD &&
5140 stdio_fdname[STDERR_FILENO] &&
5141 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
5142
52c239d7
LB
5143 named_iofds[STDERR_FILENO] = p->fds[i];
5144 targets--;
5145 }
5146
56fbd561 5147 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
5148}
5149
34cf6c43 5150static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
5151 char **i, **r = NULL;
5152
5153 assert(c);
5154 assert(l);
5155
5156 STRV_FOREACH(i, c->environment_files) {
5157 char *fn;
52511fae 5158 int k;
8c7be95e
LP
5159 bool ignore = false;
5160 char **p;
7fd1b19b 5161 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
5162
5163 fn = *i;
5164
5165 if (fn[0] == '-') {
5166 ignore = true;
313cefa1 5167 fn++;
8c7be95e
LP
5168 }
5169
5170 if (!path_is_absolute(fn)) {
8c7be95e
LP
5171 if (ignore)
5172 continue;
5173
5174 strv_free(r);
5175 return -EINVAL;
5176 }
5177
2bef10ab 5178 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
5179 k = safe_glob(fn, 0, &pglob);
5180 if (k < 0) {
2bef10ab
PL
5181 if (ignore)
5182 continue;
8c7be95e 5183
2bef10ab 5184 strv_free(r);
d8c92e8b 5185 return k;
2bef10ab 5186 }
8c7be95e 5187
d8c92e8b
ZJS
5188 /* When we don't match anything, -ENOENT should be returned */
5189 assert(pglob.gl_pathc > 0);
5190
5b10116e 5191 for (unsigned n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 5192 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
5193 if (k < 0) {
5194 if (ignore)
5195 continue;
8c7be95e 5196
2bef10ab 5197 strv_free(r);
2bef10ab 5198 return k;
e9c1ea9d 5199 }
ebc05a09 5200 /* Log invalid environment variables with filename */
039f0e70
LP
5201 if (p) {
5202 InvalidEnvInfo info = {
f2341e0a 5203 .unit = unit,
039f0e70
LP
5204 .path = pglob.gl_pathv[n]
5205 };
5206
5207 p = strv_env_clean_with_callback(p, invalid_env, &info);
5208 }
8c7be95e 5209
234519ae 5210 if (!r)
2bef10ab
PL
5211 r = p;
5212 else {
5213 char **m;
8c7be95e 5214
2bef10ab
PL
5215 m = strv_env_merge(2, r, p);
5216 strv_free(r);
5217 strv_free(p);
c84a9488 5218 if (!m)
2bef10ab 5219 return -ENOMEM;
2bef10ab
PL
5220
5221 r = m;
5222 }
8c7be95e
LP
5223 }
5224 }
5225
5226 *l = r;
5227
5228 return 0;
5229}
5230
6ac8fdc9 5231static bool tty_may_match_dev_console(const char *tty) {
7b912648 5232 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 5233
1e22b5cd
LP
5234 if (!tty)
5235 return true;
5236
a119ec7c 5237 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
5238
5239 /* trivial identity? */
5240 if (streq(tty, "console"))
5241 return true;
5242
7b912648
LP
5243 if (resolve_dev_console(&resolved) < 0)
5244 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
5245
5246 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 5247 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
5248}
5249
6c0ae739
LP
5250static bool exec_context_may_touch_tty(const ExecContext *ec) {
5251 assert(ec);
1e22b5cd 5252
6c0ae739 5253 return ec->tty_reset ||
1e22b5cd
LP
5254 ec->tty_vhangup ||
5255 ec->tty_vt_disallocate ||
6ac8fdc9
MS
5256 is_terminal_input(ec->std_input) ||
5257 is_terminal_output(ec->std_output) ||
6c0ae739
LP
5258 is_terminal_output(ec->std_error);
5259}
5260
5261bool exec_context_may_touch_console(const ExecContext *ec) {
5262
5263 return exec_context_may_touch_tty(ec) &&
1e22b5cd 5264 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
5265}
5266
15ae422b
LP
/* Writes every string of 'l' to 'f', each preceded by a single space. A NULL list prints nothing. */
static void strv_fprintf(FILE *f, char **l) {
        assert(f);

        for (char **entry = l; entry && *entry; entry++) {
                fprintf(f, " %s", *entry);
        }
}
5275
ddc155b2
TM
/* Prints one "<prefix><name>: <entries...>" line for a string list to 'f'. Nothing is printed if the
 * list is empty. */
static void strv_dump(FILE* f, const char *prefix, const char *name, char **strv) {
        assert(f);
        assert(prefix);
        assert(name);

        if (strv_isempty(strv))
                return;

        fprintf(f, "%s%s:", prefix, name);
        strv_fprintf(f, strv);
        fputs("\n", f);
}
5287
34cf6c43 5288void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
12213aed 5289 char **e, **d, buf_clean[FORMAT_TIMESPAN_MAX];
add00535 5290 int r;
9eba9da4 5291
5cb5a6ff
LP
5292 assert(c);
5293 assert(f);
5294
4ad49000 5295 prefix = strempty(prefix);
5cb5a6ff
LP
5296
5297 fprintf(f,
94f04347
LP
5298 "%sUMask: %04o\n"
5299 "%sWorkingDirectory: %s\n"
451a074f 5300 "%sRootDirectory: %s\n"
15ae422b 5301 "%sNonBlocking: %s\n"
64747e2d 5302 "%sPrivateTmp: %s\n"
7f112f50 5303 "%sPrivateDevices: %s\n"
59eeb84b 5304 "%sProtectKernelTunables: %s\n"
e66a2f65 5305 "%sProtectKernelModules: %s\n"
84703040 5306 "%sProtectKernelLogs: %s\n"
fc64760d 5307 "%sProtectClock: %s\n"
59eeb84b 5308 "%sProtectControlGroups: %s\n"
d251207d
LP
5309 "%sPrivateNetwork: %s\n"
5310 "%sPrivateUsers: %s\n"
1b8689f9
LP
5311 "%sProtectHome: %s\n"
5312 "%sProtectSystem: %s\n"
5d997827 5313 "%sMountAPIVFS: %s\n"
f3e43635 5314 "%sIgnoreSIGPIPE: %s\n"
f4170c67 5315 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 5316 "%sRestrictRealtime: %s\n"
f69567cb 5317 "%sRestrictSUIDSGID: %s\n"
aecd5ac6 5318 "%sKeyringMode: %s\n"
4e399953
LP
5319 "%sProtectHostname: %s\n"
5320 "%sProtectProc: %s\n"
5321 "%sProcSubset: %s\n",
5cb5a6ff 5322 prefix, c->umask,
14eb3285
LP
5323 prefix, empty_to_root(c->working_directory),
5324 prefix, empty_to_root(c->root_directory),
15ae422b 5325 prefix, yes_no(c->non_blocking),
64747e2d 5326 prefix, yes_no(c->private_tmp),
7f112f50 5327 prefix, yes_no(c->private_devices),
59eeb84b 5328 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 5329 prefix, yes_no(c->protect_kernel_modules),
84703040 5330 prefix, yes_no(c->protect_kernel_logs),
fc64760d 5331 prefix, yes_no(c->protect_clock),
59eeb84b 5332 prefix, yes_no(c->protect_control_groups),
d251207d
LP
5333 prefix, yes_no(c->private_network),
5334 prefix, yes_no(c->private_users),
1b8689f9
LP
5335 prefix, protect_home_to_string(c->protect_home),
5336 prefix, protect_system_to_string(c->protect_system),
5e98086d 5337 prefix, yes_no(exec_context_get_effective_mount_apivfs(c)),
f3e43635 5338 prefix, yes_no(c->ignore_sigpipe),
f4170c67 5339 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 5340 prefix, yes_no(c->restrict_realtime),
f69567cb 5341 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6 5342 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4e399953
LP
5343 prefix, yes_no(c->protect_hostname),
5344 prefix, protect_proc_to_string(c->protect_proc),
5345 prefix, proc_subset_to_string(c->proc_subset));
fb33a393 5346
915e6d16
LP
5347 if (c->root_image)
5348 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
5349
18d73705
LB
5350 if (c->root_image_options) {
5351 MountOptions *o;
5352
5353 fprintf(f, "%sRootImageOptions:", prefix);
5354 LIST_FOREACH(mount_options, o, c->root_image_options)
5355 if (!isempty(o->options))
9ece6444
LB
5356 fprintf(f, " %s:%s",
5357 partition_designator_to_string(o->partition_designator),
5358 o->options);
18d73705
LB
5359 fprintf(f, "\n");
5360 }
5361
0389f4fa
LB
5362 if (c->root_hash) {
5363 _cleanup_free_ char *encoded = NULL;
5364 encoded = hexmem(c->root_hash, c->root_hash_size);
5365 if (encoded)
5366 fprintf(f, "%sRootHash: %s\n", prefix, encoded);
5367 }
5368
5369 if (c->root_hash_path)
5370 fprintf(f, "%sRootHash: %s\n", prefix, c->root_hash_path);
5371
d4d55b0d
LB
5372 if (c->root_hash_sig) {
5373 _cleanup_free_ char *encoded = NULL;
5374 ssize_t len;
5375 len = base64mem(c->root_hash_sig, c->root_hash_sig_size, &encoded);
5376 if (len)
5377 fprintf(f, "%sRootHashSignature: base64:%s\n", prefix, encoded);
5378 }
5379
5380 if (c->root_hash_sig_path)
5381 fprintf(f, "%sRootHashSignature: %s\n", prefix, c->root_hash_sig_path);
5382
0389f4fa
LB
5383 if (c->root_verity)
5384 fprintf(f, "%sRootVerity: %s\n", prefix, c->root_verity);
5385
8c7be95e
LP
5386 STRV_FOREACH(e, c->environment)
5387 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
5388
5389 STRV_FOREACH(e, c->environment_files)
5390 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 5391
b4c14404
FB
5392 STRV_FOREACH(e, c->pass_environment)
5393 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
5394
00819cc1
LP
5395 STRV_FOREACH(e, c->unset_environment)
5396 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
5397
53f47dfc
YW
5398 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
5399
5b10116e 5400 for (ExecDirectoryType dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
5401 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
5402
5403 STRV_FOREACH(d, c->directories[dt].paths)
5404 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
5405 }
c2bbd90b 5406
12213aed
YW
5407 fprintf(f,
5408 "%sTimeoutCleanSec: %s\n",
5409 prefix, format_timespan(buf_clean, sizeof(buf_clean), c->timeout_clean_usec, USEC_PER_SEC));
5410
fb33a393
LP
5411 if (c->nice_set)
5412 fprintf(f,
5413 "%sNice: %i\n",
5414 prefix, c->nice);
5415
dd6c17b1 5416 if (c->oom_score_adjust_set)
fb33a393 5417 fprintf(f,
dd6c17b1
LP
5418 "%sOOMScoreAdjust: %i\n",
5419 prefix, c->oom_score_adjust);
9eba9da4 5420
ad21e542
ZJS
5421 if (c->coredump_filter_set)
5422 fprintf(f,
5423 "%sCoredumpFilter: 0x%"PRIx64"\n",
5424 prefix, c->coredump_filter);
5425
5b10116e 5426 for (unsigned i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 5427 if (c->rlimit[i]) {
4c3a2b84 5428 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 5429 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 5430 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
5431 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
5432 }
94f04347 5433
f8b69d1d 5434 if (c->ioprio_set) {
1756a011 5435 _cleanup_free_ char *class_str = NULL;
f8b69d1d 5436
837df140
YW
5437 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
5438 if (r >= 0)
5439 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
5440
5441 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
f8b69d1d 5442 }
94f04347 5443
f8b69d1d 5444 if (c->cpu_sched_set) {
1756a011 5445 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 5446
837df140
YW
5447 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
5448 if (r >= 0)
5449 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
5450
94f04347 5451 fprintf(f,
38b48754
LP
5452 "%sCPUSchedulingPriority: %i\n"
5453 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
5454 prefix, c->cpu_sched_priority,
5455 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 5456 }
94f04347 5457
0985c7c4 5458 if (c->cpu_set.set) {
e7fca352
MS
5459 _cleanup_free_ char *affinity = NULL;
5460
5461 affinity = cpu_set_to_range_string(&c->cpu_set);
5462 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
5463 }
5464
b070c7c0
MS
5465 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
5466 _cleanup_free_ char *nodes = NULL;
5467
5468 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
5469 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
5470 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
5471 }
5472
3a43da28 5473 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 5474 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
5475
5476 fprintf(f,
80876c20
LP
5477 "%sStandardInput: %s\n"
5478 "%sStandardOutput: %s\n"
5479 "%sStandardError: %s\n",
5480 prefix, exec_input_to_string(c->std_input),
5481 prefix, exec_output_to_string(c->std_output),
5482 prefix, exec_output_to_string(c->std_error));
5483
befc4a80
LP
5484 if (c->std_input == EXEC_INPUT_NAMED_FD)
5485 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
5486 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
5487 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
5488 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
5489 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
5490
5491 if (c->std_input == EXEC_INPUT_FILE)
5492 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
5493 if (c->std_output == EXEC_OUTPUT_FILE)
5494 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
5495 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
5496 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
8d7dab1f
LW
5497 if (c->std_output == EXEC_OUTPUT_FILE_TRUNCATE)
5498 fprintf(f, "%sStandardOutputFileToTruncate: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
5499 if (c->std_error == EXEC_OUTPUT_FILE)
5500 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
5501 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
5502 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
8d7dab1f
LW
5503 if (c->std_error == EXEC_OUTPUT_FILE_TRUNCATE)
5504 fprintf(f, "%sStandardErrorFileToTruncate: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 5505
80876c20
LP
5506 if (c->tty_path)
5507 fprintf(f,
6ea832a2
LP
5508 "%sTTYPath: %s\n"
5509 "%sTTYReset: %s\n"
5510 "%sTTYVHangup: %s\n"
5511 "%sTTYVTDisallocate: %s\n",
5512 prefix, c->tty_path,
5513 prefix, yes_no(c->tty_reset),
5514 prefix, yes_no(c->tty_vhangup),
5515 prefix, yes_no(c->tty_vt_disallocate));
94f04347 5516
9f6444eb 5517 if (IN_SET(c->std_output,
9f6444eb
LP
5518 EXEC_OUTPUT_KMSG,
5519 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5520 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5521 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
5522 IN_SET(c->std_error,
9f6444eb
LP
5523 EXEC_OUTPUT_KMSG,
5524 EXEC_OUTPUT_JOURNAL,
9f6444eb
LP
5525 EXEC_OUTPUT_KMSG_AND_CONSOLE,
5526 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 5527
5ce70e5b 5528 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 5529
837df140
YW
5530 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
5531 if (r >= 0)
5532 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 5533
837df140
YW
5534 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
5535 if (r >= 0)
5536 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 5537 }
94f04347 5538
d3070fbd
LP
5539 if (c->log_level_max >= 0) {
5540 _cleanup_free_ char *t = NULL;
5541
5542 (void) log_level_to_string_alloc(c->log_level_max, &t);
5543
5544 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
5545 }
5546
5ac1530e 5547 if (c->log_ratelimit_interval_usec > 0) {
90fc172e
AZ
5548 char buf_timespan[FORMAT_TIMESPAN_MAX];
5549
5550 fprintf(f,
5551 "%sLogRateLimitIntervalSec: %s\n",
5ac1530e 5552 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_ratelimit_interval_usec, USEC_PER_SEC));
90fc172e
AZ
5553 }
5554
5ac1530e
ZJS
5555 if (c->log_ratelimit_burst > 0)
5556 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_ratelimit_burst);
90fc172e 5557
5b10116e
ZJS
5558 for (size_t j = 0; j < c->n_log_extra_fields; j++) {
5559 fprintf(f, "%sLogExtraFields: ", prefix);
5560 fwrite(c->log_extra_fields[j].iov_base,
5561 1, c->log_extra_fields[j].iov_len,
5562 f);
5563 fputc('\n', f);
d3070fbd
LP
5564 }
5565
91dd5f7c
LP
5566 if (c->log_namespace)
5567 fprintf(f, "%sLogNamespace: %s\n", prefix, c->log_namespace);
5568
07d46372
YW
5569 if (c->secure_bits) {
5570 _cleanup_free_ char *str = NULL;
5571
5572 r = secure_bits_to_string_alloc(c->secure_bits, &str);
5573 if (r >= 0)
5574 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
5575 }
94f04347 5576
a103496c 5577 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 5578 _cleanup_free_ char *str = NULL;
94f04347 5579
dd1f5bd0
YW
5580 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
5581 if (r >= 0)
5582 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
5583 }
5584
5585 if (c->capability_ambient_set != 0) {
dd1f5bd0 5586 _cleanup_free_ char *str = NULL;
755d4b67 5587
dd1f5bd0
YW
5588 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
5589 if (r >= 0)
5590 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
5591 }
5592
5593 if (c->user)
f2d3769a 5594 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 5595 if (c->group)
f2d3769a 5596 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 5597
29206d46
LP
5598 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
5599
ddc155b2 5600 strv_dump(f, prefix, "SupplementaryGroups", c->supplementary_groups);
94f04347 5601
5b6319dc 5602 if (c->pam_name)
f2d3769a 5603 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 5604
ddc155b2
TM
5605 strv_dump(f, prefix, "ReadWritePaths", c->read_write_paths);
5606 strv_dump(f, prefix, "ReadOnlyPaths", c->read_only_paths);
5607 strv_dump(f, prefix, "InaccessiblePaths", c->inaccessible_paths);
5608 strv_dump(f, prefix, "ExecPaths", c->exec_paths);
5609 strv_dump(f, prefix, "NoExecPaths", c->no_exec_paths);
2e22afe9 5610
5b10116e
ZJS
5611 for (size_t i = 0; i < c->n_bind_mounts; i++)
5612 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
5613 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
5614 c->bind_mounts[i].ignore_enoent ? "-": "",
5615 c->bind_mounts[i].source,
5616 c->bind_mounts[i].destination,
5617 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 5618
5b10116e
ZJS
5619 for (size_t i = 0; i < c->n_temporary_filesystems; i++) {
5620 const TemporaryFileSystem *t = c->temporary_filesystems + i;
2abd4e38 5621
5b10116e
ZJS
5622 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
5623 t->path,
5624 isempty(t->options) ? "" : ":",
5625 strempty(t->options));
5626 }
2abd4e38 5627
169c1bda
LP
5628 if (c->utmp_id)
5629 fprintf(f,
5630 "%sUtmpIdentifier: %s\n",
5631 prefix, c->utmp_id);
7b52a628
MS
5632
5633 if (c->selinux_context)
5634 fprintf(f,
5f8640fb
LP
5635 "%sSELinuxContext: %s%s\n",
5636 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 5637
80c21aea
WC
5638 if (c->apparmor_profile)
5639 fprintf(f,
5640 "%sAppArmorProfile: %s%s\n",
5641 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5642
5643 if (c->smack_process_label)
5644 fprintf(f,
5645 "%sSmackProcessLabel: %s%s\n",
5646 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
5647
050f7277 5648 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
5649 fprintf(f,
5650 "%sPersonality: %s\n",
5651 prefix, strna(personality_to_string(c->personality)));
5652
78e864e5
TM
5653 fprintf(f,
5654 "%sLockPersonality: %s\n",
5655 prefix, yes_no(c->lock_personality));
5656
17df7223 5657 if (c->syscall_filter) {
349cc4a5 5658#if HAVE_SECCOMP
8cfa775f 5659 void *id, *val;
17df7223 5660 bool first = true;
351a19b1 5661#endif
17df7223
LP
5662
5663 fprintf(f,
57183d11 5664 "%sSystemCallFilter: ",
17df7223
LP
5665 prefix);
5666
6b000af4 5667 if (!c->syscall_allow_list)
17df7223
LP
5668 fputc('~', f);
5669
349cc4a5 5670#if HAVE_SECCOMP
90e74a66 5671 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter) {
17df7223 5672 _cleanup_free_ char *name = NULL;
8cfa775f
YW
5673 const char *errno_name = NULL;
5674 int num = PTR_TO_INT(val);
17df7223
LP
5675
5676 if (first)
5677 first = false;
5678 else
5679 fputc(' ', f);
5680
57183d11 5681 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 5682 fputs(strna(name), f);
8cfa775f
YW
5683
5684 if (num >= 0) {
005bfaf1 5685 errno_name = seccomp_errno_or_action_to_string(num);
8cfa775f
YW
5686 if (errno_name)
5687 fprintf(f, ":%s", errno_name);
5688 else
5689 fprintf(f, ":%d", num);
5690 }
17df7223 5691 }
351a19b1 5692#endif
17df7223
LP
5693
5694 fputc('\n', f);
5695 }
5696
57183d11 5697 if (c->syscall_archs) {
349cc4a5 5698#if HAVE_SECCOMP
57183d11
LP
5699 void *id;
5700#endif
5701
5702 fprintf(f,
5703 "%sSystemCallArchitectures:",
5704 prefix);
5705
349cc4a5 5706#if HAVE_SECCOMP
90e74a66 5707 SET_FOREACH(id, c->syscall_archs)
57183d11
LP
5708 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
5709#endif
5710 fputc('\n', f);
5711 }
5712
add00535
LP
5713 if (exec_context_restrict_namespaces_set(c)) {
5714 _cleanup_free_ char *s = NULL;
5715
86c2a9f1 5716 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
5717 if (r >= 0)
5718 fprintf(f, "%sRestrictNamespaces: %s\n",
dd0395b5 5719 prefix, strna(s));
add00535
LP
5720 }
5721
a8d08f39
LP
5722 if (c->network_namespace_path)
5723 fprintf(f,
5724 "%sNetworkNamespacePath: %s\n",
5725 prefix, c->network_namespace_path);
5726
3df90f24 5727 if (c->syscall_errno > 0) {
005bfaf1 5728#if HAVE_SECCOMP
3df90f24 5729 const char *errno_name;
005bfaf1 5730#endif
3df90f24
YW
5731
5732 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
5733
005bfaf1
TM
5734#if HAVE_SECCOMP
5735 errno_name = seccomp_errno_or_action_to_string(c->syscall_errno);
3df90f24 5736 if (errno_name)
005bfaf1 5737 fputs(errno_name, f);
3df90f24 5738 else
005bfaf1
TM
5739 fprintf(f, "%d", c->syscall_errno);
5740#endif
5741 fputc('\n', f);
3df90f24 5742 }
b3d13314 5743
5b10116e 5744 for (size_t i = 0; i < c->n_mount_images; i++) {
427353f6
LB
5745 MountOptions *o;
5746
79e20ceb 5747 fprintf(f, "%sMountImages: %s%s:%s", prefix,
b3d13314
LB
5748 c->mount_images[i].ignore_enoent ? "-": "",
5749 c->mount_images[i].source,
79e20ceb 5750 c->mount_images[i].destination);
427353f6 5751 LIST_FOREACH(mount_options, o, c->mount_images[i].mount_options)
79e20ceb 5752 fprintf(f, ":%s:%s",
427353f6 5753 partition_designator_to_string(o->partition_designator),
79e20ceb 5754 strempty(o->options));
427353f6
LB
5755 fprintf(f, "\n");
5756 }
93f59701
LB
5757
5758 for (size_t i = 0; i < c->n_extension_images; i++) {
5759 MountOptions *o;
5760
5761 fprintf(f, "%sExtensionImages: %s%s", prefix,
5762 c->extension_images[i].ignore_enoent ? "-": "",
5763 c->extension_images[i].source);
5764 LIST_FOREACH(mount_options, o, c->extension_images[i].mount_options)
5765 fprintf(f, ":%s:%s",
5766 partition_designator_to_string(o->partition_designator),
5767 strempty(o->options));
5768 fprintf(f, "\n");
5769 }
5cb5a6ff
LP
5770}
5771
34cf6c43 5772bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
5773 assert(c);
5774
61233823 5775 /* Returns true if the process forked off would run under
a931ad47
LP
5776 * an unchanged UID or as root. */
5777
5778 if (!c->user)
5779 return true;
5780
5781 if (streq(c->user, "root") || streq(c->user, "0"))
5782 return true;
5783
5784 return false;
5785}
5786
34cf6c43 5787int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
5788 int p;
5789
5790 assert(c);
5791
5792 if (c->ioprio_set)
5793 return c->ioprio;
5794
5795 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
5796 if (p < 0)
5797 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
5798
5799 return p;
5800}
5801
5e98086d
ZJS
5802bool exec_context_get_effective_mount_apivfs(const ExecContext *c) {
5803 assert(c);
5804
61198784 5805 /* Explicit setting wins */
5e98086d
ZJS
5806 if (c->mount_apivfs_set)
5807 return c->mount_apivfs;
5808
61198784 5809 /* Default to "yes" if root directory or image are specified */
74e12520 5810 if (exec_context_with_rootfs(c))
61198784
ZJS
5811 return true;
5812
5e98086d
ZJS
5813 return false;
5814}
5815
d3070fbd 5816void exec_context_free_log_extra_fields(ExecContext *c) {
d3070fbd
LP
5817 assert(c);
5818
5b10116e 5819 for (size_t l = 0; l < c->n_log_extra_fields; l++)
d3070fbd
LP
5820 free(c->log_extra_fields[l].iov_base);
5821 c->log_extra_fields = mfree(c->log_extra_fields);
5822 c->n_log_extra_fields = 0;
5823}
5824
6f765baf 5825void exec_context_revert_tty(ExecContext *c) {
0ba976e8
LP
5826 _cleanup_close_ int fd = -1;
5827 const char *path;
5828 struct stat st;
6f765baf
LP
5829 int r;
5830
5831 assert(c);
5832
5833 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
5834 exec_context_tty_reset(c, NULL);
5835
5836 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
5837 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
5838 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
0ba976e8
LP
5839 if (!exec_context_may_touch_tty(c))
5840 return;
6f765baf 5841
0ba976e8
LP
5842 path = exec_context_tty_path(c);
5843 if (!path)
5844 return;
6f765baf 5845
0ba976e8
LP
5846 fd = open(path, O_PATH|O_CLOEXEC);
5847 if (fd < 0)
5848 return (void) log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_WARNING, errno,
5849 "Failed to open TTY inode of '%s' to adjust ownership/access mode, ignoring: %m",
5850 path);
5851
5852 if (fstat(fd, &st) < 0)
5853 return (void) log_warning_errno(errno, "Failed to stat TTY '%s', ignoring: %m", path);
5854
5855 /* Let's add a superficial check that we only do this for stuff that looks like a TTY. We only check
5856 * if things are a character device, since a proper check either means we'd have to open the TTY and
5857 * use isatty(), but we'd rather not do that since opening TTYs comes with all kinds of side-effects
5858 * and is slow. Or we'd have to hardcode dev_t major information, which we'd rather avoid. Why bother
5859 * with this at all? → https://github.com/systemd/systemd/issues/19213 */
5860 if (!S_ISCHR(st.st_mode))
5861 return log_warning("Configured TTY '%s' is not actually a character device, ignoring.", path);
5862
5863 r = fchmod_and_chown(fd, TTY_MODE, 0, TTY_GID);
5864 if (r < 0)
5865 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
6f765baf
LP
5866}
5867
4c2f5842
LP
5868int exec_context_get_clean_directories(
5869 ExecContext *c,
5870 char **prefix,
5871 ExecCleanMask mask,
5872 char ***ret) {
5873
5874 _cleanup_strv_free_ char **l = NULL;
4c2f5842
LP
5875 int r;
5876
5877 assert(c);
5878 assert(prefix);
5879 assert(ret);
5880
5b10116e 5881 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4c2f5842
LP
5882 char **i;
5883
5884 if (!FLAGS_SET(mask, 1U << t))
5885 continue;
5886
5887 if (!prefix[t])
5888 continue;
5889
5890 STRV_FOREACH(i, c->directories[t].paths) {
5891 char *j;
5892
5893 j = path_join(prefix[t], *i);
5894 if (!j)
5895 return -ENOMEM;
5896
5897 r = strv_consume(&l, j);
5898 if (r < 0)
5899 return r;
7f622a19
YW
5900
5901 /* Also remove private directories unconditionally. */
5902 if (t != EXEC_DIRECTORY_CONFIGURATION) {
5903 j = path_join(prefix[t], "private", *i);
5904 if (!j)
5905 return -ENOMEM;
5906
5907 r = strv_consume(&l, j);
5908 if (r < 0)
5909 return r;
5910 }
4c2f5842
LP
5911 }
5912 }
5913
5914 *ret = TAKE_PTR(l);
5915 return 0;
5916}
5917
5918int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
5919 ExecCleanMask mask = 0;
5920
5921 assert(c);
5922 assert(ret);
5923
5924 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
5925 if (!strv_isempty(c->directories[t].paths))
5926 mask |= 1U << t;
5927
5928 *ret = mask;
5929 return 0;
5930}
5931
b58b4116 5932void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 5933 assert(s);
5cb5a6ff 5934
2ed26ed0
LP
5935 *s = (ExecStatus) {
5936 .pid = pid,
5937 };
5938
b58b4116
LP
5939 dual_timestamp_get(&s->start_timestamp);
5940}
5941
34cf6c43 5942void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
5943 assert(s);
5944
d46b79bb 5945 if (s->pid != pid)
2ed26ed0
LP
5946 *s = (ExecStatus) {
5947 .pid = pid,
5948 };
b58b4116 5949
63983207 5950 dual_timestamp_get(&s->exit_timestamp);
9fb86720 5951
034c6ed7
LP
5952 s->code = code;
5953 s->status = status;
169c1bda 5954
6f765baf
LP
5955 if (context && context->utmp_id)
5956 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
5957}
5958
6a1d4d9f
LP
5959void exec_status_reset(ExecStatus *s) {
5960 assert(s);
5961
5962 *s = (ExecStatus) {};
5963}
5964
34cf6c43 5965void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
5966 assert(s);
5967 assert(f);
5968
9fb86720
LP
5969 if (s->pid <= 0)
5970 return;
5971
4c940960
LP
5972 prefix = strempty(prefix);
5973
9fb86720 5974 fprintf(f,
ccd06097
ZJS
5975 "%sPID: "PID_FMT"\n",
5976 prefix, s->pid);
9fb86720 5977
af9d16e1 5978 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
5979 fprintf(f,
5980 "%sStart Timestamp: %s\n",
04f5c018 5981 prefix, FORMAT_TIMESTAMP(s->start_timestamp.realtime));
9fb86720 5982
af9d16e1 5983 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
5984 fprintf(f,
5985 "%sExit Timestamp: %s\n"
5986 "%sExit Code: %s\n"
5987 "%sExit Status: %i\n",
04f5c018 5988 prefix, FORMAT_TIMESTAMP(s->exit_timestamp.realtime),
9fb86720
LP
5989 prefix, sigchld_code_to_string(s->code),
5990 prefix, s->status);
5cb5a6ff 5991}
44d8db9e 5992
34cf6c43 5993static char *exec_command_line(char **argv) {
44d8db9e
LP
5994 size_t k;
5995 char *n, *p, **a;
5996 bool first = true;
5997
9e2f7c11 5998 assert(argv);
44d8db9e 5999
9164977d 6000 k = 1;
9e2f7c11 6001 STRV_FOREACH(a, argv)
44d8db9e
LP
6002 k += strlen(*a)+3;
6003
5cd9cd35
LP
6004 n = new(char, k);
6005 if (!n)
44d8db9e
LP
6006 return NULL;
6007
6008 p = n;
9e2f7c11 6009 STRV_FOREACH(a, argv) {
44d8db9e
LP
6010
6011 if (!first)
6012 *(p++) = ' ';
6013 else
6014 first = false;
6015
6016 if (strpbrk(*a, WHITESPACE)) {
6017 *(p++) = '\'';
6018 p = stpcpy(p, *a);
6019 *(p++) = '\'';
6020 } else
6021 p = stpcpy(p, *a);
6022
6023 }
6024
9164977d
LP
6025 *p = 0;
6026
44d8db9e
LP
6027 /* FIXME: this doesn't really handle arguments that have
6028 * spaces and ticks in them */
6029
6030 return n;
6031}
6032
34cf6c43 6033static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 6034 _cleanup_free_ char *cmd = NULL;
4c940960 6035 const char *prefix2;
44d8db9e
LP
6036
6037 assert(c);
6038 assert(f);
6039
4c940960 6040 prefix = strempty(prefix);
63c372cb 6041 prefix2 = strjoina(prefix, "\t");
44d8db9e 6042
9e2f7c11 6043 cmd = exec_command_line(c->argv);
44d8db9e
LP
6044 fprintf(f,
6045 "%sCommand Line: %s\n",
4bbccb02 6046 prefix, cmd ? cmd : strerror_safe(ENOMEM));
44d8db9e 6047
9fb86720 6048 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
6049}
6050
6051void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
6052 assert(f);
6053
4c940960 6054 prefix = strempty(prefix);
44d8db9e
LP
6055
6056 LIST_FOREACH(command, c, c)
6057 exec_command_dump(c, f, prefix);
6058}
94f04347 6059
a6a80b4f
LP
6060void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
6061 ExecCommand *end;
6062
6063 assert(l);
6064 assert(e);
6065
6066 if (*l) {
35b8ca3a 6067 /* It's kind of important, that we keep the order here */
71fda00f
LP
6068 LIST_FIND_TAIL(command, *l, end);
6069 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
6070 } else
6071 *l = e;
6072}
6073
26fd040d
LP
6074int exec_command_set(ExecCommand *c, const char *path, ...) {
6075 va_list ap;
6076 char **l, *p;
6077
6078 assert(c);
6079 assert(path);
6080
6081 va_start(ap, path);
6082 l = strv_new_ap(path, ap);
6083 va_end(ap);
6084
6085 if (!l)
6086 return -ENOMEM;
6087
250a918d
LP
6088 p = strdup(path);
6089 if (!p) {
26fd040d
LP
6090 strv_free(l);
6091 return -ENOMEM;
6092 }
6093
6897dfe8 6094 free_and_replace(c->path, p);
26fd040d 6095
130d3d22 6096 return strv_free_and_replace(c->argv, l);
26fd040d
LP
6097}
6098
86b23b07 6099int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 6100 _cleanup_strv_free_ char **l = NULL;
86b23b07 6101 va_list ap;
86b23b07
JS
6102 int r;
6103
6104 assert(c);
6105 assert(path);
6106
6107 va_start(ap, path);
6108 l = strv_new_ap(path, ap);
6109 va_end(ap);
6110
6111 if (!l)
6112 return -ENOMEM;
6113
e287086b 6114 r = strv_extend_strv(&c->argv, l, false);
e63ff941 6115 if (r < 0)
86b23b07 6116 return r;
86b23b07
JS
6117
6118 return 0;
6119}
6120
e8a565cb
YW
6121static void *remove_tmpdir_thread(void *p) {
6122 _cleanup_free_ char *path = p;
86b23b07 6123
e8a565cb
YW
6124 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
6125 return NULL;
6126}
6127
6128static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
6129 int r;
6130
6131 if (!rt)
6132 return NULL;
6133
6134 if (rt->manager)
6135 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
6136
6137 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
56a13a49
ZJS
6138
6139 if (destroy && rt->tmp_dir && !streq(rt->tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6140 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
6141
6142 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
56a13a49 6143 if (r < 0)
e8a565cb 6144 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
56a13a49
ZJS
6145 else
6146 rt->tmp_dir = NULL;
e8a565cb 6147 }
613b411c 6148
56a13a49 6149 if (destroy && rt->var_tmp_dir && !streq(rt->var_tmp_dir, RUN_SYSTEMD_EMPTY)) {
e8a565cb
YW
6150 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
6151
6152 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
56a13a49 6153 if (r < 0)
e8a565cb 6154 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
56a13a49
ZJS
6155 else
6156 rt->var_tmp_dir = NULL;
e8a565cb
YW
6157 }
6158
6159 rt->id = mfree(rt->id);
6160 rt->tmp_dir = mfree(rt->tmp_dir);
6161 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
6162 safe_close_pair(rt->netns_storage_socket);
a70581ff 6163 safe_close_pair(rt->ipcns_storage_socket);
e8a565cb
YW
6164 return mfree(rt);
6165}
6166
6167static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 6168 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
6169}
6170
56a13a49
ZJS
6171static int exec_runtime_allocate(ExecRuntime **ret, const char *id) {
6172 _cleanup_free_ char *id_copy = NULL;
8e8009dc 6173 ExecRuntime *n;
613b411c 6174
8e8009dc 6175 assert(ret);
613b411c 6176
56a13a49
ZJS
6177 id_copy = strdup(id);
6178 if (!id_copy)
6179 return -ENOMEM;
6180
8e8009dc
LP
6181 n = new(ExecRuntime, 1);
6182 if (!n)
613b411c
LP
6183 return -ENOMEM;
6184
8e8009dc 6185 *n = (ExecRuntime) {
56a13a49 6186 .id = TAKE_PTR(id_copy),
8e8009dc 6187 .netns_storage_socket = { -1, -1 },
a70581ff 6188 .ipcns_storage_socket = { -1, -1 },
8e8009dc
LP
6189 };
6190
6191 *ret = n;
613b411c
LP
6192 return 0;
6193}
6194
e8a565cb
YW
6195static int exec_runtime_add(
6196 Manager *m,
6197 const char *id,
56a13a49
ZJS
6198 char **tmp_dir,
6199 char **var_tmp_dir,
6200 int netns_storage_socket[2],
a70581ff 6201 int ipcns_storage_socket[2],
e8a565cb
YW
6202 ExecRuntime **ret) {
6203
6204 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
6205 int r;
6206
e8a565cb 6207 assert(m);
613b411c
LP
6208 assert(id);
6209
a70581ff 6210 /* tmp_dir, var_tmp_dir, {net,ipc}ns_storage_socket fds are donated on success */
56a13a49 6211
56a13a49 6212 r = exec_runtime_allocate(&rt, id);
613b411c
LP
6213 if (r < 0)
6214 return r;
6215
63083706 6216 r = hashmap_ensure_put(&m->exec_runtime_by_id, &string_hash_ops, rt->id, rt);
56a13a49
ZJS
6217 if (r < 0)
6218 return r;
e8a565cb 6219
56a13a49
ZJS
6220 assert(!!rt->tmp_dir == !!rt->var_tmp_dir); /* We require both to be set together */
6221 rt->tmp_dir = TAKE_PTR(*tmp_dir);
6222 rt->var_tmp_dir = TAKE_PTR(*var_tmp_dir);
e8a565cb
YW
6223
6224 if (netns_storage_socket) {
56a13a49
ZJS
6225 rt->netns_storage_socket[0] = TAKE_FD(netns_storage_socket[0]);
6226 rt->netns_storage_socket[1] = TAKE_FD(netns_storage_socket[1]);
613b411c
LP
6227 }
6228
a70581ff
XR
6229 if (ipcns_storage_socket) {
6230 rt->ipcns_storage_socket[0] = TAKE_FD(ipcns_storage_socket[0]);
6231 rt->ipcns_storage_socket[1] = TAKE_FD(ipcns_storage_socket[1]);
6232 }
6233
e8a565cb
YW
6234 rt->manager = m;
6235
6236 if (ret)
6237 *ret = rt;
e8a565cb 6238 /* do not remove created ExecRuntime object when the operation succeeds. */
56a13a49 6239 TAKE_PTR(rt);
e8a565cb
YW
6240 return 0;
6241}
6242
74aaf59b
LP
6243static int exec_runtime_make(
6244 Manager *m,
6245 const ExecContext *c,
6246 const char *id,
6247 ExecRuntime **ret) {
6248
56a13a49 6249 _cleanup_(namespace_cleanup_tmpdirp) char *tmp_dir = NULL, *var_tmp_dir = NULL;
a70581ff 6250 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 }, ipcns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
6251 int r;
6252
6253 assert(m);
6254 assert(c);
6255 assert(id);
6256
6257 /* It is not necessary to create ExecRuntime object. */
a70581ff 6258 if (!c->private_network && !c->private_ipc && !c->private_tmp && !c->network_namespace_path) {
74aaf59b 6259 *ret = NULL;
e8a565cb 6260 return 0;
74aaf59b 6261 }
e8a565cb 6262
efa2f3a1
TM
6263 if (c->private_tmp &&
6264 !(prefixed_path_strv_contains(c->inaccessible_paths, "/tmp") &&
6265 (prefixed_path_strv_contains(c->inaccessible_paths, "/var/tmp") ||
6266 prefixed_path_strv_contains(c->inaccessible_paths, "/var")))) {
e8a565cb 6267 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
6268 if (r < 0)
6269 return r;
6270 }
6271
a8d08f39 6272 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
6273 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
6274 return -errno;
6275 }
6276
a70581ff
XR
6277 if (c->private_ipc || c->ipc_namespace_path) {
6278 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, ipcns_storage_socket) < 0)
6279 return -errno;
6280 }
6281
6282 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_storage_socket, ipcns_storage_socket, ret);
e8a565cb
YW
6283 if (r < 0)
6284 return r;
6285
613b411c
LP
6286 return 1;
6287}
6288
e8a565cb
YW
6289int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
6290 ExecRuntime *rt;
6291 int r;
613b411c 6292
e8a565cb
YW
6293 assert(m);
6294 assert(id);
6295 assert(ret);
6296
6297 rt = hashmap_get(m->exec_runtime_by_id, id);
6298 if (rt)
387f6955 6299 /* We already have an ExecRuntime object, let's increase the ref count and reuse it */
e8a565cb
YW
6300 goto ref;
6301
74aaf59b
LP
6302 if (!create) {
6303 *ret = NULL;
e8a565cb 6304 return 0;
74aaf59b 6305 }
e8a565cb
YW
6306
6307 /* If not found, then create a new object. */
6308 r = exec_runtime_make(m, c, id, &rt);
74aaf59b 6309 if (r < 0)
e8a565cb 6310 return r;
74aaf59b
LP
6311 if (r == 0) {
6312 /* When r == 0, it is not necessary to create ExecRuntime object. */
6313 *ret = NULL;
6314 return 0;
6315 }
613b411c 6316
e8a565cb
YW
6317ref:
6318 /* increment reference counter. */
6319 rt->n_ref++;
6320 *ret = rt;
6321 return 1;
6322}
613b411c 6323
e8a565cb
YW
6324ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
6325 if (!rt)
613b411c
LP
6326 return NULL;
6327
e8a565cb 6328 assert(rt->n_ref > 0);
613b411c 6329
e8a565cb
YW
6330 rt->n_ref--;
6331 if (rt->n_ref > 0)
f2341e0a
LP
6332 return NULL;
6333
e8a565cb 6334 return exec_runtime_free(rt, destroy);
613b411c
LP
6335}
6336
e8a565cb
YW
6337int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
6338 ExecRuntime *rt;
e8a565cb
YW
6339
6340 assert(m);
613b411c
LP
6341 assert(f);
6342 assert(fds);
6343
90e74a66 6344 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb 6345 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 6346
e8a565cb
YW
6347 if (rt->tmp_dir)
6348 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 6349
e8a565cb
YW
6350 if (rt->var_tmp_dir)
6351 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 6352
e8a565cb
YW
6353 if (rt->netns_storage_socket[0] >= 0) {
6354 int copy;
613b411c 6355
e8a565cb
YW
6356 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
6357 if (copy < 0)
6358 return copy;
613b411c 6359
e8a565cb
YW
6360 fprintf(f, " netns-socket-0=%i", copy);
6361 }
613b411c 6362
e8a565cb
YW
6363 if (rt->netns_storage_socket[1] >= 0) {
6364 int copy;
613b411c 6365
e8a565cb
YW
6366 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
6367 if (copy < 0)
6368 return copy;
613b411c 6369
e8a565cb
YW
6370 fprintf(f, " netns-socket-1=%i", copy);
6371 }
6372
a70581ff
XR
6373 if (rt->ipcns_storage_socket[0] >= 0) {
6374 int copy;
6375
6376 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[0]);
6377 if (copy < 0)
6378 return copy;
6379
6380 fprintf(f, " ipcns-socket-0=%i", copy);
6381 }
6382
6383 if (rt->ipcns_storage_socket[1] >= 0) {
6384 int copy;
6385
6386 copy = fdset_put_dup(fds, rt->ipcns_storage_socket[1]);
6387 if (copy < 0)
6388 return copy;
6389
6390 fprintf(f, " ipcns-socket-1=%i", copy);
6391 }
6392
e8a565cb 6393 fputc('\n', f);
613b411c
LP
6394 }
6395
6396 return 0;
6397}
6398
e8a565cb
YW
6399int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
6400 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
6401 ExecRuntime *rt;
613b411c
LP
6402 int r;
6403
e8a565cb
YW
6404 /* This is for the migration from old (v237 or earlier) deserialization text.
6405 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
6406 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
6407 * so or not from the serialized text, then we always creates a new object owned by this. */
6408
6409 assert(u);
613b411c
LP
6410 assert(key);
6411 assert(value);
6412
e8a565cb
YW
6413 /* Manager manages ExecRuntime objects by the unit id.
6414 * So, we omit the serialized text when the unit does not have id (yet?)... */
6415 if (isempty(u->id)) {
6416 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
6417 return 0;
6418 }
613b411c 6419
cbc165d1
ZJS
6420 if (hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops) < 0)
6421 return log_oom();
e8a565cb
YW
6422
6423 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
6424 if (!rt) {
cbc165d1 6425 if (exec_runtime_allocate(&rt_create, u->id) < 0)
f2341e0a 6426 return log_oom();
613b411c 6427
e8a565cb
YW
6428 rt = rt_create;
6429 }
6430
6431 if (streq(key, "tmp-dir")) {
cbc165d1
ZJS
6432 if (free_and_strdup_warn(&rt->tmp_dir, value) < 0)
6433 return -ENOMEM;
613b411c
LP
6434
6435 } else if (streq(key, "var-tmp-dir")) {
cbc165d1
ZJS
6436 if (free_and_strdup_warn(&rt->var_tmp_dir, value) < 0)
6437 return -ENOMEM;
613b411c
LP
6438
6439 } else if (streq(key, "netns-socket-0")) {
6440 int fd;
6441
e8a565cb 6442 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6443 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6444 return 0;
613b411c 6445 }
e8a565cb
YW
6446
6447 safe_close(rt->netns_storage_socket[0]);
6448 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
6449
613b411c
LP
6450 } else if (streq(key, "netns-socket-1")) {
6451 int fd;
6452
e8a565cb 6453 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 6454 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 6455 return 0;
613b411c 6456 }
e8a565cb
YW
6457
6458 safe_close(rt->netns_storage_socket[1]);
6459 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
a70581ff 6460
613b411c
LP
6461 } else
6462 return 0;
6463
e8a565cb
YW
6464 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
6465 if (rt_create) {
6466 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
6467 if (r < 0) {
3fe91079 6468 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
6469 return 0;
6470 }
613b411c 6471
e8a565cb 6472 rt_create->manager = u->manager;
613b411c 6473
e8a565cb 6474 /* Avoid cleanup */
56a13a49 6475 TAKE_PTR(rt_create);
e8a565cb 6476 }
98b47d54 6477
e8a565cb
YW
6478 return 1;
6479}
613b411c 6480
56a13a49
ZJS
6481int exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
6482 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
6483 char *id = NULL;
a70581ff 6484 int r, netns_fdpair[] = {-1, -1}, ipcns_fdpair[] = {-1, -1};
e8a565cb
YW
6485 const char *p, *v = value;
6486 size_t n;
613b411c 6487
e8a565cb
YW
6488 assert(m);
6489 assert(value);
6490 assert(fds);
98b47d54 6491
e8a565cb
YW
6492 n = strcspn(v, " ");
6493 id = strndupa(v, n);
6494 if (v[n] != ' ')
6495 goto finalize;
6496 p = v + n + 1;
6497
6498 v = startswith(p, "tmp-dir=");
6499 if (v) {
6500 n = strcspn(v, " ");
56a13a49
ZJS
6501 tmp_dir = strndup(v, n);
6502 if (!tmp_dir)
6503 return log_oom();
e8a565cb
YW
6504 if (v[n] != ' ')
6505 goto finalize;
6506 p = v + n + 1;
6507 }
6508
6509 v = startswith(p, "var-tmp-dir=");
6510 if (v) {
6511 n = strcspn(v, " ");
56a13a49
ZJS
6512 var_tmp_dir = strndup(v, n);
6513 if (!var_tmp_dir)
6514 return log_oom();
e8a565cb
YW
6515 if (v[n] != ' ')
6516 goto finalize;
6517 p = v + n + 1;
6518 }
6519
6520 v = startswith(p, "netns-socket-0=");
6521 if (v) {
6522 char *buf;
6523
6524 n = strcspn(v, " ");
6525 buf = strndupa(v, n);
c413bb28 6526
a70581ff 6527 r = safe_atoi(buf, &netns_fdpair[0]);
c413bb28
ZJS
6528 if (r < 0)
6529 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-0=%s: %m", buf);
a70581ff 6530 if (!fdset_contains(fds, netns_fdpair[0]))
c413bb28 6531 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6532 "exec-runtime specification netns-socket-0= refers to unknown fd %d: %m", netns_fdpair[0]);
6533 netns_fdpair[0] = fdset_remove(fds, netns_fdpair[0]);
e8a565cb
YW
6534 if (v[n] != ' ')
6535 goto finalize;
6536 p = v + n + 1;
613b411c
LP
6537 }
6538
e8a565cb
YW
6539 v = startswith(p, "netns-socket-1=");
6540 if (v) {
6541 char *buf;
98b47d54 6542
e8a565cb
YW
6543 n = strcspn(v, " ");
6544 buf = strndupa(v, n);
a70581ff
XR
6545
6546 r = safe_atoi(buf, &netns_fdpair[1]);
c413bb28
ZJS
6547 if (r < 0)
6548 return log_debug_errno(r, "Unable to parse exec-runtime specification netns-socket-1=%s: %m", buf);
a70581ff
XR
6549 if (!fdset_contains(fds, netns_fdpair[1]))
6550 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6551 "exec-runtime specification netns-socket-1= refers to unknown fd %d: %m", netns_fdpair[1]);
6552 netns_fdpair[1] = fdset_remove(fds, netns_fdpair[1]);
6553 if (v[n] != ' ')
6554 goto finalize;
6555 p = v + n + 1;
6556 }
6557
6558 v = startswith(p, "ipcns-socket-0=");
6559 if (v) {
6560 char *buf;
6561
6562 n = strcspn(v, " ");
6563 buf = strndupa(v, n);
6564
6565 r = safe_atoi(buf, &ipcns_fdpair[0]);
6566 if (r < 0)
6567 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-0=%s: %m", buf);
6568 if (!fdset_contains(fds, ipcns_fdpair[0]))
6569 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
6570 "exec-runtime specification ipcns-socket-0= refers to unknown fd %d: %m", ipcns_fdpair[0]);
6571 ipcns_fdpair[0] = fdset_remove(fds, ipcns_fdpair[0]);
6572 if (v[n] != ' ')
6573 goto finalize;
6574 p = v + n + 1;
6575 }
6576
6577 v = startswith(p, "ipcns-socket-1=");
6578 if (v) {
6579 char *buf;
6580
6581 n = strcspn(v, " ");
6582 buf = strndupa(v, n);
6583
6584 r = safe_atoi(buf, &ipcns_fdpair[1]);
6585 if (r < 0)
6586 return log_debug_errno(r, "Unable to parse exec-runtime specification ipcns-socket-1=%s: %m", buf);
6587 if (!fdset_contains(fds, ipcns_fdpair[1]))
c413bb28 6588 return log_debug_errno(SYNTHETIC_ERRNO(EBADF),
a70581ff
XR
6589 "exec-runtime specification ipcns-socket-1= refers to unknown fd %d: %m", ipcns_fdpair[1]);
6590 ipcns_fdpair[1] = fdset_remove(fds, ipcns_fdpair[1]);
e8a565cb 6591 }
98b47d54 6592
e8a565cb 6593finalize:
a70581ff 6594 r = exec_runtime_add(m, id, &tmp_dir, &var_tmp_dir, netns_fdpair, ipcns_fdpair, NULL);
7d853ca6 6595 if (r < 0)
56a13a49
ZJS
6596 return log_debug_errno(r, "Failed to add exec-runtime: %m");
6597 return 0;
e8a565cb 6598}
613b411c 6599
e8a565cb
YW
6600void exec_runtime_vacuum(Manager *m) {
6601 ExecRuntime *rt;
e8a565cb
YW
6602
6603 assert(m);
6604
6605 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
6606
90e74a66 6607 HASHMAP_FOREACH(rt, m->exec_runtime_by_id) {
e8a565cb
YW
6608 if (rt->n_ref > 0)
6609 continue;
6610
6611 (void) exec_runtime_free(rt, false);
6612 }
613b411c
LP
6613}
6614
b9c04eaf
YW
6615void exec_params_clear(ExecParameters *p) {
6616 if (!p)
6617 return;
6618
c3f8a065
LP
6619 p->environment = strv_free(p->environment);
6620 p->fd_names = strv_free(p->fd_names);
6621 p->fds = mfree(p->fds);
6622 p->exec_fd = safe_close(p->exec_fd);
b9c04eaf
YW
6623}
6624
bb0c0d6f
LP
6625ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
6626 if (!sc)
6627 return NULL;
6628
6629 free(sc->id);
6630 free(sc->data);
6631 return mfree(sc);
6632}
6633
43144be4
LP
6634ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
6635 if (!lc)
6636 return NULL;
6637
6638 free(lc->id);
6639 free(lc->path);
6640 return mfree(lc);
6641}
6642
bb0c0d6f 6643DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_set_credential_hash_ops, char, string_hash_func, string_compare_func, ExecSetCredential, exec_set_credential_free);
43144be4 6644DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(exec_load_credential_hash_ops, char, string_hash_func, string_compare_func, ExecLoadCredential, exec_load_credential_free);
bb0c0d6f 6645
80876c20
LP
6646static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
6647 [EXEC_INPUT_NULL] = "null",
6648 [EXEC_INPUT_TTY] = "tty",
6649 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 6650 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
6651 [EXEC_INPUT_SOCKET] = "socket",
6652 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 6653 [EXEC_INPUT_DATA] = "data",
2038c3f5 6654 [EXEC_INPUT_FILE] = "file",
80876c20
LP
6655};
6656
8a0867d6
LP
6657DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
6658
94f04347 6659static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 6660 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 6661 [EXEC_OUTPUT_NULL] = "null",
80876c20 6662 [EXEC_OUTPUT_TTY] = "tty",
9a6bca7a 6663 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 6664 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
6665 [EXEC_OUTPUT_JOURNAL] = "journal",
6666 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
6667 [EXEC_OUTPUT_SOCKET] = "socket",
6668 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 6669 [EXEC_OUTPUT_FILE] = "file",
566b7d23 6670 [EXEC_OUTPUT_FILE_APPEND] = "append",
8d7dab1f 6671 [EXEC_OUTPUT_FILE_TRUNCATE] = "truncate",
94f04347
LP
6672};
6673
6674DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
6675
6676static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
6677 [EXEC_UTMP_INIT] = "init",
6678 [EXEC_UTMP_LOGIN] = "login",
6679 [EXEC_UTMP_USER] = "user",
6680};
6681
6682DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
6683
6684static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
6685 [EXEC_PRESERVE_NO] = "no",
6686 [EXEC_PRESERVE_YES] = "yes",
6687 [EXEC_PRESERVE_RESTART] = "restart",
6688};
6689
6690DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 6691
6b7b2ed9 6692/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 6693static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
6694 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
6695 [EXEC_DIRECTORY_STATE] = "StateDirectory",
6696 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
6697 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
6698 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
6699};
6700
6701DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 6702
6b7b2ed9
LP
6703/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
6704 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
6705 * directories, specifically .timer units with their timestamp touch file. */
6706static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6707 [EXEC_DIRECTORY_RUNTIME] = "runtime",
6708 [EXEC_DIRECTORY_STATE] = "state",
6709 [EXEC_DIRECTORY_CACHE] = "cache",
6710 [EXEC_DIRECTORY_LOGS] = "logs",
6711 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
6712};
6713
6714DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
6715
6716/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
6717 * the service payload in. */
fb2042dd
YW
6718static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
6719 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
6720 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
6721 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
6722 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
6723 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
6724};
6725
6726DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
6727
b1edf445
LP
6728static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
6729 [EXEC_KEYRING_INHERIT] = "inherit",
6730 [EXEC_KEYRING_PRIVATE] = "private",
6731 [EXEC_KEYRING_SHARED] = "shared",
6732};
6733
6734DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);