]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
core: introduce exec_directory_is_private() helper function
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
a7334b09 2
034c6ed7
LP
3#include <errno.h>
4#include <fcntl.h>
8dd4c05b
LP
5#include <glob.h>
6#include <grp.h>
7#include <poll.h>
309bff19 8#include <signal.h>
8dd4c05b 9#include <string.h>
19c0b0b9 10#include <sys/capability.h>
d251207d 11#include <sys/eventfd.h>
f3e43635 12#include <sys/mman.h>
8dd4c05b 13#include <sys/personality.h>
94f04347 14#include <sys/prctl.h>
d2ffa389 15#include <sys/shm.h>
8dd4c05b 16#include <sys/socket.h>
451a074f 17#include <sys/stat.h>
d2ffa389 18#include <sys/types.h>
8dd4c05b
LP
19#include <sys/un.h>
20#include <unistd.h>
023a4f67 21#include <utmpx.h>
5cb5a6ff 22
349cc4a5 23#if HAVE_PAM
5b6319dc
LP
24#include <security/pam_appl.h>
25#endif
26
349cc4a5 27#if HAVE_SELINUX
7b52a628
MS
28#include <selinux/selinux.h>
29#endif
30
349cc4a5 31#if HAVE_SECCOMP
17df7223
LP
32#include <seccomp.h>
33#endif
34
349cc4a5 35#if HAVE_APPARMOR
eef65bf3
MS
36#include <sys/apparmor.h>
37#endif
38
24882e06 39#include "sd-messages.h"
8dd4c05b
LP
40
41#include "af-list.h"
b5efdb8a 42#include "alloc-util.h"
349cc4a5 43#if HAVE_APPARMOR
3ffd4af2
LP
44#include "apparmor-util.h"
45#endif
8dd4c05b
LP
46#include "async.h"
47#include "barrier.h"
8dd4c05b 48#include "cap-list.h"
430f0182 49#include "capability-util.h"
a1164ae3 50#include "chown-recursive.h"
da681e1b 51#include "cpu-set-util.h"
f6a6225e 52#include "def.h"
686d13b9 53#include "env-file.h"
4d1a6904 54#include "env-util.h"
17df7223 55#include "errno-list.h"
3ffd4af2 56#include "execute.h"
8dd4c05b 57#include "exit-status.h"
3ffd4af2 58#include "fd-util.h"
f97b34a6 59#include "format-util.h"
f4f15635 60#include "fs-util.h"
7d50b32a 61#include "glob-util.h"
c004493c 62#include "io-util.h"
8dd4c05b 63#include "ioprio.h"
a1164ae3 64#include "label.h"
8dd4c05b
LP
65#include "log.h"
66#include "macro.h"
e8a565cb 67#include "manager.h"
0a970718 68#include "memory-util.h"
8dd4c05b
LP
69#include "missing.h"
70#include "mkdir.h"
71#include "namespace.h"
6bedfcbb 72#include "parse-util.h"
8dd4c05b 73#include "path-util.h"
0b452006 74#include "process-util.h"
78f22b97 75#include "rlimit-util.h"
8dd4c05b 76#include "rm-rf.h"
349cc4a5 77#if HAVE_SECCOMP
3ffd4af2
LP
78#include "seccomp-util.h"
79#endif
07d46372 80#include "securebits-util.h"
8dd4c05b 81#include "selinux-util.h"
24882e06 82#include "signal-util.h"
8dd4c05b 83#include "smack-util.h"
57b7a260 84#include "socket-util.h"
fd63e712 85#include "special.h"
949befd3 86#include "stat-util.h"
8b43440b 87#include "string-table.h"
07630cea 88#include "string-util.h"
8dd4c05b 89#include "strv.h"
7ccbd1ae 90#include "syslog-util.h"
8dd4c05b 91#include "terminal-util.h"
566b7d23 92#include "umask-util.h"
8dd4c05b 93#include "unit.h"
b1d4f8e1 94#include "user-util.h"
8dd4c05b 95#include "utmp-wtmp.h"
5cb5a6ff 96
/* Idle timeouts, expressed in microseconds (5s and 1s respectively). */
#define IDLE_TIMEOUT_USEC (5 * USEC_PER_SEC)
#define IDLE_TIMEOUT2_USEC (1 * USEC_PER_SEC)

/* Send buffer size (8 MiB) requested for the stream socket connected to the journal. */
#define SNDBUF_SIZE (8 * 1024 * 1024)
101
/* Rearranges the descriptors listed in fds[] so that entry number i ends up as file descriptor i+3, i.e.
 * directly behind stdin/stdout/stderr. The array is updated in place with the new descriptor numbers.
 *
 * Returns 0 on success, or a negative errno-style value if duplicating a descriptor fails. */
static int shift_fds(int fds[], size_t n_fds) {
        int first, retry_at;

        if (n_fds <= 0)
                return 0;

        /* Modifies the fds array! (sorts it) */

        assert(fds);

        first = 0;
        for (;;) {
                int idx;

                retry_at = -1;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, copy;

                        /* Nothing to do if the descriptor already sits in its slot */
                        if (fds[idx] == wanted)
                                continue;

                        copy = fcntl(fds[idx], F_DUPFD, wanted);
                        if (copy < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = copy;

                        /* F_DUPFD handed us a higher number because the slot we wanted is still
                         * occupied. Remember the first index where this happened and redo from there. */
                        if (copy != wanted && retry_at < 0)
                                retry_at = idx;
                }

                if (retry_at < 0)
                        break;

                first = retry_at;
        }

        return 0;
}
146
/* Adjusts the flags of the descriptors that are passed on to the child. The first n_socket_fds entries of
 * fds[] get O_NONBLOCK set or cleared according to 'nonblock'; all n_socket_fds + n_storage_fds entries get
 * FD_CLOEXEC cleared.
 *
 * Returns 0 on success, or the negative error reported by fd_nonblock()/fd_cloexec(). */
static int flags_fds(const int fds[], size_t n_socket_fds, size_t n_storage_fds, bool nonblock) {
        size_t total, k;
        int r;

        total = n_socket_fds + n_storage_fds;
        if (total <= 0)
                return 0;

        assert(fds);

        /* Drops/Sets O_NONBLOCK and FD_CLOEXEC from the file flags.
         * O_NONBLOCK only applies to socket activation though. */

        for (k = 0; k < total; k++) {
                bool is_socket = k < n_socket_fds;

                if (is_socket) {
                        r = fd_nonblock(fds[k], nonblock);
                        if (r < 0)
                                return r;
                }

                /* FD_CLOEXEC is dropped unconditionally: the whole point is to hand these
                 * descriptors over to the child. */
                r = fd_cloexec(fds[k], false);
                if (r < 0)
                        return r;
        }

        return 0;
}
179
1e22b5cd 180static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
181 assert(context);
182
1e22b5cd
LP
183 if (context->stdio_as_fds)
184 return NULL;
185
80876c20
LP
186 if (context->tty_path)
187 return context->tty_path;
188
189 return "/dev/console";
190}
191
1e22b5cd
LP
192static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
193 const char *path;
194
6ea832a2
LP
195 assert(context);
196
1e22b5cd 197 path = exec_context_tty_path(context);
6ea832a2 198
1e22b5cd
LP
199 if (context->tty_vhangup) {
200 if (p && p->stdin_fd >= 0)
201 (void) terminal_vhangup_fd(p->stdin_fd);
202 else if (path)
203 (void) terminal_vhangup(path);
204 }
6ea832a2 205
1e22b5cd
LP
206 if (context->tty_reset) {
207 if (p && p->stdin_fd >= 0)
208 (void) reset_terminal_fd(p->stdin_fd, true);
209 else if (path)
210 (void) reset_terminal(path);
211 }
212
213 if (context->tty_vt_disallocate && path)
214 (void) vt_disallocate(path);
6ea832a2
LP
215}
216
6af760f3
LP
217static bool is_terminal_input(ExecInput i) {
218 return IN_SET(i,
219 EXEC_INPUT_TTY,
220 EXEC_INPUT_TTY_FORCE,
221 EXEC_INPUT_TTY_FAIL);
222}
223
3a1286b6 224static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
225 return IN_SET(o,
226 EXEC_OUTPUT_TTY,
227 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
228 EXEC_OUTPUT_KMSG_AND_CONSOLE,
229 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
230}
231
aac8c0c3
LP
232static bool is_syslog_output(ExecOutput o) {
233 return IN_SET(o,
234 EXEC_OUTPUT_SYSLOG,
235 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
236}
237
238static bool is_kmsg_output(ExecOutput o) {
239 return IN_SET(o,
240 EXEC_OUTPUT_KMSG,
241 EXEC_OUTPUT_KMSG_AND_CONSOLE);
242}
243
6af760f3
LP
244static bool exec_context_needs_term(const ExecContext *c) {
245 assert(c);
246
247 /* Return true if the execution context suggests we should set $TERM to something useful. */
248
249 if (is_terminal_input(c->std_input))
250 return true;
251
252 if (is_terminal_output(c->std_output))
253 return true;
254
255 if (is_terminal_output(c->std_error))
256 return true;
257
258 return !!c->tty_path;
3a1286b6
MS
259}
260
/* Opens /dev/null with the given open(2) flags (plus O_NOCTTY) and moves the resulting descriptor to 'nfd'.
 * Returns whatever move_fd() returns, or a negative errno-style value if the open fails. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);
        if (null_fd < 0)
                return -errno;

        return move_fd(null_fd, nfd, false);
}
272
524daa8c 273static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
92a17af9 274 static const union sockaddr_union sa = {
b92bea5d
ZJS
275 .un.sun_family = AF_UNIX,
276 .un.sun_path = "/run/systemd/journal/stdout",
277 };
524daa8c
ZJS
278 uid_t olduid = UID_INVALID;
279 gid_t oldgid = GID_INVALID;
280 int r;
281
cad93f29 282 if (gid_is_valid(gid)) {
524daa8c
ZJS
283 oldgid = getgid();
284
92a17af9 285 if (setegid(gid) < 0)
524daa8c
ZJS
286 return -errno;
287 }
288
cad93f29 289 if (uid_is_valid(uid)) {
524daa8c
ZJS
290 olduid = getuid();
291
92a17af9 292 if (seteuid(uid) < 0) {
524daa8c
ZJS
293 r = -errno;
294 goto restore_gid;
295 }
296 }
297
92a17af9 298 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
524daa8c
ZJS
299
300 /* If we fail to restore the uid or gid, things will likely
301 fail later on. This should only happen if an LSM interferes. */
302
cad93f29 303 if (uid_is_valid(uid))
524daa8c
ZJS
304 (void) seteuid(olduid);
305
306 restore_gid:
cad93f29 307 if (gid_is_valid(gid))
524daa8c
ZJS
308 (void) setegid(oldgid);
309
310 return r;
311}
312
fd1f9c89 313static int connect_logger_as(
34cf6c43 314 const Unit *unit,
fd1f9c89 315 const ExecContext *context,
af635cf3 316 const ExecParameters *params,
fd1f9c89
LP
317 ExecOutput output,
318 const char *ident,
fd1f9c89
LP
319 int nfd,
320 uid_t uid,
321 gid_t gid) {
322
2ac1ff68
EV
323 _cleanup_close_ int fd = -1;
324 int r;
071830ff
LP
325
326 assert(context);
af635cf3 327 assert(params);
80876c20
LP
328 assert(output < _EXEC_OUTPUT_MAX);
329 assert(ident);
330 assert(nfd >= 0);
071830ff 331
54fe0cdb
LP
332 fd = socket(AF_UNIX, SOCK_STREAM, 0);
333 if (fd < 0)
80876c20 334 return -errno;
071830ff 335
524daa8c
ZJS
336 r = connect_journal_socket(fd, uid, gid);
337 if (r < 0)
338 return r;
071830ff 339
2ac1ff68 340 if (shutdown(fd, SHUT_RD) < 0)
80876c20 341 return -errno;
071830ff 342
fd1f9c89 343 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 344
2ac1ff68 345 if (dprintf(fd,
62bca2c6 346 "%s\n"
80876c20
LP
347 "%s\n"
348 "%i\n"
54fe0cdb
LP
349 "%i\n"
350 "%i\n"
351 "%i\n"
4f4a1dbf 352 "%i\n",
c867611e 353 context->syslog_identifier ?: ident,
af635cf3 354 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
355 context->syslog_priority,
356 !!context->syslog_level_prefix,
aac8c0c3
LP
357 is_syslog_output(output),
358 is_kmsg_output(output),
2ac1ff68
EV
359 is_terminal_output(output)) < 0)
360 return -errno;
80876c20 361
2ac1ff68 362 return move_fd(TAKE_FD(fd), nfd, false);
80876c20 363}
2ac1ff68 364
/* Opens the terminal at 'path' with the given flags (plus O_NOCTTY) and moves the descriptor to 'nfd'.
 * Returns the result of move_fd(), or the negative error from open_terminal(). */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);
        if (tty_fd < 0)
                return tty_fd;

        return move_fd(tty_fd, nfd, false);
}
071830ff 377
2038c3f5 378static int acquire_path(const char *path, int flags, mode_t mode) {
15a3e96f
LP
379 union sockaddr_union sa = {};
380 _cleanup_close_ int fd = -1;
381 int r, salen;
071830ff 382
80876c20 383 assert(path);
071830ff 384
2038c3f5
LP
385 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
386 flags |= O_CREAT;
387
388 fd = open(path, flags|O_NOCTTY, mode);
389 if (fd >= 0)
15a3e96f 390 return TAKE_FD(fd);
071830ff 391
2038c3f5
LP
392 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
393 return -errno;
15a3e96f 394 if (strlen(path) >= sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
2038c3f5
LP
395 return -ENXIO;
396
397 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
398
399 fd = socket(AF_UNIX, SOCK_STREAM, 0);
400 if (fd < 0)
401 return -errno;
402
15a3e96f
LP
403 salen = sockaddr_un_set_path(&sa.un, path);
404 if (salen < 0)
405 return salen;
406
407 if (connect(fd, &sa.sa, salen) < 0)
2038c3f5
LP
408 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
409 * indication that his wasn't an AF_UNIX socket after all */
071830ff 410
2038c3f5
LP
411 if ((flags & O_ACCMODE) == O_RDONLY)
412 r = shutdown(fd, SHUT_WR);
413 else if ((flags & O_ACCMODE) == O_WRONLY)
414 r = shutdown(fd, SHUT_RD);
415 else
15a3e96f
LP
416 return TAKE_FD(fd);
417 if (r < 0)
2038c3f5 418 return -errno;
2038c3f5 419
15a3e96f 420 return TAKE_FD(fd);
80876c20 421}
071830ff 422
08f3be7a
LP
423static int fixup_input(
424 const ExecContext *context,
425 int socket_fd,
426 bool apply_tty_stdin) {
427
428 ExecInput std_input;
429
430 assert(context);
431
432 std_input = context->std_input;
1e3ad081
LP
433
434 if (is_terminal_input(std_input) && !apply_tty_stdin)
435 return EXEC_INPUT_NULL;
071830ff 436
03fd9c49 437 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
438 return EXEC_INPUT_NULL;
439
08f3be7a
LP
440 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
441 return EXEC_INPUT_NULL;
442
03fd9c49 443 return std_input;
4f2d528d
LP
444}
445
03fd9c49 446static int fixup_output(ExecOutput std_output, int socket_fd) {
4f2d528d 447
03fd9c49 448 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
449 return EXEC_OUTPUT_INHERIT;
450
03fd9c49 451 return std_output;
4f2d528d
LP
452}
453
a34ceba6
LP
454static int setup_input(
455 const ExecContext *context,
456 const ExecParameters *params,
52c239d7 457 int socket_fd,
2caa38e9 458 const int named_iofds[static 3]) {
a34ceba6 459
4f2d528d
LP
460 ExecInput i;
461
462 assert(context);
a34ceba6 463 assert(params);
2caa38e9 464 assert(named_iofds);
a34ceba6
LP
465
466 if (params->stdin_fd >= 0) {
467 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
468 return -errno;
469
470 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
471 if (isatty(STDIN_FILENO)) {
472 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
473 (void) reset_terminal_fd(STDIN_FILENO, true);
474 }
a34ceba6
LP
475
476 return STDIN_FILENO;
477 }
4f2d528d 478
08f3be7a 479 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
480
481 switch (i) {
071830ff 482
80876c20
LP
483 case EXEC_INPUT_NULL:
484 return open_null_as(O_RDONLY, STDIN_FILENO);
485
486 case EXEC_INPUT_TTY:
487 case EXEC_INPUT_TTY_FORCE:
488 case EXEC_INPUT_TTY_FAIL: {
046a82c1 489 int fd;
071830ff 490
1e22b5cd 491 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
492 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
493 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
494 ACQUIRE_TERMINAL_WAIT,
3a43da28 495 USEC_INFINITY);
970edce6 496 if (fd < 0)
80876c20
LP
497 return fd;
498
046a82c1 499 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
500 }
501
4f2d528d 502 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
503 assert(socket_fd >= 0);
504
4f2d528d
LP
505 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
506
52c239d7 507 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
508 assert(named_iofds[STDIN_FILENO] >= 0);
509
52c239d7
LB
510 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
511 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
512
08f3be7a
LP
513 case EXEC_INPUT_DATA: {
514 int fd;
515
516 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
517 if (fd < 0)
518 return fd;
519
520 return move_fd(fd, STDIN_FILENO, false);
521 }
522
2038c3f5
LP
523 case EXEC_INPUT_FILE: {
524 bool rw;
525 int fd;
526
527 assert(context->stdio_file[STDIN_FILENO]);
528
529 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
530 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
531
532 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
533 if (fd < 0)
534 return fd;
535
536 return move_fd(fd, STDIN_FILENO, false);
537 }
538
80876c20
LP
539 default:
540 assert_not_reached("Unknown input type");
541 }
542}
543
41fc585a
LP
544static bool can_inherit_stderr_from_stdout(
545 const ExecContext *context,
546 ExecOutput o,
547 ExecOutput e) {
548
549 assert(context);
550
551 /* Returns true, if given the specified STDERR and STDOUT output we can directly dup() the stdout fd to the
552 * stderr fd */
553
554 if (e == EXEC_OUTPUT_INHERIT)
555 return true;
556 if (e != o)
557 return false;
558
559 if (e == EXEC_OUTPUT_NAMED_FD)
560 return streq_ptr(context->stdio_fdname[STDOUT_FILENO], context->stdio_fdname[STDERR_FILENO]);
561
562 if (IN_SET(e, EXEC_OUTPUT_FILE, EXEC_OUTPUT_FILE_APPEND))
563 return streq_ptr(context->stdio_file[STDOUT_FILENO], context->stdio_file[STDERR_FILENO]);
564
565 return true;
566}
567
a34ceba6 568static int setup_output(
34cf6c43 569 const Unit *unit,
a34ceba6
LP
570 const ExecContext *context,
571 const ExecParameters *params,
572 int fileno,
573 int socket_fd,
2caa38e9 574 const int named_iofds[static 3],
a34ceba6 575 const char *ident,
7bce046b
LP
576 uid_t uid,
577 gid_t gid,
578 dev_t *journal_stream_dev,
579 ino_t *journal_stream_ino) {
a34ceba6 580
4f2d528d
LP
581 ExecOutput o;
582 ExecInput i;
47c1d80d 583 int r;
4f2d528d 584
f2341e0a 585 assert(unit);
80876c20 586 assert(context);
a34ceba6 587 assert(params);
80876c20 588 assert(ident);
7bce046b
LP
589 assert(journal_stream_dev);
590 assert(journal_stream_ino);
80876c20 591
a34ceba6
LP
592 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
593
594 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
595 return -errno;
596
597 return STDOUT_FILENO;
598 }
599
600 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
601 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
602 return -errno;
603
604 return STDERR_FILENO;
605 }
606
08f3be7a 607 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 608 o = fixup_output(context->std_output, socket_fd);
4f2d528d 609
eb17e935
MS
610 if (fileno == STDERR_FILENO) {
611 ExecOutput e;
612 e = fixup_output(context->std_error, socket_fd);
80876c20 613
eb17e935
MS
614 /* This expects the input and output are already set up */
615
616 /* Don't change the stderr file descriptor if we inherit all
617 * the way and are not on a tty */
618 if (e == EXEC_OUTPUT_INHERIT &&
619 o == EXEC_OUTPUT_INHERIT &&
620 i == EXEC_INPUT_NULL &&
621 !is_terminal_input(context->std_input) &&
622 getppid () != 1)
623 return fileno;
624
625 /* Duplicate from stdout if possible */
41fc585a 626 if (can_inherit_stderr_from_stdout(context, o, e))
eb17e935 627 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 628
eb17e935 629 o = e;
80876c20 630
eb17e935 631 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
632 /* If input got downgraded, inherit the original value */
633 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 634 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 635
08f3be7a
LP
636 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
637 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 638 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 639
acb591e4
LP
640 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
641 if (getppid() != 1)
eb17e935 642 return fileno;
94f04347 643
eb17e935
MS
644 /* We need to open /dev/null here anew, to get the right access mode. */
645 return open_null_as(O_WRONLY, fileno);
071830ff 646 }
94f04347 647
eb17e935 648 switch (o) {
80876c20
LP
649
650 case EXEC_OUTPUT_NULL:
eb17e935 651 return open_null_as(O_WRONLY, fileno);
80876c20
LP
652
653 case EXEC_OUTPUT_TTY:
4f2d528d 654 if (is_terminal_input(i))
eb17e935 655 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
656
657 /* We don't reset the terminal if this is just about output */
1e22b5cd 658 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20
LP
659
660 case EXEC_OUTPUT_SYSLOG:
28dbc1e8 661 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
9a6bca7a 662 case EXEC_OUTPUT_KMSG:
28dbc1e8 663 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
664 case EXEC_OUTPUT_JOURNAL:
665 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 666 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 667 if (r < 0) {
82677ae4 668 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 669 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
670 } else {
671 struct stat st;
672
673 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
674 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
675 * services to detect whether they are connected to the journal or not.
676 *
677 * If both stdout and stderr are connected to a stream then let's make sure to store the data
678 * about STDERR as that's usually the best way to do logging. */
7bce046b 679
ab2116b1
LP
680 if (fstat(fileno, &st) >= 0 &&
681 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
682 *journal_stream_dev = st.st_dev;
683 *journal_stream_ino = st.st_ino;
684 }
47c1d80d
MS
685 }
686 return r;
4f2d528d
LP
687
688 case EXEC_OUTPUT_SOCKET:
689 assert(socket_fd >= 0);
e75a9ed1 690
eb17e935 691 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 692
52c239d7 693 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
694 assert(named_iofds[fileno] >= 0);
695
52c239d7
LB
696 (void) fd_nonblock(named_iofds[fileno], false);
697 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
698
566b7d23
ZD
699 case EXEC_OUTPUT_FILE:
700 case EXEC_OUTPUT_FILE_APPEND: {
2038c3f5 701 bool rw;
566b7d23 702 int fd, flags;
2038c3f5
LP
703
704 assert(context->stdio_file[fileno]);
705
706 rw = context->std_input == EXEC_INPUT_FILE &&
707 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
708
709 if (rw)
710 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
711
566b7d23
ZD
712 flags = O_WRONLY;
713 if (o == EXEC_OUTPUT_FILE_APPEND)
714 flags |= O_APPEND;
715
716 fd = acquire_path(context->stdio_file[fileno], flags, 0666 & ~context->umask);
2038c3f5
LP
717 if (fd < 0)
718 return fd;
719
566b7d23 720 return move_fd(fd, fileno, 0);
2038c3f5
LP
721 }
722
94f04347 723 default:
80876c20 724 assert_not_reached("Unknown error type");
94f04347 725 }
071830ff
LP
726}
727
02a51aba 728static int chown_terminal(int fd, uid_t uid) {
4b3b5bc7 729 int r;
02a51aba
LP
730
731 assert(fd >= 0);
02a51aba 732
1ff74fb6 733 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
4b3b5bc7
LP
734 if (isatty(fd) < 1) {
735 if (IN_SET(errno, EINVAL, ENOTTY))
736 return 0; /* not a tty */
1ff74fb6 737
02a51aba 738 return -errno;
4b3b5bc7 739 }
02a51aba 740
4b3b5bc7
LP
741 /* This might fail. What matters are the results. */
742 r = fchmod_and_chown(fd, TTY_MODE, uid, -1);
743 if (r < 0)
744 return r;
02a51aba 745
4b3b5bc7 746 return 1;
02a51aba
LP
747}
748
7d5ceb64 749static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
750 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
751 int r;
80876c20 752
80876c20
LP
753 assert(_saved_stdin);
754 assert(_saved_stdout);
755
af6da548
LP
756 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
757 if (saved_stdin < 0)
758 return -errno;
80876c20 759
af6da548 760 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
761 if (saved_stdout < 0)
762 return -errno;
80876c20 763
8854d795 764 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
765 if (fd < 0)
766 return fd;
80876c20 767
af6da548
LP
768 r = chown_terminal(fd, getuid());
769 if (r < 0)
3d18b167 770 return r;
02a51aba 771
3d18b167
LP
772 r = reset_terminal_fd(fd, true);
773 if (r < 0)
774 return r;
80876c20 775
2b33ab09 776 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 777 fd = -1;
2b33ab09
LP
778 if (r < 0)
779 return r;
80876c20
LP
780
781 *_saved_stdin = saved_stdin;
782 *_saved_stdout = saved_stdout;
783
3d18b167 784 saved_stdin = saved_stdout = -1;
80876c20 785
3d18b167 786 return 0;
80876c20
LP
787}
788
63d77c92 789static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
790 assert(err < 0);
791
792 if (err == -ETIMEDOUT)
63d77c92 793 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
794 else {
795 errno = -err;
63d77c92 796 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
797 }
798}
799
63d77c92 800static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 801 _cleanup_close_ int fd = -1;
80876c20 802
3b20f877 803 assert(vc);
80876c20 804
7d5ceb64 805 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 806 if (fd < 0)
3b20f877 807 return;
80876c20 808
63d77c92 809 write_confirm_error_fd(err, fd, u);
af6da548 810}
80876c20 811
/* Undoes setup_confirm_stdio(): releases the terminal, puts the saved descriptors back in place as
 * stdin/stdout (where valid) and closes the saved copies, resetting *saved_stdin/*saved_stdout to the value
 * returned by safe_close().
 *
 * Returns 0 on success, or the negative errno-style value of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
833
3b20f877
FB
/* Possible outcomes of ask_for_confirmation() */
enum {
        CONFIRM_PRETEND_FAILURE = -1, /* don't execute the command, pretend it failed */
        CONFIRM_PRETEND_SUCCESS =  0, /* don't execute the command, pretend it succeeded */
        CONFIRM_EXECUTE         =  1, /* go ahead and execute the command */
};
839
eedf223a 840static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 841 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 842 _cleanup_free_ char *e = NULL;
3b20f877 843 char c;
af6da548 844
3b20f877 845 /* For any internal errors, assume a positive response. */
7d5ceb64 846 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 847 if (r < 0) {
63d77c92 848 write_confirm_error(r, vc, u);
3b20f877
FB
849 return CONFIRM_EXECUTE;
850 }
af6da548 851
b0eb2944
FB
852 /* confirm_spawn might have been disabled while we were sleeping. */
853 if (manager_is_confirm_spawn_disabled(u->manager)) {
854 r = 1;
855 goto restore_stdio;
856 }
af6da548 857
2bcd3c26
FB
858 e = ellipsize(cmdline, 60, 100);
859 if (!e) {
860 log_oom();
861 r = CONFIRM_EXECUTE;
862 goto restore_stdio;
863 }
af6da548 864
d172b175 865 for (;;) {
539622bd 866 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 867 if (r < 0) {
63d77c92 868 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
869 r = CONFIRM_EXECUTE;
870 goto restore_stdio;
871 }
af6da548 872
d172b175 873 switch (c) {
b0eb2944
FB
874 case 'c':
875 printf("Resuming normal execution.\n");
876 manager_disable_confirm_spawn();
877 r = 1;
878 break;
dd6f9ac0
FB
879 case 'D':
880 unit_dump(u, stdout, " ");
881 continue; /* ask again */
d172b175
FB
882 case 'f':
883 printf("Failing execution.\n");
884 r = CONFIRM_PRETEND_FAILURE;
885 break;
886 case 'h':
b0eb2944
FB
887 printf(" c - continue, proceed without asking anymore\n"
888 " D - dump, show the state of the unit\n"
dd6f9ac0 889 " f - fail, don't execute the command and pretend it failed\n"
d172b175 890 " h - help\n"
eedf223a 891 " i - info, show a short summary of the unit\n"
56fde33a 892 " j - jobs, show jobs that are in progress\n"
d172b175
FB
893 " s - skip, don't execute the command and pretend it succeeded\n"
894 " y - yes, execute the command\n");
dd6f9ac0 895 continue; /* ask again */
eedf223a
FB
896 case 'i':
897 printf(" Description: %s\n"
898 " Unit: %s\n"
899 " Command: %s\n",
900 u->id, u->description, cmdline);
901 continue; /* ask again */
56fde33a
FB
902 case 'j':
903 manager_dump_jobs(u->manager, stdout, " ");
904 continue; /* ask again */
539622bd
FB
905 case 'n':
906 /* 'n' was removed in favor of 'f'. */
907 printf("Didn't understand 'n', did you mean 'f'?\n");
908 continue; /* ask again */
d172b175
FB
909 case 's':
910 printf("Skipping execution.\n");
911 r = CONFIRM_PRETEND_SUCCESS;
912 break;
913 case 'y':
914 r = CONFIRM_EXECUTE;
915 break;
916 default:
917 assert_not_reached("Unhandled choice");
918 }
3b20f877 919 break;
3b20f877 920 }
af6da548 921
3b20f877 922restore_stdio:
af6da548 923 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 924 return r;
80876c20
LP
925}
926
4d885bd3
DH
927static int get_fixed_user(const ExecContext *c, const char **user,
928 uid_t *uid, gid_t *gid,
929 const char **home, const char **shell) {
81a2b7ce 930 int r;
4d885bd3 931 const char *name;
81a2b7ce 932
4d885bd3 933 assert(c);
81a2b7ce 934
23deef88
LP
935 if (!c->user)
936 return 0;
937
4d885bd3
DH
938 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
939 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 940
23deef88 941 name = c->user;
fafff8f1 942 r = get_user_creds(&name, uid, gid, home, shell, USER_CREDS_CLEAN);
4d885bd3
DH
943 if (r < 0)
944 return r;
81a2b7ce 945
4d885bd3
DH
946 *user = name;
947 return 0;
948}
949
950static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
951 int r;
952 const char *name;
953
954 assert(c);
955
956 if (!c->group)
957 return 0;
958
959 name = c->group;
fafff8f1 960 r = get_group_creds(&name, gid, 0);
4d885bd3
DH
961 if (r < 0)
962 return r;
963
964 *group = name;
965 return 0;
966}
967
cdc5d5c5
DH
968static int get_supplementary_groups(const ExecContext *c, const char *user,
969 const char *group, gid_t gid,
970 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
971 char **i;
972 int r, k = 0;
973 int ngroups_max;
974 bool keep_groups = false;
975 gid_t *groups = NULL;
976 _cleanup_free_ gid_t *l_gids = NULL;
977
978 assert(c);
979
bbeea271
DH
980 /*
981 * If user is given, then lookup GID and supplementary groups list.
982 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
983 * here and as early as possible so we keep the list of supplementary
984 * groups of the caller.
bbeea271
DH
985 */
986 if (user && gid_is_valid(gid) && gid != 0) {
987 /* First step, initialize groups from /etc/groups */
988 if (initgroups(user, gid) < 0)
989 return -errno;
990
991 keep_groups = true;
992 }
993
ac6e8be6 994 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
995 return 0;
996
366ddd25
DH
997 /*
998 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
999 * be positive, otherwise fail.
1000 */
1001 errno = 0;
1002 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
66855de7
LP
1003 if (ngroups_max <= 0)
1004 return errno_or_else(EOPNOTSUPP);
366ddd25 1005
4d885bd3
DH
1006 l_gids = new(gid_t, ngroups_max);
1007 if (!l_gids)
1008 return -ENOMEM;
81a2b7ce 1009
4d885bd3
DH
1010 if (keep_groups) {
1011 /*
1012 * Lookup the list of groups that the user belongs to, we
1013 * avoid NSS lookups here too for gid=0.
1014 */
1015 k = ngroups_max;
1016 if (getgrouplist(user, gid, l_gids, &k) < 0)
1017 return -EINVAL;
1018 } else
1019 k = 0;
81a2b7ce 1020
4d885bd3
DH
1021 STRV_FOREACH(i, c->supplementary_groups) {
1022 const char *g;
81a2b7ce 1023
4d885bd3
DH
1024 if (k >= ngroups_max)
1025 return -E2BIG;
81a2b7ce 1026
4d885bd3 1027 g = *i;
fafff8f1 1028 r = get_group_creds(&g, l_gids+k, 0);
4d885bd3
DH
1029 if (r < 0)
1030 return r;
81a2b7ce 1031
4d885bd3
DH
1032 k++;
1033 }
81a2b7ce 1034
4d885bd3
DH
1035 /*
1036 * Sets ngids to zero to drop all supplementary groups, happens
1037 * when we are under root and SupplementaryGroups= is empty.
1038 */
1039 if (k == 0) {
1040 *ngids = 0;
1041 return 0;
1042 }
81a2b7ce 1043
4d885bd3
DH
1044 /* Otherwise get the final list of supplementary groups */
1045 groups = memdup(l_gids, sizeof(gid_t) * k);
1046 if (!groups)
1047 return -ENOMEM;
1048
1049 *supplementary_gids = groups;
1050 *ngids = k;
1051
1052 groups = NULL;
1053
1054 return 0;
1055}
1056
/* Applies the supplementary group list (if 'ngids' is positive) and then the real, effective and saved GID
 * (if 'gid' is valid). Returns 0 on success, a negative errno-style value otherwise. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {
        int r;

        /* Handle SupplementaryGroups= if it is not empty */
        if (ngids > 0) {
                r = maybe_setgroups(ngids, supplementary_gids);
                if (r < 0)
                        return r;
        }

        if (!gid_is_valid(gid))
                return 0;

        /* Then set our gids */
        if (setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1075
1076static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce
LP
1077 assert(context);
1078
4d885bd3
DH
1079 if (!uid_is_valid(uid))
1080 return 0;
1081
479050b3 1082 /* Sets (but doesn't look up) the uid and make sure we keep the
81a2b7ce
LP
1083 * capabilities while doing so. */
1084
479050b3 1085 if (context->capability_ambient_set != 0) {
81a2b7ce
LP
1086
1087 /* First step: If we need to keep capabilities but
1088 * drop privileges we need to make sure we keep our
cbb21cca 1089 * caps, while we drop privileges. */
693ced48 1090 if (uid != 0) {
cbb21cca 1091 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
693ced48
LP
1092
1093 if (prctl(PR_GET_SECUREBITS) != sb)
1094 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1095 return -errno;
1096 }
81a2b7ce
LP
1097 }
1098
479050b3 1099 /* Second step: actually set the uids */
81a2b7ce
LP
1100 if (setresuid(uid, uid, uid) < 0)
1101 return -errno;
1102
1103 /* At this point we should have all necessary capabilities but
1104 are otherwise a normal user. However, the caps might got
1105 corrupted due to the setresuid() so we need clean them up
1106 later. This is done outside of this call. */
1107
1108 return 0;
1109}
1110
349cc4a5 1111#if HAVE_PAM
5b6319dc
LP
1112
1113static int null_conv(
1114 int num_msg,
1115 const struct pam_message **msg,
1116 struct pam_response **resp,
1117 void *appdata_ptr) {
1118
1119 /* We don't support conversations */
1120
1121 return PAM_CONV_ERR;
1122}
1123
cefc33ae
LP
1124#endif
1125
5b6319dc
LP
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], size_t n_fds) {

        /* Opens a PAM session for PAM service 'name' and user 'user', and forks off a helper process
         * "(sd-pam)" that closes the session again once our process goes away.
         *
         *   uid, gid   - credentials the helper process drops to (best effort)
         *   tty        - TTY path to report to PAM; if NULL, the TTY behind stdin is used, if any
         *   env        - in: environment exported to PAM; out: replaced by PAM's environment list
         *   fds, n_fds - file descriptors that are closed in the helper process
         *
         * On success returns whatever strv_free_and_replace() returns. On failure returns a negative
         * errno-style code: -EPERM for any PAM error, otherwise the error of the failing call. When
         * built without PAM support this is a NOP that returns 0. */

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (!tty) {
                _cleanup_free_ char *q = NULL;

                /* Hmm, so no TTY was explicitly passed, but an fd passed to us directly might be a TTY. Let's figure
                 * out if that's the case, and read the TTY off it. */

                if (getttyname_malloc(STDIN_FILENO, &q) >= 0)
                        tty = strjoina("/dev/", q);
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        /* Export our environment to the PAM stack */
        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From now on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0)
                goto fail;
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        for (;;) {
                                int k;

                                /* Note that sigwait() reports failure by returning a positive error
                                 * number. It neither returns a negative value nor sets errno, hence
                                 * check the return value itself. */
                                k = sigwait(&ss, &sig);
                                if (k == 0) {
                                        assert(sig == SIGTERM);
                                        break;
                                }
                                if (k == EINTR)
                                        continue;

                                /* 'sig' is not valid on failure, bail out without looking at it */
                                goto child_finish;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        return strv_free_and_replace(*env, e);

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1341
5d6b1584
LP
static void rename_process_from_path(const char *path) {
        enum { TAIL_MAX = 8 };
        char buf[1 + TAIL_MAX + 1 + 1]; /* '(' + up to 8 characters + ')' + NUL */
        const char *tail;
        size_t n;

        /* Renames the process to the last path component of 'path', wrapped in parentheses. The result
         * has to fit into 10 characters (i.e. the length of "/sbin/init") to look pretty in /bin/ps,
         * hence at most 8 characters of the name are used. */

        tail = basename(path);
        if (isempty(tail)) {
                rename_process("(...)");
                return;
        }

        n = strlen(tail);
        if (n > TAIL_MAX) {
                /* Keep the end of the name, it is usually more interesting, since the beginning
                 * might just be "systemd-" */
                tail += n - TAIL_MAX;
                n = TAIL_MAX;
        }

        buf[0] = '(';
        memcpy(buf + 1, tail, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1372
469830d1
LP
1373static bool context_has_address_families(const ExecContext *c) {
1374 assert(c);
1375
1376 return c->address_families_whitelist ||
1377 !set_isempty(c->address_families);
1378}
1379
1380static bool context_has_syscall_filters(const ExecContext *c) {
1381 assert(c);
1382
1383 return c->syscall_whitelist ||
8cfa775f 1384 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1385}
1386
1387static bool context_has_no_new_privileges(const ExecContext *c) {
1388 assert(c);
1389
1390 if (c->no_new_privileges)
1391 return true;
1392
1393 if (have_effective_cap(CAP_SYS_ADMIN)) /* if we are privileged, we don't need NNP */
1394 return false;
1395
1396 /* We need NNP if we have any form of seccomp and are unprivileged */
1397 return context_has_address_families(c) ||
1398 c->memory_deny_write_execute ||
1399 c->restrict_realtime ||
f69567cb 1400 c->restrict_suid_sgid ||
469830d1
LP
1401 exec_context_restrict_namespaces_set(c) ||
1402 c->protect_kernel_tunables ||
1403 c->protect_kernel_modules ||
1404 c->private_devices ||
1405 context_has_syscall_filters(c) ||
78e864e5 1406 !set_isempty(c->syscall_archs) ||
aecd5ac6
TM
1407 c->lock_personality ||
1408 c->protect_hostname;
469830d1
LP
1409}
1410
349cc4a5 1411#if HAVE_SECCOMP
17df7223 1412
83f12b27 1413static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1414
1415 if (is_seccomp_available())
1416 return false;
1417
f673b62d 1418 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1419 return true;
83f12b27
FS
1420}
1421
165a31c0 1422static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1423 uint32_t negative_action, default_action, action;
165a31c0 1424 int r;
8351ceae 1425
469830d1 1426 assert(u);
c0467cf3 1427 assert(c);
8351ceae 1428
469830d1 1429 if (!context_has_syscall_filters(c))
83f12b27
FS
1430 return 0;
1431
469830d1
LP
1432 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1433 return 0;
e9642be2 1434
ccc16c78 1435 negative_action = c->syscall_errno == 0 ? scmp_act_kill_process() : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1436
469830d1
LP
1437 if (c->syscall_whitelist) {
1438 default_action = negative_action;
1439 action = SCMP_ACT_ALLOW;
7c66bae2 1440 } else {
469830d1
LP
1441 default_action = SCMP_ACT_ALLOW;
1442 action = negative_action;
57183d11 1443 }
8351ceae 1444
165a31c0
LP
1445 if (needs_ambient_hack) {
1446 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1447 if (r < 0)
1448 return r;
1449 }
1450
b54f36c6 1451 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action, false);
4298d0b5
LP
1452}
1453
469830d1
LP
1454static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1455 assert(u);
4298d0b5
LP
1456 assert(c);
1457
469830d1 1458 if (set_isempty(c->syscall_archs))
83f12b27
FS
1459 return 0;
1460
469830d1
LP
1461 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1462 return 0;
4298d0b5 1463
469830d1
LP
1464 return seccomp_restrict_archs(c->syscall_archs);
1465}
4298d0b5 1466
469830d1
LP
1467static int apply_address_families(const Unit* u, const ExecContext *c) {
1468 assert(u);
1469 assert(c);
4298d0b5 1470
469830d1
LP
1471 if (!context_has_address_families(c))
1472 return 0;
4298d0b5 1473
469830d1
LP
1474 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1475 return 0;
4298d0b5 1476
469830d1 1477 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
8351ceae 1478}
4298d0b5 1479
83f12b27 1480static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1481 assert(u);
f3e43635
TM
1482 assert(c);
1483
469830d1 1484 if (!c->memory_deny_write_execute)
83f12b27
FS
1485 return 0;
1486
469830d1
LP
1487 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1488 return 0;
f3e43635 1489
469830d1 1490 return seccomp_memory_deny_write_execute();
f3e43635
TM
1491}
1492
83f12b27 1493static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1494 assert(u);
f4170c67
LP
1495 assert(c);
1496
469830d1 1497 if (!c->restrict_realtime)
83f12b27
FS
1498 return 0;
1499
469830d1
LP
1500 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1501 return 0;
f4170c67 1502
469830d1 1503 return seccomp_restrict_realtime();
f4170c67
LP
1504}
1505
f69567cb
LP
1506static int apply_restrict_suid_sgid(const Unit* u, const ExecContext *c) {
1507 assert(u);
1508 assert(c);
1509
1510 if (!c->restrict_suid_sgid)
1511 return 0;
1512
1513 if (skip_seccomp_unavailable(u, "RestrictSUIDSGID="))
1514 return 0;
1515
1516 return seccomp_restrict_suid_sgid();
1517}
1518
59e856c7 1519static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1520 assert(u);
59eeb84b
LP
1521 assert(c);
1522
1523 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1524 * let's protect even those systems where this is left on in the kernel. */
1525
469830d1 1526 if (!c->protect_kernel_tunables)
59eeb84b
LP
1527 return 0;
1528
469830d1
LP
1529 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1530 return 0;
59eeb84b 1531
469830d1 1532 return seccomp_protect_sysctl();
59eeb84b
LP
1533}
1534
59e856c7 1535static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1536 assert(u);
502d704e
DH
1537 assert(c);
1538
25a8d8a0 1539 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1540
469830d1
LP
1541 if (!c->protect_kernel_modules)
1542 return 0;
1543
502d704e
DH
1544 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1545 return 0;
1546
b54f36c6 1547 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM), false);
502d704e
DH
1548}
1549
59e856c7 1550static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1551 assert(u);
ba128bb8
LP
1552 assert(c);
1553
8f81a5f6 1554 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1555
469830d1
LP
1556 if (!c->private_devices)
1557 return 0;
1558
ba128bb8
LP
1559 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1560 return 0;
1561
b54f36c6 1562 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM), false);
ba128bb8
LP
1563}
1564
34cf6c43 1565static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1566 assert(u);
add00535
LP
1567 assert(c);
1568
1569 if (!exec_context_restrict_namespaces_set(c))
1570 return 0;
1571
1572 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1573 return 0;
1574
1575 return seccomp_restrict_namespaces(c->restrict_namespaces);
1576}
1577
78e864e5 1578static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1579 unsigned long personality;
1580 int r;
78e864e5
TM
1581
1582 assert(u);
1583 assert(c);
1584
1585 if (!c->lock_personality)
1586 return 0;
1587
1588 if (skip_seccomp_unavailable(u, "LockPersonality="))
1589 return 0;
1590
e8132d63
LP
1591 personality = c->personality;
1592
1593 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1594 if (personality == PERSONALITY_INVALID) {
1595
1596 r = opinionated_personality(&personality);
1597 if (r < 0)
1598 return r;
1599 }
78e864e5
TM
1600
1601 return seccomp_lock_personality(personality);
1602}
1603
c0467cf3 1604#endif
8351ceae 1605
3042bbeb 1606static void do_idle_pipe_dance(int idle_pipe[static 4]) {
31a7eb86
ZJS
1607 assert(idle_pipe);
1608
54eb2300
LP
1609 idle_pipe[1] = safe_close(idle_pipe[1]);
1610 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1611
1612 if (idle_pipe[0] >= 0) {
1613 int r;
1614
1615 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1616
1617 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1618 ssize_t n;
1619
31a7eb86 1620 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1621 n = write(idle_pipe[3], "x", 1);
1622 if (n > 0)
cd972d69
ZJS
1623 /* Wait for systemd to react to the signal above. */
1624 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1625 }
1626
54eb2300 1627 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1628
1629 }
1630
54eb2300 1631 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1632}
1633
fb2042dd
YW
1634static const char *exec_directory_env_name_to_string(ExecDirectoryType t);
1635
7cae38c4 1636static int build_environment(
34cf6c43 1637 const Unit *u,
9fa95f85 1638 const ExecContext *c,
1e22b5cd 1639 const ExecParameters *p,
da6053d0 1640 size_t n_fds,
7cae38c4
LP
1641 const char *home,
1642 const char *username,
1643 const char *shell,
7bce046b
LP
1644 dev_t journal_stream_dev,
1645 ino_t journal_stream_ino,
7cae38c4
LP
1646 char ***ret) {
1647
1648 _cleanup_strv_free_ char **our_env = NULL;
fb2042dd 1649 ExecDirectoryType t;
da6053d0 1650 size_t n_env = 0;
7cae38c4
LP
1651 char *x;
1652
4b58153d 1653 assert(u);
7cae38c4 1654 assert(c);
7c1cb6f1 1655 assert(p);
7cae38c4
LP
1656 assert(ret);
1657
fb2042dd 1658 our_env = new0(char*, 14 + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4
LP
1659 if (!our_env)
1660 return -ENOMEM;
1661
1662 if (n_fds > 0) {
8dd4c05b
LP
1663 _cleanup_free_ char *joined = NULL;
1664
df0ff127 1665 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1666 return -ENOMEM;
1667 our_env[n_env++] = x;
1668
da6053d0 1669 if (asprintf(&x, "LISTEN_FDS=%zu", n_fds) < 0)
7cae38c4
LP
1670 return -ENOMEM;
1671 our_env[n_env++] = x;
8dd4c05b 1672
1e22b5cd 1673 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1674 if (!joined)
1675 return -ENOMEM;
1676
605405c6 1677 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1678 if (!x)
1679 return -ENOMEM;
1680 our_env[n_env++] = x;
7cae38c4
LP
1681 }
1682
b08af3b1 1683 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1684 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1685 return -ENOMEM;
1686 our_env[n_env++] = x;
1687
1e22b5cd 1688 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1689 return -ENOMEM;
1690 our_env[n_env++] = x;
1691 }
1692
fd63e712
LP
1693 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1694 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1695 * check the database directly. */
ac647978 1696 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1697 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1698 if (!x)
1699 return -ENOMEM;
1700 our_env[n_env++] = x;
1701 }
1702
7cae38c4 1703 if (home) {
b910cc72 1704 x = strjoin("HOME=", home);
7cae38c4
LP
1705 if (!x)
1706 return -ENOMEM;
7bbead1d
LP
1707
1708 path_simplify(x + 5, true);
7cae38c4
LP
1709 our_env[n_env++] = x;
1710 }
1711
1712 if (username) {
b910cc72 1713 x = strjoin("LOGNAME=", username);
7cae38c4
LP
1714 if (!x)
1715 return -ENOMEM;
1716 our_env[n_env++] = x;
1717
b910cc72 1718 x = strjoin("USER=", username);
7cae38c4
LP
1719 if (!x)
1720 return -ENOMEM;
1721 our_env[n_env++] = x;
1722 }
1723
1724 if (shell) {
b910cc72 1725 x = strjoin("SHELL=", shell);
7cae38c4
LP
1726 if (!x)
1727 return -ENOMEM;
7bbead1d
LP
1728
1729 path_simplify(x + 6, true);
7cae38c4
LP
1730 our_env[n_env++] = x;
1731 }
1732
4b58153d
LP
1733 if (!sd_id128_is_null(u->invocation_id)) {
1734 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1735 return -ENOMEM;
1736
1737 our_env[n_env++] = x;
1738 }
1739
6af760f3
LP
1740 if (exec_context_needs_term(c)) {
1741 const char *tty_path, *term = NULL;
1742
1743 tty_path = exec_context_tty_path(c);
1744
1745 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1746 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1747 * passes to PID 1 ends up all the way in the console login shown. */
1748
1749 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1750 term = getenv("TERM");
1751 if (!term)
1752 term = default_term_for_tty(tty_path);
7cae38c4 1753
b910cc72 1754 x = strjoin("TERM=", term);
7cae38c4
LP
1755 if (!x)
1756 return -ENOMEM;
1757 our_env[n_env++] = x;
1758 }
1759
7bce046b
LP
1760 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1761 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1762 return -ENOMEM;
1763
1764 our_env[n_env++] = x;
1765 }
1766
fb2042dd
YW
1767 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1768 _cleanup_free_ char *pre = NULL, *joined = NULL;
1769 const char *n;
1770
1771 if (!p->prefix[t])
1772 continue;
1773
1774 if (strv_isempty(c->directories[t].paths))
1775 continue;
1776
1777 n = exec_directory_env_name_to_string(t);
1778 if (!n)
1779 continue;
1780
1781 pre = strjoin(p->prefix[t], "/");
1782 if (!pre)
1783 return -ENOMEM;
1784
1785 joined = strv_join_prefix(c->directories[t].paths, ":", pre);
1786 if (!joined)
1787 return -ENOMEM;
1788
1789 x = strjoin(n, "=", joined);
1790 if (!x)
1791 return -ENOMEM;
1792
1793 our_env[n_env++] = x;
1794 }
1795
7cae38c4 1796 our_env[n_env++] = NULL;
fb2042dd 1797 assert(n_env <= 14 + _EXEC_DIRECTORY_TYPE_MAX);
7cae38c4 1798
ae2a15bc 1799 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1800
1801 return 0;
1802}
1803
b4c14404
FB
1804static int build_pass_environment(const ExecContext *c, char ***ret) {
1805 _cleanup_strv_free_ char **pass_env = NULL;
1806 size_t n_env = 0, n_bufsize = 0;
1807 char **i;
1808
1809 STRV_FOREACH(i, c->pass_environment) {
1810 _cleanup_free_ char *x = NULL;
1811 char *v;
1812
1813 v = getenv(*i);
1814 if (!v)
1815 continue;
605405c6 1816 x = strjoin(*i, "=", v);
b4c14404
FB
1817 if (!x)
1818 return -ENOMEM;
00819cc1 1819
b4c14404
FB
1820 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1821 return -ENOMEM;
00819cc1 1822
1cc6c93a 1823 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 1824 pass_env[n_env] = NULL;
b4c14404
FB
1825 }
1826
ae2a15bc 1827 *ret = TAKE_PTR(pass_env);
b4c14404
FB
1828
1829 return 0;
1830}
1831
8b44a3d2
LP
1832static bool exec_needs_mount_namespace(
1833 const ExecContext *context,
1834 const ExecParameters *params,
4657abb5 1835 const ExecRuntime *runtime) {
8b44a3d2
LP
1836
1837 assert(context);
1838 assert(params);
1839
915e6d16
LP
1840 if (context->root_image)
1841 return true;
1842
2a624c36
AP
1843 if (!strv_isempty(context->read_write_paths) ||
1844 !strv_isempty(context->read_only_paths) ||
1845 !strv_isempty(context->inaccessible_paths))
8b44a3d2
LP
1846 return true;
1847
42b1d8e0 1848 if (context->n_bind_mounts > 0)
d2d6c096
LP
1849 return true;
1850
2abd4e38
YW
1851 if (context->n_temporary_filesystems > 0)
1852 return true;
1853
37ed15d7 1854 if (!IN_SET(context->mount_flags, 0, MS_SHARED))
8b44a3d2
LP
1855 return true;
1856
1857 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1858 return true;
1859
8b44a3d2 1860 if (context->private_devices ||
228af36f 1861 context->private_mounts ||
8b44a3d2 1862 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
1863 context->protect_home != PROTECT_HOME_NO ||
1864 context->protect_kernel_tunables ||
c575770b 1865 context->protect_kernel_modules ||
59eeb84b 1866 context->protect_control_groups)
8b44a3d2
LP
1867 return true;
1868
37c56f89
YW
1869 if (context->root_directory) {
1870 ExecDirectoryType t;
1871
1872 if (context->mount_apivfs)
1873 return true;
1874
1875 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
1876 if (!params->prefix[t])
1877 continue;
1878
1879 if (!strv_isempty(context->directories[t].paths))
1880 return true;
1881 }
1882 }
5d997827 1883
42b1d8e0 1884 if (context->dynamic_user &&
b43ee82f 1885 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
42b1d8e0
YW
1886 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1887 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1888 return true;
1889
8b44a3d2
LP
1890 return false;
1891}
1892
d251207d
LP
1893static int setup_private_users(uid_t uid, gid_t gid) {
1894 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1895 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1896 _cleanup_close_ int unshare_ready_fd = -1;
1897 _cleanup_(sigkill_waitp) pid_t pid = 0;
1898 uint64_t c = 1;
d251207d
LP
1899 ssize_t n;
1900 int r;
1901
1902 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1903 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1904 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1905 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1906 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1907 * continues execution normally. */
1908
587ab01b
ZJS
1909 if (uid != 0 && uid_is_valid(uid)) {
1910 r = asprintf(&uid_map,
1911 "0 0 1\n" /* Map root → root */
1912 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1913 uid, uid);
1914 if (r < 0)
1915 return -ENOMEM;
1916 } else {
e0f3720e 1917 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
587ab01b
ZJS
1918 if (!uid_map)
1919 return -ENOMEM;
1920 }
d251207d 1921
587ab01b
ZJS
1922 if (gid != 0 && gid_is_valid(gid)) {
1923 r = asprintf(&gid_map,
1924 "0 0 1\n" /* Map root → root */
1925 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1926 gid, gid);
1927 if (r < 0)
1928 return -ENOMEM;
1929 } else {
d251207d 1930 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
587ab01b
ZJS
1931 if (!gid_map)
1932 return -ENOMEM;
1933 }
d251207d
LP
1934
1935 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1936 * namespace. */
1937 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1938 if (unshare_ready_fd < 0)
1939 return -errno;
1940
1941 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1942 * failed. */
1943 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1944 return -errno;
1945
4c253ed1
LP
1946 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1947 if (r < 0)
1948 return r;
1949 if (r == 0) {
d251207d
LP
1950 _cleanup_close_ int fd = -1;
1951 const char *a;
1952 pid_t ppid;
1953
1954 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1955 * here, after the parent opened its own user namespace. */
1956
1957 ppid = getppid();
1958 errno_pipe[0] = safe_close(errno_pipe[0]);
1959
1960 /* Wait until the parent unshared the user namespace */
1961 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1962 r = -errno;
1963 goto child_fail;
1964 }
1965
1966 /* Disable the setgroups() system call in the child user namespace, for good. */
1967 a = procfs_file_alloca(ppid, "setgroups");
1968 fd = open(a, O_WRONLY|O_CLOEXEC);
1969 if (fd < 0) {
1970 if (errno != ENOENT) {
1971 r = -errno;
1972 goto child_fail;
1973 }
1974
1975 /* If the file is missing the kernel is too old, let's continue anyway. */
1976 } else {
1977 if (write(fd, "deny\n", 5) < 0) {
1978 r = -errno;
1979 goto child_fail;
1980 }
1981
1982 fd = safe_close(fd);
1983 }
1984
1985 /* First write the GID map */
1986 a = procfs_file_alloca(ppid, "gid_map");
1987 fd = open(a, O_WRONLY|O_CLOEXEC);
1988 if (fd < 0) {
1989 r = -errno;
1990 goto child_fail;
1991 }
1992 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1993 r = -errno;
1994 goto child_fail;
1995 }
1996 fd = safe_close(fd);
1997
1998 /* The write the UID map */
1999 a = procfs_file_alloca(ppid, "uid_map");
2000 fd = open(a, O_WRONLY|O_CLOEXEC);
2001 if (fd < 0) {
2002 r = -errno;
2003 goto child_fail;
2004 }
2005 if (write(fd, uid_map, strlen(uid_map)) < 0) {
2006 r = -errno;
2007 goto child_fail;
2008 }
2009
2010 _exit(EXIT_SUCCESS);
2011
2012 child_fail:
2013 (void) write(errno_pipe[1], &r, sizeof(r));
2014 _exit(EXIT_FAILURE);
2015 }
2016
2017 errno_pipe[1] = safe_close(errno_pipe[1]);
2018
2019 if (unshare(CLONE_NEWUSER) < 0)
2020 return -errno;
2021
2022 /* Let the child know that the namespace is ready now */
2023 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
2024 return -errno;
2025
2026 /* Try to read an error code from the child */
2027 n = read(errno_pipe[0], &r, sizeof(r));
2028 if (n < 0)
2029 return -errno;
2030 if (n == sizeof(r)) { /* an error code was sent to us */
2031 if (r < 0)
2032 return r;
2033 return -EIO;
2034 }
2035 if (n != 0) /* on success we should have read 0 bytes */
2036 return -EIO;
2037
2e87a1fd
LP
2038 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
2039 pid = 0;
d251207d
LP
2040 if (r < 0)
2041 return r;
2e87a1fd 2042 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
2043 return -EIO;
2044
2045 return 0;
2046}
2047
494d0247
YW
2048static bool exec_directory_is_private(const ExecContext *context, ExecDirectoryType type) {
2049 if (!context->dynamic_user)
2050 return false;
2051
2052 if (type == EXEC_DIRECTORY_CONFIGURATION)
2053 return false;
2054
2055 if (type == EXEC_DIRECTORY_RUNTIME && context->runtime_directory_preserve_mode == EXEC_PRESERVE_NO)
2056 return false;
2057
2058 return true;
2059}
2060
3536f49e 2061static int setup_exec_directory(
07689d5d
LP
2062 const ExecContext *context,
2063 const ExecParameters *params,
2064 uid_t uid,
3536f49e 2065 gid_t gid,
3536f49e
YW
2066 ExecDirectoryType type,
2067 int *exit_status) {
07689d5d 2068
72fd1768 2069 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
2070 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
2071 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
2072 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
2073 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
2074 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
2075 };
07689d5d
LP
2076 char **rt;
2077 int r;
2078
2079 assert(context);
2080 assert(params);
72fd1768 2081 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 2082 assert(exit_status);
07689d5d 2083
3536f49e
YW
2084 if (!params->prefix[type])
2085 return 0;
2086
8679efde 2087 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
2088 if (!uid_is_valid(uid))
2089 uid = 0;
2090 if (!gid_is_valid(gid))
2091 gid = 0;
2092 }
2093
2094 STRV_FOREACH(rt, context->directories[type].paths) {
6c47cd7d 2095 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 2096
edbfeb12 2097 p = path_join(params->prefix[type], *rt);
3536f49e
YW
2098 if (!p) {
2099 r = -ENOMEM;
2100 goto fail;
2101 }
07689d5d 2102
23a7448e
YW
2103 r = mkdir_parents_label(p, 0755);
2104 if (r < 0)
3536f49e 2105 goto fail;
23a7448e 2106
494d0247 2107 if (exec_directory_is_private(context, type)) {
6c9c51e5 2108 _cleanup_free_ char *private_root = NULL;
6c47cd7d 2109
3f5b1508
LP
2110 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that
2111 * case we want to avoid leaving a directory around fully accessible that is owned by
2112 * a dynamic user whose UID is later on reused. To lock this down we use the same
2113 * trick used by container managers to prohibit host users to get access to files of
2114 * the same UID in containers: we place everything inside a directory that has an
2115 * access mode of 0700 and is owned root:root, so that it acts as security boundary
2116 * for unprivileged host code. We then use fs namespacing to make this directory
2117 * permeable for the service itself.
6c47cd7d 2118 *
3f5b1508
LP
2119 * Specifically: for a service which wants a special directory "foo/" we first create
2120 * a directory "private/" with access mode 0700 owned by root:root. Then we place
2121 * "foo" inside of that directory (i.e. "private/foo/"), and make "foo" a symlink to
2122 * "private/foo". This way, privileged host users can access "foo/" as usual, but
2123 * unprivileged host users can't look into it. Inside of the namespace of the unit
2124 * "private/" is replaced by a more liberally accessible tmpfs, into which the host's
2125 * "private/foo/" is mounted under the same name, thus disabling the access boundary
2126 * for the service and making sure it only gets access to the dirs it needs but no
2127 * others. Tricky? Yes, absolutely, but it works!
6c47cd7d 2128 *
3f5b1508
LP
2129 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not
2130 * to be owned by the service itself.
2131 *
2132 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used
2133 * for sharing files or sockets with other services. */
6c47cd7d 2134
edbfeb12 2135 private_root = path_join(params->prefix[type], "private");
6c47cd7d
LP
2136 if (!private_root) {
2137 r = -ENOMEM;
2138 goto fail;
2139 }
2140
2141 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
37c1d5e9 2142 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2143 if (r < 0)
2144 goto fail;
2145
edbfeb12 2146 pp = path_join(private_root, *rt);
6c47cd7d
LP
2147 if (!pp) {
2148 r = -ENOMEM;
2149 goto fail;
2150 }
2151
2152 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2153 r = mkdir_parents_label(pp, 0755);
2154 if (r < 0)
2155 goto fail;
2156
949befd3
LP
2157 if (is_dir(p, false) > 0 &&
2158 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2159
2160 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2161 * it over. Most likely the service has been upgraded from one that didn't use
2162 * DynamicUser=1, to one that does. */
2163
cf52c45d
LP
2164 log_info("Found pre-existing public %s= directory %s, migrating to %s.\n"
2165 "Apparently, service previously had DynamicUser= turned off, and has now turned it on.",
2166 exec_directory_type_to_string(type), p, pp);
2167
949befd3
LP
2168 if (rename(p, pp) < 0) {
2169 r = -errno;
2170 goto fail;
2171 }
2172 } else {
2173 /* Otherwise, create the actual directory for the service */
2174
2175 r = mkdir_label(pp, context->directories[type].mode);
2176 if (r < 0 && r != -EEXIST)
2177 goto fail;
2178 }
6c47cd7d 2179
6c47cd7d 2180 /* And link it up from the original place */
6c9c51e5 2181 r = symlink_idempotent(pp, p, true);
6c47cd7d
LP
2182 if (r < 0)
2183 goto fail;
2184
6c47cd7d 2185 } else {
5c6d40d1
LP
2186 _cleanup_free_ char *target = NULL;
2187
2188 if (type != EXEC_DIRECTORY_CONFIGURATION &&
2189 readlink_and_make_absolute(p, &target) >= 0) {
2190 _cleanup_free_ char *q = NULL;
2191
2192 /* This already exists and is a symlink? Interesting. Maybe it's one created
2193f17c
LP
2193 * by DynamicUser=1 (see above)?
2194 *
2195 * We do this for all directory types except for ConfigurationDirectory=,
2196 * since they all support the private/ symlink logic at least in some
2197 * configurations, see above. */
5c6d40d1
LP
2198
2199 q = path_join(params->prefix[type], "private", *rt);
2200 if (!q) {
2201 r = -ENOMEM;
2202 goto fail;
2203 }
2204
2205 if (path_equal(q, target)) {
2206
2207 /* Hmm, apparently DynamicUser= was once turned on for this service,
2208 * but is no longer. Let's move the directory back up. */
2209
cf52c45d
LP
2210 log_info("Found pre-existing private %s= directory %s, migrating to %s.\n"
2211 "Apparently, service previously had DynamicUser= turned on, and has now turned it off.",
2212 exec_directory_type_to_string(type), q, p);
2213
5c6d40d1
LP
2214 if (unlink(p) < 0) {
2215 r = -errno;
2216 goto fail;
2217 }
2218
2219 if (rename(q, p) < 0) {
2220 r = -errno;
2221 goto fail;
2222 }
2223 }
2224 }
2225
6c47cd7d 2226 r = mkdir_label(p, context->directories[type].mode);
d484580c 2227 if (r < 0) {
d484580c
LP
2228 if (r != -EEXIST)
2229 goto fail;
2230
206e9864
LP
2231 if (type == EXEC_DIRECTORY_CONFIGURATION) {
2232 struct stat st;
2233
2234 /* Don't change the owner/access mode of the configuration directory,
2235 * as in the common case it is not written to by a service, and shall
2236 * not be writable. */
2237
2238 if (stat(p, &st) < 0) {
2239 r = -errno;
2240 goto fail;
2241 }
2242
2243 /* Still complain if the access mode doesn't match */
2244 if (((st.st_mode ^ context->directories[type].mode) & 07777) != 0)
2245 log_warning("%s \'%s\' already exists but the mode is different. "
2246 "(File system: %o %sMode: %o)",
2247 exec_directory_type_to_string(type), *rt,
2248 st.st_mode & 07777, exec_directory_type_to_string(type), context->directories[type].mode & 07777);
2249
6cff72eb 2250 continue;
206e9864 2251 }
6cff72eb 2252 }
a1164ae3 2253 }
07689d5d 2254
206e9864 2255 /* Lock down the access mode (we use chmod_and_chown() to make this idempotent. We don't
5238e957 2256 * specify UID/GID here, so that path_chown_recursive() can optimize things depending on the
206e9864
LP
2257 * current UID/GID ownership.) */
2258 r = chmod_and_chown(pp ?: p, context->directories[type].mode, UID_INVALID, GID_INVALID);
2259 if (r < 0)
2260 goto fail;
c71b2eb7 2261
607b358e
LP
2262 /* Then, change the ownership of the whole tree, if necessary. When dynamic users are used we
2263 * drop the suid/sgid bits, since we really don't want SUID/SGID files for dynamic UID/GID
2264 * assignments to exist.*/
2265 r = path_chown_recursive(pp ?: p, uid, gid, context->dynamic_user ? 01777 : 07777);
07689d5d 2266 if (r < 0)
3536f49e 2267 goto fail;
07689d5d
LP
2268 }
2269
2270 return 0;
3536f49e
YW
2271
2272fail:
2273 *exit_status = exit_status_table[type];
3536f49e 2274 return r;
07689d5d
LP
2275}
2276
#if ENABLE_SMACK
/* Applies a SMACK process label via mac_smack_apply_pid() with PID 0 (presumably: the calling process).
 * If the context configures a label, that one is used. Otherwise, and only if the build defines
 * SMACK_DEFAULT_PROCESS_LABEL, the SMACK_ATTR_EXEC label read from the command's binary is used, falling
 * back to SMACK_DEFAULT_PROCESS_LABEL if none could be read.
 *
 * Returns 0 on success, or the negative error returned by the SMACK helpers. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        int r;

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                r = mac_smack_apply_pid(0, context->smack_process_label);
                if (r < 0)
                        return r;
        }
#ifdef SMACK_DEFAULT_PROCESS_LABEL
        else {
                _cleanup_free_ char *exec_label = NULL;

                /* A missing label (-ENODATA) or missing support (-EOPNOTSUPP) is not an error here, we
                 * simply use the default label then. */
                r = mac_smack_read(command->path, SMACK_ATTR_EXEC, &exec_label);
                if (r < 0 && !IN_SET(r, -ENODATA, -EOPNOTSUPP))
                        return r;

                r = mac_smack_apply_pid(0, exec_label ? : SMACK_DEFAULT_PROCESS_LABEL);
                if (r < 0)
                        return r;
        }
#endif

        return 0;
}
#endif
cefc33ae 2309
6c47cd7d
LP
2310static int compile_bind_mounts(
2311 const ExecContext *context,
2312 const ExecParameters *params,
2313 BindMount **ret_bind_mounts,
da6053d0 2314 size_t *ret_n_bind_mounts,
6c47cd7d
LP
2315 char ***ret_empty_directories) {
2316
2317 _cleanup_strv_free_ char **empty_directories = NULL;
2318 BindMount *bind_mounts;
da6053d0 2319 size_t n, h = 0, i;
6c47cd7d
LP
2320 ExecDirectoryType t;
2321 int r;
2322
2323 assert(context);
2324 assert(params);
2325 assert(ret_bind_mounts);
2326 assert(ret_n_bind_mounts);
2327 assert(ret_empty_directories);
2328
2329 n = context->n_bind_mounts;
2330 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2331 if (!params->prefix[t])
2332 continue;
2333
2334 n += strv_length(context->directories[t].paths);
2335 }
2336
2337 if (n <= 0) {
2338 *ret_bind_mounts = NULL;
2339 *ret_n_bind_mounts = 0;
2340 *ret_empty_directories = NULL;
2341 return 0;
2342 }
2343
2344 bind_mounts = new(BindMount, n);
2345 if (!bind_mounts)
2346 return -ENOMEM;
2347
a8cabc61 2348 for (i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
2349 BindMount *item = context->bind_mounts + i;
2350 char *s, *d;
2351
2352 s = strdup(item->source);
2353 if (!s) {
2354 r = -ENOMEM;
2355 goto finish;
2356 }
2357
2358 d = strdup(item->destination);
2359 if (!d) {
2360 free(s);
2361 r = -ENOMEM;
2362 goto finish;
2363 }
2364
2365 bind_mounts[h++] = (BindMount) {
2366 .source = s,
2367 .destination = d,
2368 .read_only = item->read_only,
2369 .recursive = item->recursive,
2370 .ignore_enoent = item->ignore_enoent,
2371 };
2372 }
2373
2374 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2375 char **suffix;
2376
2377 if (!params->prefix[t])
2378 continue;
2379
2380 if (strv_isempty(context->directories[t].paths))
2381 continue;
2382
494d0247 2383 if (exec_directory_is_private(context, t) &&
5609f688 2384 !(context->root_directory || context->root_image)) {
6c47cd7d
LP
2385 char *private_root;
2386
2387 /* So this is for a dynamic user, and we need to make sure the process can access its own
2388 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2389 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2390
657ee2d8 2391 private_root = path_join(params->prefix[t], "private");
6c47cd7d
LP
2392 if (!private_root) {
2393 r = -ENOMEM;
2394 goto finish;
2395 }
2396
2397 r = strv_consume(&empty_directories, private_root);
a635a7ae 2398 if (r < 0)
6c47cd7d 2399 goto finish;
6c47cd7d
LP
2400 }
2401
2402 STRV_FOREACH(suffix, context->directories[t].paths) {
2403 char *s, *d;
2404
494d0247 2405 if (exec_directory_is_private(context, t))
657ee2d8 2406 s = path_join(params->prefix[t], "private", *suffix);
6c47cd7d 2407 else
657ee2d8 2408 s = path_join(params->prefix[t], *suffix);
6c47cd7d
LP
2409 if (!s) {
2410 r = -ENOMEM;
2411 goto finish;
2412 }
2413
494d0247 2414 if (exec_directory_is_private(context, t) &&
5609f688
YW
2415 (context->root_directory || context->root_image))
2416 /* When RootDirectory= or RootImage= are set, then the symbolic link to the private
2417 * directory is not created on the root directory. So, let's bind-mount the directory
2418 * on the 'non-private' place. */
657ee2d8 2419 d = path_join(params->prefix[t], *suffix);
5609f688
YW
2420 else
2421 d = strdup(s);
6c47cd7d
LP
2422 if (!d) {
2423 free(s);
2424 r = -ENOMEM;
2425 goto finish;
2426 }
2427
2428 bind_mounts[h++] = (BindMount) {
2429 .source = s,
2430 .destination = d,
2431 .read_only = false,
9ce4e4b0 2432 .nosuid = context->dynamic_user, /* don't allow suid/sgid when DynamicUser= is on */
6c47cd7d
LP
2433 .recursive = true,
2434 .ignore_enoent = false,
2435 };
2436 }
2437 }
2438
2439 assert(h == n);
2440
2441 *ret_bind_mounts = bind_mounts;
2442 *ret_n_bind_mounts = n;
ae2a15bc 2443 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
2444
2445 return (int) n;
2446
2447finish:
2448 bind_mount_free_many(bind_mounts, h);
2449 return r;
2450}
2451
6818c54c 2452static int apply_mount_namespace(
34cf6c43
YW
2453 const Unit *u,
2454 const ExecCommand *command,
6818c54c
LP
2455 const ExecContext *context,
2456 const ExecParameters *params,
7cc5ef5f
ZJS
2457 const ExecRuntime *runtime,
2458 char **error_path) {
6818c54c 2459
7bcef4ef 2460 _cleanup_strv_free_ char **empty_directories = NULL;
93c6bb51 2461 char *tmp = NULL, *var = NULL;
915e6d16 2462 const char *root_dir = NULL, *root_image = NULL;
228af36f 2463 NamespaceInfo ns_info;
165a31c0 2464 bool needs_sandboxing;
6c47cd7d 2465 BindMount *bind_mounts = NULL;
da6053d0 2466 size_t n_bind_mounts = 0;
6818c54c 2467 int r;
93c6bb51 2468
2b3c1b9e
DH
2469 assert(context);
2470
93c6bb51
DH
2471 /* The runtime struct only contains the parent of the private /tmp,
2472 * which is non-accessible to world users. Inside of it there's a /tmp
2473 * that is sticky, and that's the one we want to use here. */
2474
2475 if (context->private_tmp && runtime) {
2476 if (runtime->tmp_dir)
2477 tmp = strjoina(runtime->tmp_dir, "/tmp");
2478 if (runtime->var_tmp_dir)
2479 var = strjoina(runtime->var_tmp_dir, "/tmp");
2480 }
2481
915e6d16
LP
2482 if (params->flags & EXEC_APPLY_CHROOT) {
2483 root_image = context->root_image;
2484
2485 if (!root_image)
2486 root_dir = context->root_directory;
2487 }
93c6bb51 2488
6c47cd7d
LP
2489 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2490 if (r < 0)
2491 return r;
2492
165a31c0 2493 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
b5a33299
YW
2494 if (needs_sandboxing)
2495 ns_info = (NamespaceInfo) {
2496 .ignore_protect_paths = false,
2497 .private_dev = context->private_devices,
2498 .protect_control_groups = context->protect_control_groups,
2499 .protect_kernel_tunables = context->protect_kernel_tunables,
2500 .protect_kernel_modules = context->protect_kernel_modules,
aecd5ac6 2501 .protect_hostname = context->protect_hostname,
b5a33299 2502 .mount_apivfs = context->mount_apivfs,
228af36f 2503 .private_mounts = context->private_mounts,
b5a33299 2504 };
228af36f
LP
2505 else if (!context->dynamic_user && root_dir)
2506 /*
2507 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2508 * sandbox info, otherwise enforce it, don't ignore protected paths and
2509 * fail if we are enable to apply the sandbox inside the mount namespace.
2510 */
2511 ns_info = (NamespaceInfo) {
2512 .ignore_protect_paths = true,
2513 };
2514 else
2515 ns_info = (NamespaceInfo) {};
b5a33299 2516
37ed15d7
FB
2517 if (context->mount_flags == MS_SHARED)
2518 log_unit_debug(u, "shared mount propagation hidden by other fs namespacing unit settings: ignoring");
2519
915e6d16 2520 r = setup_namespace(root_dir, root_image,
7bcef4ef 2521 &ns_info, context->read_write_paths,
165a31c0
LP
2522 needs_sandboxing ? context->read_only_paths : NULL,
2523 needs_sandboxing ? context->inaccessible_paths : NULL,
6c47cd7d
LP
2524 empty_directories,
2525 bind_mounts,
2526 n_bind_mounts,
2abd4e38
YW
2527 context->temporary_filesystems,
2528 context->n_temporary_filesystems,
93c6bb51
DH
2529 tmp,
2530 var,
165a31c0
LP
2531 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2532 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
915e6d16 2533 context->mount_flags,
7cc5ef5f
ZJS
2534 DISSECT_IMAGE_DISCARD_ON_LOOP,
2535 error_path);
93c6bb51 2536
6c47cd7d
LP
2537 bind_mount_free_many(bind_mounts, n_bind_mounts);
2538
1beab8b0 2539 /* If we couldn't set up the namespace this is probably due to a missing capability. setup_namespace() reports
5238e957 2540 * that with a special, recognizable error ENOANO. In this case, silently proceed, but only if exclusively
1beab8b0
LP
2541 * sandboxing options were used, i.e. nothing such as RootDirectory= or BindMount= that would result in a
2542 * completely different execution environment. */
aca835ed
YW
2543 if (r == -ENOANO) {
2544 if (n_bind_mounts == 0 &&
2545 context->n_temporary_filesystems == 0 &&
2546 !root_dir && !root_image &&
2547 !context->dynamic_user) {
2548 log_unit_debug(u, "Failed to set up namespace, assuming containerized execution and ignoring.");
2549 return 0;
2550 }
2551
2194547e
LP
2552 log_unit_debug(u, "Failed to set up namespace, and refusing to continue since the selected namespacing options alter mount environment non-trivially.\n"
2553 "Bind mounts: %zu, temporary filesystems: %zu, root directory: %s, root image: %s, dynamic user: %s",
2554 n_bind_mounts, context->n_temporary_filesystems, yes_no(root_dir), yes_no(root_image), yes_no(context->dynamic_user));
2555
aca835ed 2556 return -EOPNOTSUPP;
93c6bb51
DH
2557 }
2558
2559 return r;
2560}
2561
915e6d16
LP
2562static int apply_working_directory(
2563 const ExecContext *context,
2564 const ExecParameters *params,
2565 const char *home,
376fecf6 2566 int *exit_status) {
915e6d16 2567
6732edab 2568 const char *d, *wd;
2b3c1b9e
DH
2569
2570 assert(context);
376fecf6 2571 assert(exit_status);
2b3c1b9e 2572
6732edab
LP
2573 if (context->working_directory_home) {
2574
376fecf6
LP
2575 if (!home) {
2576 *exit_status = EXIT_CHDIR;
6732edab 2577 return -ENXIO;
376fecf6 2578 }
6732edab 2579
2b3c1b9e 2580 wd = home;
6732edab
LP
2581
2582 } else if (context->working_directory)
2b3c1b9e
DH
2583 wd = context->working_directory;
2584 else
2585 wd = "/";
e7f1e7c6 2586
fa97f630 2587 if (params->flags & EXEC_APPLY_CHROOT)
2b3c1b9e 2588 d = wd;
fa97f630 2589 else
3b0e5bb5 2590 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 2591
376fecf6
LP
2592 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2593 *exit_status = EXIT_CHDIR;
2b3c1b9e 2594 return -errno;
376fecf6 2595 }
e7f1e7c6
DH
2596
2597 return 0;
2598}
2599
fa97f630
JB
2600static int apply_root_directory(
2601 const ExecContext *context,
2602 const ExecParameters *params,
2603 const bool needs_mount_ns,
2604 int *exit_status) {
2605
2606 assert(context);
2607 assert(exit_status);
2608
2609 if (params->flags & EXEC_APPLY_CHROOT) {
2610 if (!needs_mount_ns && context->root_directory)
2611 if (chroot(context->root_directory) < 0) {
2612 *exit_status = EXIT_CHROOT;
2613 return -errno;
2614 }
2615 }
2616
2617 return 0;
2618}
2619
b1edf445 2620static int setup_keyring(
34cf6c43 2621 const Unit *u,
b1edf445
LP
2622 const ExecContext *context,
2623 const ExecParameters *p,
2624 uid_t uid, gid_t gid) {
2625
74dd6b51 2626 key_serial_t keyring;
e64c2d0b
DJL
2627 int r = 0;
2628 uid_t saved_uid;
2629 gid_t saved_gid;
74dd6b51
LP
2630
2631 assert(u);
b1edf445 2632 assert(context);
74dd6b51
LP
2633 assert(p);
2634
2635 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2636 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2637 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2638 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2639 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2640 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2641
b1edf445
LP
2642 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2643 return 0;
2644
e64c2d0b
DJL
2645 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2646 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2647 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2648 * & group is just as nasty as acquiring a reference to the user keyring. */
2649
2650 saved_uid = getuid();
2651 saved_gid = getgid();
2652
2653 if (gid_is_valid(gid) && gid != saved_gid) {
2654 if (setregid(gid, -1) < 0)
2655 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2656 }
2657
2658 if (uid_is_valid(uid) && uid != saved_uid) {
2659 if (setreuid(uid, -1) < 0) {
2660 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2661 goto out;
2662 }
2663 }
2664
74dd6b51
LP
2665 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2666 if (keyring == -1) {
2667 if (errno == ENOSYS)
8002fb97 2668 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
74dd6b51 2669 else if (IN_SET(errno, EACCES, EPERM))
8002fb97 2670 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 2671 else if (errno == EDQUOT)
8002fb97 2672 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 2673 else
e64c2d0b 2674 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 2675
e64c2d0b 2676 goto out;
74dd6b51
LP
2677 }
2678
e64c2d0b
DJL
2679 /* When requested link the user keyring into the session keyring. */
2680 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2681
2682 if (keyctl(KEYCTL_LINK,
2683 KEY_SPEC_USER_KEYRING,
2684 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2685 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2686 goto out;
2687 }
2688 }
2689
2690 /* Restore uid/gid back */
2691 if (uid_is_valid(uid) && uid != saved_uid) {
2692 if (setreuid(saved_uid, -1) < 0) {
2693 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2694 goto out;
2695 }
2696 }
2697
2698 if (gid_is_valid(gid) && gid != saved_gid) {
2699 if (setregid(saved_gid, -1) < 0)
2700 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2701 }
2702
2703 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
2704 if (!sd_id128_is_null(u->invocation_id)) {
2705 key_serial_t key;
2706
2707 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2708 if (key == -1)
8002fb97 2709 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
2710 else {
2711 if (keyctl(KEYCTL_SETPERM, key,
2712 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2713 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 2714 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
2715 }
2716 }
2717
e64c2d0b
DJL
2718out:
2719 /* Revert back uid & gid for the the last time, and exit */
2720 /* no extra logging, as only the first already reported error matters */
2721 if (getuid() != saved_uid)
2722 (void) setreuid(saved_uid, -1);
b1edf445 2723
e64c2d0b
DJL
2724 if (getgid() != saved_gid)
2725 (void) setregid(saved_gid, -1);
b1edf445 2726
e64c2d0b 2727 return r;
74dd6b51
LP
2728}
2729
/* Appends those of the two file descriptors in pair[] that are valid (i.e. >= 0) to array[], starting
 * at index *n, and advances *n by the number of entries added. The caller has to make sure the array
 * has room for up to two more entries. */
static void append_socket_pair(int *array, size_t *n, const int pair[static 2]) {
        assert(array);
        assert(n);
        assert(pair);

        if (pair[0] >= 0)
                array[(*n)++] = pair[0];
        if (pair[1] >= 0)
                array[(*n)++] = pair[1];
}
2740
a34ceba6
LP
2741static int close_remaining_fds(
2742 const ExecParameters *params,
34cf6c43
YW
2743 const ExecRuntime *runtime,
2744 const DynamicCreds *dcreds,
00d9ef85 2745 int user_lookup_fd,
a34ceba6 2746 int socket_fd,
5686391b 2747 int exec_fd,
da6053d0 2748 int *fds, size_t n_fds) {
a34ceba6 2749
da6053d0 2750 size_t n_dont_close = 0;
00d9ef85 2751 int dont_close[n_fds + 12];
a34ceba6
LP
2752
2753 assert(params);
2754
2755 if (params->stdin_fd >= 0)
2756 dont_close[n_dont_close++] = params->stdin_fd;
2757 if (params->stdout_fd >= 0)
2758 dont_close[n_dont_close++] = params->stdout_fd;
2759 if (params->stderr_fd >= 0)
2760 dont_close[n_dont_close++] = params->stderr_fd;
2761
2762 if (socket_fd >= 0)
2763 dont_close[n_dont_close++] = socket_fd;
5686391b
LP
2764 if (exec_fd >= 0)
2765 dont_close[n_dont_close++] = exec_fd;
a34ceba6
LP
2766 if (n_fds > 0) {
2767 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2768 n_dont_close += n_fds;
2769 }
2770
29206d46
LP
2771 if (runtime)
2772 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2773
2774 if (dcreds) {
2775 if (dcreds->user)
2776 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2777 if (dcreds->group)
2778 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
2779 }
2780
00d9ef85
LP
2781 if (user_lookup_fd >= 0)
2782 dont_close[n_dont_close++] = user_lookup_fd;
2783
a34ceba6
LP
2784 return close_all_fds(dont_close, n_dont_close);
2785}
2786
00d9ef85
LP
2787static int send_user_lookup(
2788 Unit *unit,
2789 int user_lookup_fd,
2790 uid_t uid,
2791 gid_t gid) {
2792
2793 assert(unit);
2794
2795 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2796 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2797 * specified. */
2798
2799 if (user_lookup_fd < 0)
2800 return 0;
2801
2802 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2803 return 0;
2804
2805 if (writev(user_lookup_fd,
2806 (struct iovec[]) {
e6a7ec4b
LP
2807 IOVEC_INIT(&uid, sizeof(uid)),
2808 IOVEC_INIT(&gid, sizeof(gid)),
2809 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
2810 return -errno;
2811
2812 return 0;
2813}
2814
6732edab
LP
2815static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2816 int r;
2817
2818 assert(c);
2819 assert(home);
2820 assert(buf);
2821
2822 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2823
2824 if (*home)
2825 return 0;
2826
2827 if (!c->working_directory_home)
2828 return 0;
2829
6732edab
LP
2830 r = get_home_dir(buf);
2831 if (r < 0)
2832 return r;
2833
2834 *home = *buf;
2835 return 1;
2836}
2837
da50b85a
LP
2838static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2839 _cleanup_strv_free_ char ** list = NULL;
2840 ExecDirectoryType t;
2841 int r;
2842
2843 assert(c);
2844 assert(p);
2845 assert(ret);
2846
2847 assert(c->dynamic_user);
2848
2849 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2850 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2851 * directories. */
2852
2853 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2854 char **i;
2855
2856 if (t == EXEC_DIRECTORY_CONFIGURATION)
2857 continue;
2858
2859 if (!p->prefix[t])
2860 continue;
2861
2862 STRV_FOREACH(i, c->directories[t].paths) {
2863 char *e;
2864
494d0247 2865 if (exec_directory_is_private(c, t))
657ee2d8 2866 e = path_join(p->prefix[t], "private", *i);
494d0247
YW
2867 else
2868 e = path_join(p->prefix[t], *i);
da50b85a
LP
2869 if (!e)
2870 return -ENOMEM;
2871
2872 r = strv_consume(&list, e);
2873 if (r < 0)
2874 return r;
2875 }
2876 }
2877
ae2a15bc 2878 *ret = TAKE_PTR(list);
da50b85a
LP
2879
2880 return 0;
2881}
2882
34cf6c43
YW
2883static char *exec_command_line(char **argv);
2884
78f93209
LP
2885static int exec_parameters_get_cgroup_path(const ExecParameters *params, char **ret) {
2886 bool using_subcgroup;
2887 char *p;
2888
2889 assert(params);
2890 assert(ret);
2891
2892 if (!params->cgroup_path)
2893 return -EINVAL;
2894
2895 /* If we are called for a unit where cgroup delegation is on, and the payload created its own populated
2896 * subcgroup (which we expect it to do, after all it asked for delegation), then we cannot place the control
2897 * processes started after the main unit's process in the unit's main cgroup because it is now an inner one,
2898 * and inner cgroups may not contain processes. Hence, if delegation is on, and this is a control process,
2899 * let's use ".control" as subcgroup instead. Note that we do so only for ExecStartPost=, ExecReload=,
2900 * ExecStop=, ExecStopPost=, i.e. for the commands where the main process is already forked. For ExecStartPre=
2901 * this is not necessary, the cgroup is still empty. We distinguish these cases with the EXEC_CONTROL_CGROUP
2902 * flag, which is only passed for the former statements, not for the latter. */
2903
2904 using_subcgroup = FLAGS_SET(params->flags, EXEC_CONTROL_CGROUP|EXEC_CGROUP_DELEGATE|EXEC_IS_CONTROL);
2905 if (using_subcgroup)
657ee2d8 2906 p = path_join(params->cgroup_path, ".control");
78f93209
LP
2907 else
2908 p = strdup(params->cgroup_path);
2909 if (!p)
2910 return -ENOMEM;
2911
2912 *ret = p;
2913 return using_subcgroup;
2914}
2915
ff0af2a1 2916static int exec_child(
f2341e0a 2917 Unit *unit,
34cf6c43 2918 const ExecCommand *command,
ff0af2a1
LP
2919 const ExecContext *context,
2920 const ExecParameters *params,
2921 ExecRuntime *runtime,
29206d46 2922 DynamicCreds *dcreds,
ff0af2a1 2923 int socket_fd,
2caa38e9 2924 const int named_iofds[static 3],
4c47affc 2925 int *fds,
da6053d0 2926 size_t n_socket_fds,
25b583d7 2927 size_t n_storage_fds,
ff0af2a1 2928 char **files_env,
00d9ef85 2929 int user_lookup_fd,
12145637 2930 int *exit_status) {
d35fbf6b 2931
7ca69792 2932 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **replaced_argv = NULL;
5686391b 2933 int *fds_with_exec_fd, n_fds_with_exec_fd, r, ngids = 0, exec_fd = -1;
4d885bd3
DH
2934 _cleanup_free_ gid_t *supplementary_gids = NULL;
2935 const char *username = NULL, *groupname = NULL;
5686391b 2936 _cleanup_free_ char *home_buffer = NULL;
2b3c1b9e 2937 const char *home = NULL, *shell = NULL;
7ca69792 2938 char **final_argv = NULL;
7bce046b
LP
2939 dev_t journal_stream_dev = 0;
2940 ino_t journal_stream_ino = 0;
165a31c0
LP
2941 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2942 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2943 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2944 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 2945#if HAVE_SELINUX
7f59dd35 2946 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 2947 bool use_selinux = false;
ecfbc84f 2948#endif
f9fa32f0 2949#if ENABLE_SMACK
43b1f709 2950 bool use_smack = false;
ecfbc84f 2951#endif
349cc4a5 2952#if HAVE_APPARMOR
43b1f709 2953 bool use_apparmor = false;
ecfbc84f 2954#endif
fed1e721
LP
2955 uid_t uid = UID_INVALID;
2956 gid_t gid = GID_INVALID;
da6053d0 2957 size_t n_fds;
3536f49e 2958 ExecDirectoryType dt;
165a31c0 2959 int secure_bits;
034c6ed7 2960
f2341e0a 2961 assert(unit);
5cb5a6ff
LP
2962 assert(command);
2963 assert(context);
d35fbf6b 2964 assert(params);
ff0af2a1 2965 assert(exit_status);
d35fbf6b
DM
2966
2967 rename_process_from_path(command->path);
2968
2969 /* We reset exactly these signals, since they are the
2970 * only ones we set to SIG_IGN in the main daemon. All
2971 * others we leave untouched because we set them to
2972 * SIG_DFL or a valid handler initially, both of which
2973 * will be demoted to SIG_DFL. */
ce30c8dc
LP
2974 (void) default_signals(SIGNALS_CRASH_HANDLER,
2975 SIGNALS_IGNORE, -1);
d35fbf6b
DM
2976
2977 if (context->ignore_sigpipe)
ce30c8dc 2978 (void) ignore_signals(SIGPIPE, -1);
d35fbf6b 2979
ff0af2a1
LP
2980 r = reset_signal_mask();
2981 if (r < 0) {
2982 *exit_status = EXIT_SIGNAL_MASK;
12145637 2983 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 2984 }
034c6ed7 2985
d35fbf6b
DM
2986 if (params->idle_pipe)
2987 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 2988
2c027c62
LP
2989 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2990 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2991 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2992 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 2993
d35fbf6b 2994 log_forget_fds();
2c027c62 2995 log_set_open_when_needed(true);
4f2d528d 2996
40a80078
LP
2997 /* In case anything used libc syslog(), close this here, too */
2998 closelog();
2999
5686391b
LP
3000 n_fds = n_socket_fds + n_storage_fds;
3001 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, params->exec_fd, fds, n_fds);
ff0af2a1
LP
3002 if (r < 0) {
3003 *exit_status = EXIT_FDS;
12145637 3004 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
3005 }
3006
d35fbf6b
DM
3007 if (!context->same_pgrp)
3008 if (setsid() < 0) {
ff0af2a1 3009 *exit_status = EXIT_SETSID;
12145637 3010 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
d35fbf6b 3011 }
9e2f7c11 3012
1e22b5cd 3013 exec_context_tty_reset(context, params);
d35fbf6b 3014
c891efaf 3015 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 3016 const char *vc = params->confirm_spawn;
3b20f877
FB
3017 _cleanup_free_ char *cmdline = NULL;
3018
ee39ca20 3019 cmdline = exec_command_line(command->argv);
3b20f877 3020 if (!cmdline) {
0460aa5c 3021 *exit_status = EXIT_MEMORY;
12145637 3022 return log_oom();
3b20f877 3023 }
d35fbf6b 3024
eedf223a 3025 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
3026 if (r != CONFIRM_EXECUTE) {
3027 if (r == CONFIRM_PRETEND_SUCCESS) {
3028 *exit_status = EXIT_SUCCESS;
3029 return 0;
3030 }
ff0af2a1 3031 *exit_status = EXIT_CONFIRM;
12145637 3032 log_unit_error(unit, "Execution cancelled by the user");
d35fbf6b 3033 return -ECANCELED;
d35fbf6b
DM
3034 }
3035 }
1a63a750 3036
d521916d
LP
3037 /* We are about to invoke NSS and PAM modules. Let's tell them what we are doing here, maybe they care. This is
3038 * used by nss-resolve to disable itself when we are about to start systemd-resolved, to avoid deadlocks. Note
3039 * that these env vars do not survive the execve(), which means they really only apply to the PAM and NSS
3040 * invocations themselves. Also note that while we'll only invoke NSS modules involved in user management they
3041 * might internally call into other NSS modules that are involved in hostname resolution, we never know. */
3042 if (setenv("SYSTEMD_ACTIVATION_UNIT", unit->id, true) != 0 ||
3043 setenv("SYSTEMD_ACTIVATION_SCOPE", MANAGER_IS_SYSTEM(unit->manager) ? "system" : "user", true) != 0) {
3044 *exit_status = EXIT_MEMORY;
3045 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
3046 }
3047
29206d46 3048 if (context->dynamic_user && dcreds) {
da50b85a 3049 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 3050
d521916d
LP
3051 /* On top of that, make sure we bypass our own NSS module nss-systemd comprehensively for any NSS
3052 * checks, if DynamicUser=1 is used, as we shouldn't create a feedback loop with ourselves here.*/
409093fe
LP
3053 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
3054 *exit_status = EXIT_USER;
12145637 3055 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
3056 }
3057
da50b85a
LP
3058 r = compile_suggested_paths(context, params, &suggested_paths);
3059 if (r < 0) {
3060 *exit_status = EXIT_MEMORY;
3061 return log_oom();
3062 }
3063
3064 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
3065 if (r < 0) {
3066 *exit_status = EXIT_USER;
e2b0cc34
YW
3067 if (r == -EILSEQ) {
3068 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
3069 return -EOPNOTSUPP;
3070 }
12145637 3071 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 3072 }
524daa8c 3073
70dd455c 3074 if (!uid_is_valid(uid)) {
29206d46 3075 *exit_status = EXIT_USER;
12145637 3076 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
3077 return -ESRCH;
3078 }
3079
3080 if (!gid_is_valid(gid)) {
3081 *exit_status = EXIT_USER;
12145637 3082 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
29206d46
LP
3083 return -ESRCH;
3084 }
5bc7452b 3085
29206d46
LP
3086 if (dcreds->user)
3087 username = dcreds->user->name;
3088
3089 } else {
4d885bd3
DH
3090 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
3091 if (r < 0) {
3092 *exit_status = EXIT_USER;
12145637 3093 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 3094 }
5bc7452b 3095
4d885bd3
DH
3096 r = get_fixed_group(context, &groupname, &gid);
3097 if (r < 0) {
3098 *exit_status = EXIT_GROUP;
12145637 3099 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 3100 }
cdc5d5c5 3101 }
29206d46 3102
cdc5d5c5
DH
3103 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
3104 r = get_supplementary_groups(context, username, groupname, gid,
3105 &supplementary_gids, &ngids);
3106 if (r < 0) {
3107 *exit_status = EXIT_GROUP;
12145637 3108 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 3109 }
5bc7452b 3110
00d9ef85
LP
3111 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
3112 if (r < 0) {
3113 *exit_status = EXIT_USER;
12145637 3114 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
3115 }
3116
3117 user_lookup_fd = safe_close(user_lookup_fd);
3118
6732edab
LP
3119 r = acquire_home(context, uid, &home, &home_buffer);
3120 if (r < 0) {
3121 *exit_status = EXIT_CHDIR;
12145637 3122 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
3123 }
3124
d35fbf6b
DM
3125 /* If a socket is connected to STDIN/STDOUT/STDERR, we
3126 * must be sure to drop O_NONBLOCK */
3127 if (socket_fd >= 0)
a34ceba6 3128 (void) fd_nonblock(socket_fd, false);
acbb0225 3129
4c70a4a7
MS
3130 /* Journald will try to look-up our cgroup in order to populate _SYSTEMD_CGROUP and _SYSTEMD_UNIT fields.
3131 * Hence we need to migrate to the target cgroup from init.scope before connecting to journald */
3132 if (params->cgroup_path) {
3133 _cleanup_free_ char *p = NULL;
3134
3135 r = exec_parameters_get_cgroup_path(params, &p);
3136 if (r < 0) {
3137 *exit_status = EXIT_CGROUP;
3138 return log_unit_error_errno(unit, r, "Failed to acquire cgroup path: %m");
3139 }
3140
3141 r = cg_attach_everywhere(params->cgroup_supported, p, 0, NULL, NULL);
3142 if (r < 0) {
3143 *exit_status = EXIT_CGROUP;
3144 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", p);
3145 }
3146 }
3147
a8d08f39
LP
3148 if (context->network_namespace_path && runtime && runtime->netns_storage_socket[0] >= 0) {
3149 r = open_netns_path(runtime->netns_storage_socket, context->network_namespace_path);
3150 if (r < 0) {
3151 *exit_status = EXIT_NETWORK;
3152 return log_unit_error_errno(unit, r, "Failed to open network namespace path %s: %m", context->network_namespace_path);
3153 }
3154 }
3155
52c239d7 3156 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
3157 if (r < 0) {
3158 *exit_status = EXIT_STDIN;
12145637 3159 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 3160 }
034c6ed7 3161
52c239d7 3162 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3163 if (r < 0) {
3164 *exit_status = EXIT_STDOUT;
12145637 3165 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
3166 }
3167
52c239d7 3168 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
3169 if (r < 0) {
3170 *exit_status = EXIT_STDERR;
12145637 3171 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
3172 }
3173
d35fbf6b 3174 if (context->oom_score_adjust_set) {
9f8168eb
LP
3175 /* When we can't make this change due to EPERM, then let's silently skip over it. User namespaces
3176 * prohibit write access to this file, and we shouldn't trip up over that. */
3177 r = set_oom_score_adjust(context->oom_score_adjust);
12145637 3178 if (IN_SET(r, -EPERM, -EACCES))
f2341e0a 3179 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 3180 else if (r < 0) {
ff0af2a1 3181 *exit_status = EXIT_OOM_ADJUST;
12145637 3182 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 3183 }
d35fbf6b
DM
3184 }
3185
3186 if (context->nice_set)
3187 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
ff0af2a1 3188 *exit_status = EXIT_NICE;
12145637 3189 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
613b411c
LP
3190 }
3191
d35fbf6b
DM
3192 if (context->cpu_sched_set) {
3193 struct sched_param param = {
3194 .sched_priority = context->cpu_sched_priority,
3195 };
3196
ff0af2a1
LP
3197 r = sched_setscheduler(0,
3198 context->cpu_sched_policy |
3199 (context->cpu_sched_reset_on_fork ?
3200 SCHED_RESET_ON_FORK : 0),
3201 &param);
3202 if (r < 0) {
3203 *exit_status = EXIT_SETSCHEDULER;
12145637 3204 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 3205 }
d35fbf6b 3206 }
fc9b2a84 3207
0985c7c4
ZJS
3208 if (context->cpu_set.set)
3209 if (sched_setaffinity(0, context->cpu_set.allocated, context->cpu_set.set) < 0) {
ff0af2a1 3210 *exit_status = EXIT_CPUAFFINITY;
12145637 3211 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7
LP
3212 }
3213
b070c7c0
MS
3214 if (mpol_is_valid(numa_policy_get_type(&context->numa_policy))) {
3215 r = apply_numa_policy(&context->numa_policy);
3216 if (r == -EOPNOTSUPP)
33fe9e3f 3217 log_unit_debug_errno(unit, r, "NUMA support not available, ignoring.");
b070c7c0
MS
3218 else if (r < 0) {
3219 *exit_status = EXIT_NUMA_POLICY;
3220 return log_unit_error_errno(unit, r, "Failed to set NUMA memory policy: %m");
3221 }
3222 }
3223
d35fbf6b
DM
3224 if (context->ioprio_set)
3225 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 3226 *exit_status = EXIT_IOPRIO;
12145637 3227 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 3228 }
da726a4d 3229
d35fbf6b
DM
3230 if (context->timer_slack_nsec != NSEC_INFINITY)
3231 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 3232 *exit_status = EXIT_TIMERSLACK;
12145637 3233 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 3234 }
9eba9da4 3235
21022b9d
LP
3236 if (context->personality != PERSONALITY_INVALID) {
3237 r = safe_personality(context->personality);
3238 if (r < 0) {
ff0af2a1 3239 *exit_status = EXIT_PERSONALITY;
12145637 3240 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 3241 }
21022b9d 3242 }
94f04347 3243
d35fbf6b 3244 if (context->utmp_id)
df0ff127 3245 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
6a93917d 3246 context->tty_path,
023a4f67
LP
3247 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
3248 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
3249 USER_PROCESS,
6a93917d 3250 username);
d35fbf6b 3251
08f67696 3252 if (uid_is_valid(uid)) {
ff0af2a1
LP
3253 r = chown_terminal(STDIN_FILENO, uid);
3254 if (r < 0) {
3255 *exit_status = EXIT_STDIN;
12145637 3256 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 3257 }
d35fbf6b 3258 }
8e274523 3259
4e1dfa45 3260 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroup v1
62b9bb26 3261 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
4e1dfa45 3262 * safe. On cgroup v2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
62b9bb26 3263 * touch a single hierarchy too. */
584b8688 3264 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 3265 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
3266 if (r < 0) {
3267 *exit_status = EXIT_CGROUP;
12145637 3268 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 3269 }
d35fbf6b 3270 }
034c6ed7 3271
72fd1768 3272 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
8679efde 3273 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
12145637
LP
3274 if (r < 0)
3275 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 3276 }
94f04347 3277
7bce046b 3278 r = build_environment(
fd63e712 3279 unit,
7bce046b
LP
3280 context,
3281 params,
3282 n_fds,
3283 home,
3284 username,
3285 shell,
3286 journal_stream_dev,
3287 journal_stream_ino,
3288 &our_env);
2065ca69
JW
3289 if (r < 0) {
3290 *exit_status = EXIT_MEMORY;
12145637 3291 return log_oom();
2065ca69
JW
3292 }
3293
3294 r = build_pass_environment(context, &pass_env);
3295 if (r < 0) {
3296 *exit_status = EXIT_MEMORY;
12145637 3297 return log_oom();
2065ca69
JW
3298 }
3299
3300 accum_env = strv_env_merge(5,
3301 params->environment,
3302 our_env,
3303 pass_env,
3304 context->environment,
3305 files_env,
3306 NULL);
3307 if (!accum_env) {
3308 *exit_status = EXIT_MEMORY;
12145637 3309 return log_oom();
2065ca69 3310 }
1280503b 3311 accum_env = strv_env_clean(accum_env);
2065ca69 3312
096424d1 3313 (void) umask(context->umask);
b213e1c1 3314
b1edf445 3315 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
3316 if (r < 0) {
3317 *exit_status = EXIT_KEYRING;
12145637 3318 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
3319 }
3320
165a31c0 3321 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 3322 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 3323
165a31c0
LP
3324 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3325 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 3326
165a31c0
LP
3327 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3328 if (needs_ambient_hack)
3329 needs_setuid = false;
3330 else
3331 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3332
3333 if (needs_sandboxing) {
7f18ef0a
FK
3334 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3335 * present. The actual MAC context application will happen later, as late as possible, to avoid
3336 * impacting our own code paths. */
3337
349cc4a5 3338#if HAVE_SELINUX
43b1f709 3339 use_selinux = mac_selinux_use();
7f18ef0a 3340#endif
f9fa32f0 3341#if ENABLE_SMACK
43b1f709 3342 use_smack = mac_smack_use();
7f18ef0a 3343#endif
349cc4a5 3344#if HAVE_APPARMOR
43b1f709 3345 use_apparmor = mac_apparmor_use();
7f18ef0a 3346#endif
165a31c0 3347 }
7f18ef0a 3348
ce932d2d
LP
3349 if (needs_sandboxing) {
3350 int which_failed;
3351
3352 /* Let's set the resource limits before we call into PAM, so that pam_limits wins over what
3353 * is set here. (See below.) */
3354
3355 r = setrlimit_closest_all((const struct rlimit* const *) context->rlimit, &which_failed);
3356 if (r < 0) {
3357 *exit_status = EXIT_LIMITS;
3358 return log_unit_error_errno(unit, r, "Failed to adjust resource limit RLIMIT_%s: %m", rlimit_to_string(which_failed));
3359 }
3360 }
3361
165a31c0 3362 if (needs_setuid) {
ce932d2d
LP
3363
3364 /* Let's call into PAM after we set up our own idea of resource limits so that pam_limits
3365 * wins here. (See above.) */
3366
165a31c0
LP
3367 if (context->pam_name && username) {
3368 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3369 if (r < 0) {
3370 *exit_status = EXIT_PAM;
12145637 3371 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0
LP
3372 }
3373 }
b213e1c1 3374 }
ac45f971 3375
a8d08f39
LP
3376 if ((context->private_network || context->network_namespace_path) && runtime && runtime->netns_storage_socket[0] >= 0) {
3377
6e2d7c4f
MS
3378 if (ns_type_supported(NAMESPACE_NET)) {
3379 r = setup_netns(runtime->netns_storage_socket);
3380 if (r < 0) {
3381 *exit_status = EXIT_NETWORK;
3382 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3383 }
a8d08f39
LP
3384 } else if (context->network_namespace_path) {
3385 *exit_status = EXIT_NETWORK;
3386 return log_unit_error_errno(unit, SYNTHETIC_ERRNO(EOPNOTSUPP), "NetworkNamespacePath= is not supported, refusing.");
6e2d7c4f
MS
3387 } else
3388 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 3389 }
169c1bda 3390
ee818b89 3391 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
ee818b89 3392 if (needs_mount_namespace) {
7cc5ef5f
ZJS
3393 _cleanup_free_ char *error_path = NULL;
3394
3395 r = apply_mount_namespace(unit, command, context, params, runtime, &error_path);
3fbe8dbe
LP
3396 if (r < 0) {
3397 *exit_status = EXIT_NAMESPACE;
7cc5ef5f
ZJS
3398 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing%s%s: %m",
3399 error_path ? ": " : "", strempty(error_path));
3fbe8dbe 3400 }
d35fbf6b 3401 }
81a2b7ce 3402
aecd5ac6
TM
3403 if (context->protect_hostname) {
3404 if (ns_type_supported(NAMESPACE_UTS)) {
3405 if (unshare(CLONE_NEWUTS) < 0) {
3406 *exit_status = EXIT_NAMESPACE;
3407 return log_unit_error_errno(unit, errno, "Failed to set up UTS namespacing: %m");
3408 }
3409 } else
3410 log_unit_warning(unit, "ProtectHostname=yes is configured, but the kernel does not support UTS namespaces, ignoring namespace setup.");
3411#if HAVE_SECCOMP
3412 r = seccomp_protect_hostname();
3413 if (r < 0) {
3414 *exit_status = EXIT_SECCOMP;
3415 return log_unit_error_errno(unit, r, "Failed to apply hostname restrictions: %m");
3416 }
3417#endif
3418 }
3419
bbeea271 3420 /* Drop groups as early as possible */
165a31c0 3421 if (needs_setuid) {
709dbeac 3422 r = enforce_groups(gid, supplementary_gids, ngids);
096424d1
LP
3423 if (r < 0) {
3424 *exit_status = EXIT_GROUP;
12145637 3425 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 3426 }
165a31c0 3427 }
096424d1 3428
165a31c0 3429 if (needs_sandboxing) {
349cc4a5 3430#if HAVE_SELINUX
43b1f709 3431 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
937ccce9
LP
3432 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3433 if (r < 0) {
3434 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 3435 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
937ccce9 3436 }
9008e1ac 3437 }
9008e1ac
MS
3438#endif
3439
937ccce9
LP
3440 if (context->private_users) {
3441 r = setup_private_users(uid, gid);
3442 if (r < 0) {
3443 *exit_status = EXIT_USER;
12145637 3444 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
937ccce9 3445 }
d251207d
LP
3446 }
3447 }
3448
165a31c0 3449 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
5686391b
LP
3450 * more aggressive this time since socket_fd and the netns fds we don't need anymore. We do keep the exec_fd
3451 * however if we have it as we want to keep it open until the final execve(). */
3452
3453 if (params->exec_fd >= 0) {
3454 exec_fd = params->exec_fd;
3455
3456 if (exec_fd < 3 + (int) n_fds) {
3457 int moved_fd;
3458
3459 /* Let's move the exec fd far up, so that it's outside of the fd range we want to pass to the
3460 * process we are about to execute. */
3461
3462 moved_fd = fcntl(exec_fd, F_DUPFD_CLOEXEC, 3 + (int) n_fds);
3463 if (moved_fd < 0) {
3464 *exit_status = EXIT_FDS;
3465 return log_unit_error_errno(unit, errno, "Couldn't move exec fd up: %m");
3466 }
3467
3468 safe_close(exec_fd);
3469 exec_fd = moved_fd;
3470 } else {
3471 /* This fd should be FD_CLOEXEC already, but let's make sure. */
3472 r = fd_cloexec(exec_fd, true);
3473 if (r < 0) {
3474 *exit_status = EXIT_FDS;
3475 return log_unit_error_errno(unit, r, "Failed to make exec fd FD_CLOEXEC: %m");
3476 }
3477 }
3478
3479 fds_with_exec_fd = newa(int, n_fds + 1);
7e8d494b 3480 memcpy_safe(fds_with_exec_fd, fds, n_fds * sizeof(int));
5686391b
LP
3481 fds_with_exec_fd[n_fds] = exec_fd;
3482 n_fds_with_exec_fd = n_fds + 1;
3483 } else {
3484 fds_with_exec_fd = fds;
3485 n_fds_with_exec_fd = n_fds;
3486 }
3487
3488 r = close_all_fds(fds_with_exec_fd, n_fds_with_exec_fd);
ff0af2a1
LP
3489 if (r >= 0)
3490 r = shift_fds(fds, n_fds);
3491 if (r >= 0)
25b583d7 3492 r = flags_fds(fds, n_socket_fds, n_storage_fds, context->non_blocking);
ff0af2a1
LP
3493 if (r < 0) {
3494 *exit_status = EXIT_FDS;
12145637 3495 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 3496 }
e66cf1a3 3497
5686391b
LP
3498 /* At this point, the fds we want to pass to the program are all ready and set up, with O_CLOEXEC turned off
3499 * and at the right fd numbers. There are no other fds open, with one exception: the exec_fd if it is defined,
3500 * and it has O_CLOEXEC set, after all we want it to be closed by the execve(), so that our parent knows we
3501 * came this far. */
3502
165a31c0 3503 secure_bits = context->secure_bits;
e66cf1a3 3504
165a31c0
LP
3505 if (needs_sandboxing) {
3506 uint64_t bset;
e66cf1a3 3507
ce932d2d
LP
3508 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly
3509 * requested. (Note this is placed after the general resource limit initialization, see
3510 * above, in order to take precedence.) */
f4170c67
LP
3511 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3512 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3513 *exit_status = EXIT_LIMITS;
12145637 3514 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
3515 }
3516 }
3517
37ac2744
JB
3518#if ENABLE_SMACK
3519 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3520 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3521 if (use_smack) {
3522 r = setup_smack(context, command);
3523 if (r < 0) {
3524 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3525 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3526 }
3527 }
3528#endif
3529
165a31c0
LP
3530 bset = context->capability_bounding_set;
3531 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3532 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3533 * instead of us doing that */
3534 if (needs_ambient_hack)
3535 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3536 (UINT64_C(1) << CAP_SETUID) |
3537 (UINT64_C(1) << CAP_SETGID);
3538
3539 if (!cap_test_all(bset)) {
3540 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
3541 if (r < 0) {
3542 *exit_status = EXIT_CAPABILITIES;
12145637 3543 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 3544 }
4c2630eb 3545 }
3b8bddde 3546
755d4b67
IP
3547 /* This is done before enforce_user, but ambient set
3548 * does not survive over setresuid() if keep_caps is not set. */
165a31c0
LP
3549 if (!needs_ambient_hack &&
3550 context->capability_ambient_set != 0) {
755d4b67
IP
3551 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3552 if (r < 0) {
3553 *exit_status = EXIT_CAPABILITIES;
12145637 3554 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 3555 }
755d4b67 3556 }
165a31c0 3557 }
755d4b67 3558
fa97f630
JB
3559 /* chroot to root directory first, before we lose the ability to chroot */
3560 r = apply_root_directory(context, params, needs_mount_namespace, exit_status);
3561 if (r < 0)
3562 return log_unit_error_errno(unit, r, "Chrooting to the requested root directory failed: %m");
3563
165a31c0 3564 if (needs_setuid) {
08f67696 3565 if (uid_is_valid(uid)) {
ff0af2a1
LP
3566 r = enforce_user(context, uid);
3567 if (r < 0) {
3568 *exit_status = EXIT_USER;
12145637 3569 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 3570 }
165a31c0
LP
3571
3572 if (!needs_ambient_hack &&
3573 context->capability_ambient_set != 0) {
755d4b67
IP
3574
3575 /* Fix the ambient capabilities after user change. */
3576 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3577 if (r < 0) {
3578 *exit_status = EXIT_CAPABILITIES;
12145637 3579 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67
IP
3580 }
3581
3582 /* If we were asked to change user and ambient capabilities
3583 * were requested, we had to add keep-caps to the securebits
3584 * so that we would maintain the inherited capability set
3585 * through the setresuid(). Make sure that the bit is added
3586 * also to the context secure_bits so that we don't try to
3587 * drop the bit away next. */
3588
7f508f2c 3589 secure_bits |= 1<<SECURE_KEEP_CAPS;
755d4b67 3590 }
5b6319dc 3591 }
165a31c0 3592 }
d35fbf6b 3593
56ef8db9
JB
3594 /* Apply working directory here, because the working directory might be on NFS and only the user running
3595 * this service might have the correct privilege to change to the working directory */
fa97f630 3596 r = apply_working_directory(context, params, home, exit_status);
56ef8db9
JB
3597 if (r < 0)
3598 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
3599
165a31c0 3600 if (needs_sandboxing) {
37ac2744 3601 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
3602 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3603 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3604 * are restricted. */
3605
349cc4a5 3606#if HAVE_SELINUX
43b1f709 3607 if (use_selinux) {
5cd9cd35
LP
3608 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3609
3610 if (exec_context) {
3611 r = setexeccon(exec_context);
3612 if (r < 0) {
3613 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 3614 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
3615 }
3616 }
3617 }
3618#endif
3619
349cc4a5 3620#if HAVE_APPARMOR
43b1f709 3621 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
3622 r = aa_change_onexec(context->apparmor_profile);
3623 if (r < 0 && !context->apparmor_profile_ignore) {
3624 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 3625 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
3626 }
3627 }
3628#endif
3629
165a31c0
LP
3630 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3631 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
755d4b67
IP
3632 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3633 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 3634 *exit_status = EXIT_SECUREBITS;
12145637 3635 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 3636 }
5b6319dc 3637
59eeb84b 3638 if (context_has_no_new_privileges(context))
d35fbf6b 3639 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 3640 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 3641 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
3642 }
3643
349cc4a5 3644#if HAVE_SECCOMP
469830d1
LP
3645 r = apply_address_families(unit, context);
3646 if (r < 0) {
3647 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 3648 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 3649 }
04aa0cb9 3650
469830d1
LP
3651 r = apply_memory_deny_write_execute(unit, context);
3652 if (r < 0) {
3653 *exit_status = EXIT_SECCOMP;
12145637 3654 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 3655 }
f4170c67 3656
469830d1
LP
3657 r = apply_restrict_realtime(unit, context);
3658 if (r < 0) {
3659 *exit_status = EXIT_SECCOMP;
12145637 3660 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
3661 }
3662
f69567cb
LP
3663 r = apply_restrict_suid_sgid(unit, context);
3664 if (r < 0) {
3665 *exit_status = EXIT_SECCOMP;
3666 return log_unit_error_errno(unit, r, "Failed to apply SUID/SGID restrictions: %m");
3667 }
3668
add00535
LP
3669 r = apply_restrict_namespaces(unit, context);
3670 if (r < 0) {
3671 *exit_status = EXIT_SECCOMP;
12145637 3672 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
3673 }
3674
469830d1
LP
3675 r = apply_protect_sysctl(unit, context);
3676 if (r < 0) {
3677 *exit_status = EXIT_SECCOMP;
12145637 3678 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
3679 }
3680
469830d1
LP
3681 r = apply_protect_kernel_modules(unit, context);
3682 if (r < 0) {
3683 *exit_status = EXIT_SECCOMP;
12145637 3684 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
3685 }
3686
469830d1
LP
3687 r = apply_private_devices(unit, context);
3688 if (r < 0) {
3689 *exit_status = EXIT_SECCOMP;
12145637 3690 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
3691 }
3692
3693 r = apply_syscall_archs(unit, context);
3694 if (r < 0) {
3695 *exit_status = EXIT_SECCOMP;
12145637 3696 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
3697 }
3698
78e864e5
TM
3699 r = apply_lock_personality(unit, context);
3700 if (r < 0) {
3701 *exit_status = EXIT_SECCOMP;
12145637 3702 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
3703 }
3704
5cd9cd35
LP
3705 /* This really should remain the last step before the execve(), to make sure our own code is affected
3706 * by the filter as little as possible. */
165a31c0 3707 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
3708 if (r < 0) {
3709 *exit_status = EXIT_SECCOMP;
12145637 3710 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
3711 }
3712#endif
d35fbf6b 3713 }
034c6ed7 3714
00819cc1
LP
3715 if (!strv_isempty(context->unset_environment)) {
3716 char **ee = NULL;
3717
3718 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3719 if (!ee) {
3720 *exit_status = EXIT_MEMORY;
12145637 3721 return log_oom();
00819cc1
LP
3722 }
3723
130d3d22 3724 strv_free_and_replace(accum_env, ee);
00819cc1
LP
3725 }
3726
7ca69792
AZ
3727 if (!FLAGS_SET(command->flags, EXEC_COMMAND_NO_ENV_EXPAND)) {
3728 replaced_argv = replace_env_argv(command->argv, accum_env);
3729 if (!replaced_argv) {
3730 *exit_status = EXIT_MEMORY;
3731 return log_oom();
3732 }
3733 final_argv = replaced_argv;
3734 } else
3735 final_argv = command->argv;
034c6ed7 3736
f1d34068 3737 if (DEBUG_LOGGING) {
d35fbf6b 3738 _cleanup_free_ char *line;
81a2b7ce 3739
d35fbf6b 3740 line = exec_command_line(final_argv);
a1230ff9 3741 if (line)
f2341e0a 3742 log_struct(LOG_DEBUG,
f2341e0a
LP
3743 "EXECUTABLE=%s", command->path,
3744 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
ba360bb0 3745 LOG_UNIT_ID(unit),
a1230ff9 3746 LOG_UNIT_INVOCATION_ID(unit));
d35fbf6b 3747 }
dd305ec9 3748
5686391b
LP
3749 if (exec_fd >= 0) {
3750 uint8_t hot = 1;
3751
3752 /* We have finished with all our initializations. Let's now let the manager know that. From this point
3753 * on, if the manager sees POLLHUP on the exec_fd, then execve() was successful. */
3754
3755 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3756 *exit_status = EXIT_EXEC;
3757 return log_unit_error_errno(unit, errno, "Failed to enable exec_fd: %m");
3758 }
3759 }
3760
2065ca69 3761 execve(command->path, final_argv, accum_env);
5686391b
LP
3762 r = -errno;
3763
3764 if (exec_fd >= 0) {
3765 uint8_t hot = 0;
3766
3767 /* The execve() failed. This means the exec_fd is still open. Which means we need to tell the manager
3768 * that POLLHUP on it no longer means execve() succeeded. */
3769
3770 if (write(exec_fd, &hot, sizeof(hot)) < 0) {
3771 *exit_status = EXIT_EXEC;
3772 return log_unit_error_errno(unit, errno, "Failed to disable exec_fd: %m");
3773 }
3774 }
12145637 3775
5686391b
LP
3776 if (r == -ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3777 log_struct_errno(LOG_INFO, r,
12145637
LP
3778 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3779 LOG_UNIT_ID(unit),
3780 LOG_UNIT_INVOCATION_ID(unit),
3781 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3782 command->path),
a1230ff9 3783 "EXECUTABLE=%s", command->path);
12145637
LP
3784 return 0;
3785 }
3786
ff0af2a1 3787 *exit_status = EXIT_EXEC;
5686391b 3788 return log_unit_error_errno(unit, r, "Failed to execute command: %m");
d35fbf6b 3789}
81a2b7ce 3790
/* Forward declarations: both helpers are defined further down in this file but are already called by
 * exec_spawn() below. */
34cf6c43 3791static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
2caa38e9 3792static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[static 3]);
34cf6c43 3793
f2341e0a
LP
3794int exec_spawn(Unit *unit,
3795 ExecCommand *command,
d35fbf6b
DM
3796 const ExecContext *context,
3797 const ExecParameters *params,
3798 ExecRuntime *runtime,
29206d46 3799 DynamicCreds *dcreds,
d35fbf6b 3800 pid_t *ret) {
8351ceae 3801
ee39ca20 3802 int socket_fd, r, named_iofds[3] = { -1, -1, -1 }, *fds = NULL;
78f93209 3803 _cleanup_free_ char *subcgroup_path = NULL;
d35fbf6b 3804 _cleanup_strv_free_ char **files_env = NULL;
da6053d0 3805 size_t n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1 3806 _cleanup_free_ char *line = NULL;
d35fbf6b 3807 pid_t pid;
8351ceae 3808
f2341e0a 3809 assert(unit);
d35fbf6b
DM
3810 assert(command);
3811 assert(context);
3812 assert(ret);
3813 assert(params);
25b583d7 3814 assert(params->fds || (params->n_socket_fds + params->n_storage_fds <= 0));
4298d0b5 3815
d35fbf6b
DM
3816 if (context->std_input == EXEC_INPUT_SOCKET ||
3817 context->std_output == EXEC_OUTPUT_SOCKET ||
3818 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 3819
4c47affc 3820 if (params->n_socket_fds > 1) {
f2341e0a 3821 log_unit_error(unit, "Got more than one socket.");
d35fbf6b 3822 return -EINVAL;
ff0af2a1 3823 }
eef65bf3 3824
4c47affc 3825 if (params->n_socket_fds == 0) {
488ab41c
AA
3826 log_unit_error(unit, "Got no socket.");
3827 return -EINVAL;
3828 }
3829
d35fbf6b
DM
3830 socket_fd = params->fds[0];
3831 } else {
3832 socket_fd = -1;
3833 fds = params->fds;
9b141911 3834 n_socket_fds = params->n_socket_fds;
25b583d7 3835 n_storage_fds = params->n_storage_fds;
d35fbf6b 3836 }
94f04347 3837
34cf6c43 3838 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
3839 if (r < 0)
3840 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3841
f2341e0a 3842 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 3843 if (r < 0)
f2341e0a 3844 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 3845
ee39ca20 3846 line = exec_command_line(command->argv);
d35fbf6b
DM
3847 if (!line)
3848 return log_oom();
fab56fc5 3849
f2341e0a 3850 log_struct(LOG_DEBUG,
f2341e0a
LP
3851 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3852 "EXECUTABLE=%s", command->path,
ba360bb0 3853 LOG_UNIT_ID(unit),
a1230ff9 3854 LOG_UNIT_INVOCATION_ID(unit));
12145637 3855
78f93209
LP
3856 if (params->cgroup_path) {
3857 r = exec_parameters_get_cgroup_path(params, &subcgroup_path);
3858 if (r < 0)
3859 return log_unit_error_errno(unit, r, "Failed to acquire subcgroup path: %m");
3860 if (r > 0) { /* We are using a child cgroup */
3861 r = cg_create(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path);
3862 if (r < 0)
3863 return log_unit_error_errno(unit, r, "Failed to create control group '%s': %m", subcgroup_path);
3864 }
3865 }
3866
d35fbf6b
DM
3867 pid = fork();
3868 if (pid < 0)
74129a12 3869 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
3870
3871 if (pid == 0) {
12145637 3872 int exit_status = EXIT_SUCCESS;
ff0af2a1 3873
f2341e0a
LP
3874 r = exec_child(unit,
3875 command,
ff0af2a1
LP
3876 context,
3877 params,
3878 runtime,
29206d46 3879 dcreds,
ff0af2a1 3880 socket_fd,
52c239d7 3881 named_iofds,
4c47affc 3882 fds,
9b141911 3883 n_socket_fds,
25b583d7 3884 n_storage_fds,
ff0af2a1 3885 files_env,
00d9ef85 3886 unit->manager->user_lookup_fds[1],
12145637
LP
3887 &exit_status);
3888
e1714f02
ZJS
3889 if (r < 0) {
3890 const char *status =
3891 exit_status_to_string(exit_status,
e04ed6db 3892 EXIT_STATUS_LIBC | EXIT_STATUS_SYSTEMD);
e1714f02 3893
12145637
LP
3894 log_struct_errno(LOG_ERR, r,
3895 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3896 LOG_UNIT_ID(unit),
3897 LOG_UNIT_INVOCATION_ID(unit),
3898 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
e1714f02 3899 status, command->path),
a1230ff9 3900 "EXECUTABLE=%s", command->path);
e1714f02 3901 }
4c2630eb 3902
ff0af2a1 3903 _exit(exit_status);
034c6ed7
LP
3904 }
3905
f2341e0a 3906 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 3907
78f93209
LP
3908 /* We add the new process to the cgroup both in the child (so that we can be sure that no user code is ever
3909 * executed outside of the cgroup) and in the parent (so that we can be sure that when we kill the cgroup the
3910 * process will be killed too). */
3911 if (subcgroup_path)
3912 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, subcgroup_path, pid);
2da3263a 3913
b58b4116 3914 exec_status_start(&command->exec_status, pid);
9fb86720 3915
034c6ed7 3916 *ret = pid;
5cb5a6ff
LP
3917 return 0;
3918}
3919
034c6ed7 3920void exec_context_init(ExecContext *c) {
3536f49e
YW
3921 ExecDirectoryType i;
3922
034c6ed7
LP
3923 assert(c);
3924
4c12626c 3925 c->umask = 0022;
9eba9da4 3926 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
94f04347 3927 c->cpu_sched_policy = SCHED_OTHER;
071830ff 3928 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 3929 c->syslog_level_prefix = true;
353e12c2 3930 c->ignore_sigpipe = true;
3a43da28 3931 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 3932 c->personality = PERSONALITY_INVALID;
72fd1768 3933 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3536f49e 3934 c->directories[i].mode = 0755;
a103496c 3935 c->capability_bounding_set = CAP_ALL;
aa9d574d
YW
3936 assert_cc(NAMESPACE_FLAGS_INITIAL != NAMESPACE_FLAGS_ALL);
3937 c->restrict_namespaces = NAMESPACE_FLAGS_INITIAL;
d3070fbd 3938 c->log_level_max = -1;
b070c7c0 3939 numa_policy_reset(&c->numa_policy);
034c6ed7
LP
3940}
3941
613b411c 3942void exec_context_done(ExecContext *c) {
3536f49e 3943 ExecDirectoryType i;
d3070fbd 3944 size_t l;
5cb5a6ff
LP
3945
3946 assert(c);
3947
6796073e
LP
3948 c->environment = strv_free(c->environment);
3949 c->environment_files = strv_free(c->environment_files);
b4c14404 3950 c->pass_environment = strv_free(c->pass_environment);
00819cc1 3951 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 3952
31ce987c 3953 rlimit_free_all(c->rlimit);
034c6ed7 3954
2038c3f5 3955 for (l = 0; l < 3; l++) {
52c239d7 3956 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
3957 c->stdio_file[l] = mfree(c->stdio_file[l]);
3958 }
52c239d7 3959
a1e58e8e
LP
3960 c->working_directory = mfree(c->working_directory);
3961 c->root_directory = mfree(c->root_directory);
915e6d16 3962 c->root_image = mfree(c->root_image);
a1e58e8e
LP
3963 c->tty_path = mfree(c->tty_path);
3964 c->syslog_identifier = mfree(c->syslog_identifier);
3965 c->user = mfree(c->user);
3966 c->group = mfree(c->group);
034c6ed7 3967
6796073e 3968 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 3969
a1e58e8e 3970 c->pam_name = mfree(c->pam_name);
5b6319dc 3971
2a624c36
AP
3972 c->read_only_paths = strv_free(c->read_only_paths);
3973 c->read_write_paths = strv_free(c->read_write_paths);
3974 c->inaccessible_paths = strv_free(c->inaccessible_paths);
82c121a4 3975
d2d6c096 3976 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
3977 c->bind_mounts = NULL;
3978 c->n_bind_mounts = 0;
2abd4e38
YW
3979 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3980 c->temporary_filesystems = NULL;
3981 c->n_temporary_filesystems = 0;
d2d6c096 3982
0985c7c4 3983 cpu_set_reset(&c->cpu_set);
b070c7c0 3984 numa_policy_reset(&c->numa_policy);
86a3475b 3985
a1e58e8e
LP
3986 c->utmp_id = mfree(c->utmp_id);
3987 c->selinux_context = mfree(c->selinux_context);
3988 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 3989 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 3990
8cfa775f 3991 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
3992 c->syscall_archs = set_free(c->syscall_archs);
3993 c->address_families = set_free(c->address_families);
e66cf1a3 3994
72fd1768 3995 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3536f49e 3996 c->directories[i].paths = strv_free(c->directories[i].paths);
d3070fbd
LP
3997
3998 c->log_level_max = -1;
3999
4000 exec_context_free_log_extra_fields(c);
08f3be7a 4001
90fc172e
AZ
4002 c->log_rate_limit_interval_usec = 0;
4003 c->log_rate_limit_burst = 0;
4004
08f3be7a
LP
4005 c->stdin_data = mfree(c->stdin_data);
4006 c->stdin_data_size = 0;
a8d08f39
LP
4007
4008 c->network_namespace_path = mfree(c->network_namespace_path);
e66cf1a3
LP
4009}
4010
34cf6c43 4011int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
4012 char **i;
4013
4014 assert(c);
4015
4016 if (!runtime_prefix)
4017 return 0;
4018
3536f49e 4019 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
e66cf1a3
LP
4020 _cleanup_free_ char *p;
4021
494d0247
YW
4022 if (exec_directory_is_private(c, EXEC_DIRECTORY_RUNTIME))
4023 p = path_join(runtime_prefix, "private", *i);
4024 else
4025 p = path_join(runtime_prefix, *i);
e66cf1a3
LP
4026 if (!p)
4027 return -ENOMEM;
4028
7bc4bf4a
LP
4029 /* We execute this synchronously, since we need to be sure this is gone when we start the
4030 * service next. */
c6878637 4031 (void) rm_rf(p, REMOVE_ROOT);
e66cf1a3
LP
4032 }
4033
4034 return 0;
5cb5a6ff
LP
4035}
4036
34cf6c43 4037static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
4038 assert(c);
4039
a1e58e8e 4040 c->path = mfree(c->path);
6796073e 4041 c->argv = strv_free(c->argv);
43d0fcbd
LP
4042}
4043
da6053d0
LP
4044void exec_command_done_array(ExecCommand *c, size_t n) {
4045 size_t i;
43d0fcbd
LP
4046
4047 for (i = 0; i < n; i++)
4048 exec_command_done(c+i);
4049}
4050
f1acf85a 4051ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
4052 ExecCommand *i;
4053
4054 while ((i = c)) {
71fda00f 4055 LIST_REMOVE(command, c, i);
43d0fcbd 4056 exec_command_done(i);
5cb5a6ff
LP
4057 free(i);
4058 }
f1acf85a
ZJS
4059
4060 return NULL;
5cb5a6ff
LP
4061}
4062
da6053d0
LP
4063void exec_command_free_array(ExecCommand **c, size_t n) {
4064 size_t i;
034c6ed7 4065
f1acf85a
ZJS
4066 for (i = 0; i < n; i++)
4067 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
4068}
4069
6a1d4d9f
LP
4070void exec_command_reset_status_array(ExecCommand *c, size_t n) {
4071 size_t i;
4072
4073 for (i = 0; i < n; i++)
4074 exec_status_reset(&c[i].exec_status);
4075}
4076
4077void exec_command_reset_status_list_array(ExecCommand **c, size_t n) {
4078 size_t i;
4079
4080 for (i = 0; i < n; i++) {
4081 ExecCommand *z;
4082
4083 LIST_FOREACH(command, z, c[i])
4084 exec_status_reset(&z->exec_status);
4085 }
4086}
4087
039f0e70 4088typedef struct InvalidEnvInfo {
34cf6c43 4089 const Unit *unit;
039f0e70
LP
4090 const char *path;
4091} InvalidEnvInfo;
4092
4093static void invalid_env(const char *p, void *userdata) {
4094 InvalidEnvInfo *info = userdata;
4095
f2341e0a 4096 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
4097}
4098
52c239d7
LB
4099const char* exec_context_fdname(const ExecContext *c, int fd_index) {
4100 assert(c);
4101
4102 switch (fd_index) {
5073ff6b 4103
52c239d7
LB
4104 case STDIN_FILENO:
4105 if (c->std_input != EXEC_INPUT_NAMED_FD)
4106 return NULL;
5073ff6b 4107
52c239d7 4108 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 4109
52c239d7
LB
4110 case STDOUT_FILENO:
4111 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
4112 return NULL;
5073ff6b 4113
52c239d7 4114 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 4115
52c239d7
LB
4116 case STDERR_FILENO:
4117 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
4118 return NULL;
5073ff6b 4119
52c239d7 4120 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 4121
52c239d7
LB
4122 default:
4123 return NULL;
4124 }
4125}
4126
2caa38e9
LP
4127static int exec_context_named_iofds(
4128 const ExecContext *c,
4129 const ExecParameters *p,
4130 int named_iofds[static 3]) {
4131
da6053d0 4132 size_t i, targets;
56fbd561 4133 const char* stdio_fdname[3];
da6053d0 4134 size_t n_fds;
52c239d7
LB
4135
4136 assert(c);
4137 assert(p);
2caa38e9 4138 assert(named_iofds);
52c239d7
LB
4139
4140 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
4141 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
4142 (c->std_error == EXEC_OUTPUT_NAMED_FD);
4143
4144 for (i = 0; i < 3; i++)
4145 stdio_fdname[i] = exec_context_fdname(c, i);
4146
4c47affc
FB
4147 n_fds = p->n_storage_fds + p->n_socket_fds;
4148
4149 for (i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
4150 if (named_iofds[STDIN_FILENO] < 0 &&
4151 c->std_input == EXEC_INPUT_NAMED_FD &&
4152 stdio_fdname[STDIN_FILENO] &&
4153 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
4154
52c239d7
LB
4155 named_iofds[STDIN_FILENO] = p->fds[i];
4156 targets--;
56fbd561
ZJS
4157
4158 } else if (named_iofds[STDOUT_FILENO] < 0 &&
4159 c->std_output == EXEC_OUTPUT_NAMED_FD &&
4160 stdio_fdname[STDOUT_FILENO] &&
4161 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
4162
52c239d7
LB
4163 named_iofds[STDOUT_FILENO] = p->fds[i];
4164 targets--;
56fbd561
ZJS
4165
4166 } else if (named_iofds[STDERR_FILENO] < 0 &&
4167 c->std_error == EXEC_OUTPUT_NAMED_FD &&
4168 stdio_fdname[STDERR_FILENO] &&
4169 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
4170
52c239d7
LB
4171 named_iofds[STDERR_FILENO] = p->fds[i];
4172 targets--;
4173 }
4174
56fbd561 4175 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
4176}
4177
34cf6c43 4178static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
4179 char **i, **r = NULL;
4180
4181 assert(c);
4182 assert(l);
4183
4184 STRV_FOREACH(i, c->environment_files) {
4185 char *fn;
52511fae
ZJS
4186 int k;
4187 unsigned n;
8c7be95e
LP
4188 bool ignore = false;
4189 char **p;
7fd1b19b 4190 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
4191
4192 fn = *i;
4193
4194 if (fn[0] == '-') {
4195 ignore = true;
313cefa1 4196 fn++;
8c7be95e
LP
4197 }
4198
4199 if (!path_is_absolute(fn)) {
8c7be95e
LP
4200 if (ignore)
4201 continue;
4202
4203 strv_free(r);
4204 return -EINVAL;
4205 }
4206
2bef10ab 4207 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
4208 k = safe_glob(fn, 0, &pglob);
4209 if (k < 0) {
2bef10ab
PL
4210 if (ignore)
4211 continue;
8c7be95e 4212
2bef10ab 4213 strv_free(r);
d8c92e8b 4214 return k;
2bef10ab 4215 }
8c7be95e 4216
d8c92e8b
ZJS
4217 /* When we don't match anything, -ENOENT should be returned */
4218 assert(pglob.gl_pathc > 0);
4219
4220 for (n = 0; n < pglob.gl_pathc; n++) {
aa8fbc74 4221 k = load_env_file(NULL, pglob.gl_pathv[n], &p);
2bef10ab
PL
4222 if (k < 0) {
4223 if (ignore)
4224 continue;
8c7be95e 4225
2bef10ab 4226 strv_free(r);
2bef10ab 4227 return k;
e9c1ea9d 4228 }
ebc05a09 4229 /* Log invalid environment variables with filename */
039f0e70
LP
4230 if (p) {
4231 InvalidEnvInfo info = {
f2341e0a 4232 .unit = unit,
039f0e70
LP
4233 .path = pglob.gl_pathv[n]
4234 };
4235
4236 p = strv_env_clean_with_callback(p, invalid_env, &info);
4237 }
8c7be95e 4238
234519ae 4239 if (!r)
2bef10ab
PL
4240 r = p;
4241 else {
4242 char **m;
8c7be95e 4243
2bef10ab
PL
4244 m = strv_env_merge(2, r, p);
4245 strv_free(r);
4246 strv_free(p);
c84a9488 4247 if (!m)
2bef10ab 4248 return -ENOMEM;
2bef10ab
PL
4249
4250 r = m;
4251 }
8c7be95e
LP
4252 }
4253 }
4254
4255 *l = r;
4256
4257 return 0;
4258}
4259
6ac8fdc9 4260static bool tty_may_match_dev_console(const char *tty) {
7b912648 4261 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 4262
1e22b5cd
LP
4263 if (!tty)
4264 return true;
4265
a119ec7c 4266 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
4267
4268 /* trivial identity? */
4269 if (streq(tty, "console"))
4270 return true;
4271
7b912648
LP
4272 if (resolve_dev_console(&resolved) < 0)
4273 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
4274
4275 /* "tty0" means the active VC, so it may be the same sometimes */
955f1c85 4276 return path_equal(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
4277}
4278
6c0ae739
LP
4279static bool exec_context_may_touch_tty(const ExecContext *ec) {
4280 assert(ec);
1e22b5cd 4281
6c0ae739 4282 return ec->tty_reset ||
1e22b5cd
LP
4283 ec->tty_vhangup ||
4284 ec->tty_vt_disallocate ||
6ac8fdc9
MS
4285 is_terminal_input(ec->std_input) ||
4286 is_terminal_output(ec->std_output) ||
6c0ae739
LP
4287 is_terminal_output(ec->std_error);
4288}
4289
4290bool exec_context_may_touch_console(const ExecContext *ec) {
4291
4292 return exec_context_may_touch_tty(ec) &&
1e22b5cd 4293 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
4294}
4295
15ae422b
LP
4296static void strv_fprintf(FILE *f, char **l) {
4297 char **g;
4298
4299 assert(f);
4300
4301 STRV_FOREACH(g, l)
4302 fprintf(f, " %s", *g);
4303}
4304
34cf6c43 4305void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
d3070fbd 4306 ExecDirectoryType dt;
c2bbd90b 4307 char **e, **d;
94f04347 4308 unsigned i;
add00535 4309 int r;
9eba9da4 4310
5cb5a6ff
LP
4311 assert(c);
4312 assert(f);
4313
4ad49000 4314 prefix = strempty(prefix);
5cb5a6ff
LP
4315
4316 fprintf(f,
94f04347
LP
4317 "%sUMask: %04o\n"
4318 "%sWorkingDirectory: %s\n"
451a074f 4319 "%sRootDirectory: %s\n"
15ae422b 4320 "%sNonBlocking: %s\n"
64747e2d 4321 "%sPrivateTmp: %s\n"
7f112f50 4322 "%sPrivateDevices: %s\n"
59eeb84b 4323 "%sProtectKernelTunables: %s\n"
e66a2f65 4324 "%sProtectKernelModules: %s\n"
59eeb84b 4325 "%sProtectControlGroups: %s\n"
d251207d
LP
4326 "%sPrivateNetwork: %s\n"
4327 "%sPrivateUsers: %s\n"
1b8689f9
LP
4328 "%sProtectHome: %s\n"
4329 "%sProtectSystem: %s\n"
5d997827 4330 "%sMountAPIVFS: %s\n"
f3e43635 4331 "%sIgnoreSIGPIPE: %s\n"
f4170c67 4332 "%sMemoryDenyWriteExecute: %s\n"
b1edf445 4333 "%sRestrictRealtime: %s\n"
f69567cb 4334 "%sRestrictSUIDSGID: %s\n"
aecd5ac6
TM
4335 "%sKeyringMode: %s\n"
4336 "%sProtectHostname: %s\n",
5cb5a6ff 4337 prefix, c->umask,
9eba9da4 4338 prefix, c->working_directory ? c->working_directory : "/",
451a074f 4339 prefix, c->root_directory ? c->root_directory : "/",
15ae422b 4340 prefix, yes_no(c->non_blocking),
64747e2d 4341 prefix, yes_no(c->private_tmp),
7f112f50 4342 prefix, yes_no(c->private_devices),
59eeb84b 4343 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 4344 prefix, yes_no(c->protect_kernel_modules),
59eeb84b 4345 prefix, yes_no(c->protect_control_groups),
d251207d
LP
4346 prefix, yes_no(c->private_network),
4347 prefix, yes_no(c->private_users),
1b8689f9
LP
4348 prefix, protect_home_to_string(c->protect_home),
4349 prefix, protect_system_to_string(c->protect_system),
5d997827 4350 prefix, yes_no(c->mount_apivfs),
f3e43635 4351 prefix, yes_no(c->ignore_sigpipe),
f4170c67 4352 prefix, yes_no(c->memory_deny_write_execute),
b1edf445 4353 prefix, yes_no(c->restrict_realtime),
f69567cb 4354 prefix, yes_no(c->restrict_suid_sgid),
aecd5ac6
TM
4355 prefix, exec_keyring_mode_to_string(c->keyring_mode),
4356 prefix, yes_no(c->protect_hostname));
fb33a393 4357
915e6d16
LP
4358 if (c->root_image)
4359 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
4360
8c7be95e
LP
4361 STRV_FOREACH(e, c->environment)
4362 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
4363
4364 STRV_FOREACH(e, c->environment_files)
4365 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 4366
b4c14404
FB
4367 STRV_FOREACH(e, c->pass_environment)
4368 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
4369
00819cc1
LP
4370 STRV_FOREACH(e, c->unset_environment)
4371 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
4372
53f47dfc
YW
4373 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
4374
72fd1768 4375 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
4376 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
4377
4378 STRV_FOREACH(d, c->directories[dt].paths)
4379 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
4380 }
c2bbd90b 4381
fb33a393
LP
4382 if (c->nice_set)
4383 fprintf(f,
4384 "%sNice: %i\n",
4385 prefix, c->nice);
4386
dd6c17b1 4387 if (c->oom_score_adjust_set)
fb33a393 4388 fprintf(f,
dd6c17b1
LP
4389 "%sOOMScoreAdjust: %i\n",
4390 prefix, c->oom_score_adjust);
9eba9da4 4391
94f04347 4392 for (i = 0; i < RLIM_NLIMITS; i++)
3c11da9d 4393 if (c->rlimit[i]) {
4c3a2b84 4394 fprintf(f, "%sLimit%s: " RLIM_FMT "\n",
3c11da9d 4395 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
4c3a2b84 4396 fprintf(f, "%sLimit%sSoft: " RLIM_FMT "\n",
3c11da9d
EV
4397 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
4398 }
94f04347 4399
f8b69d1d 4400 if (c->ioprio_set) {
1756a011 4401 _cleanup_free_ char *class_str = NULL;
f8b69d1d 4402
837df140
YW
4403 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
4404 if (r >= 0)
4405 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
4406
4407 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
f8b69d1d 4408 }
94f04347 4409
f8b69d1d 4410 if (c->cpu_sched_set) {
1756a011 4411 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 4412
837df140
YW
4413 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
4414 if (r >= 0)
4415 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4416
94f04347 4417 fprintf(f,
38b48754
LP
4418 "%sCPUSchedulingPriority: %i\n"
4419 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
4420 prefix, c->cpu_sched_priority,
4421 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 4422 }
94f04347 4423
0985c7c4 4424 if (c->cpu_set.set) {
e7fca352
MS
4425 _cleanup_free_ char *affinity = NULL;
4426
4427 affinity = cpu_set_to_range_string(&c->cpu_set);
4428 fprintf(f, "%sCPUAffinity: %s\n", prefix, affinity);
94f04347
LP
4429 }
4430
b070c7c0
MS
4431 if (mpol_is_valid(numa_policy_get_type(&c->numa_policy))) {
4432 _cleanup_free_ char *nodes = NULL;
4433
4434 nodes = cpu_set_to_range_string(&c->numa_policy.nodes);
4435 fprintf(f, "%sNUMAPolicy: %s\n", prefix, mpol_to_string(numa_policy_get_type(&c->numa_policy)));
4436 fprintf(f, "%sNUMAMask: %s\n", prefix, strnull(nodes));
4437 }
4438
3a43da28 4439 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 4440 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
4441
4442 fprintf(f,
80876c20
LP
4443 "%sStandardInput: %s\n"
4444 "%sStandardOutput: %s\n"
4445 "%sStandardError: %s\n",
4446 prefix, exec_input_to_string(c->std_input),
4447 prefix, exec_output_to_string(c->std_output),
4448 prefix, exec_output_to_string(c->std_error));
4449
befc4a80
LP
4450 if (c->std_input == EXEC_INPUT_NAMED_FD)
4451 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4452 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4453 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4454 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4455 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4456
4457 if (c->std_input == EXEC_INPUT_FILE)
4458 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4459 if (c->std_output == EXEC_OUTPUT_FILE)
4460 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
566b7d23
ZD
4461 if (c->std_output == EXEC_OUTPUT_FILE_APPEND)
4462 fprintf(f, "%sStandardOutputFileToAppend: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
befc4a80
LP
4463 if (c->std_error == EXEC_OUTPUT_FILE)
4464 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
566b7d23
ZD
4465 if (c->std_error == EXEC_OUTPUT_FILE_APPEND)
4466 fprintf(f, "%sStandardErrorFileToAppend: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
befc4a80 4467
80876c20
LP
4468 if (c->tty_path)
4469 fprintf(f,
6ea832a2
LP
4470 "%sTTYPath: %s\n"
4471 "%sTTYReset: %s\n"
4472 "%sTTYVHangup: %s\n"
4473 "%sTTYVTDisallocate: %s\n",
4474 prefix, c->tty_path,
4475 prefix, yes_no(c->tty_reset),
4476 prefix, yes_no(c->tty_vhangup),
4477 prefix, yes_no(c->tty_vt_disallocate));
94f04347 4478
9f6444eb
LP
4479 if (IN_SET(c->std_output,
4480 EXEC_OUTPUT_SYSLOG,
4481 EXEC_OUTPUT_KMSG,
4482 EXEC_OUTPUT_JOURNAL,
4483 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4484 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4485 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4486 IN_SET(c->std_error,
4487 EXEC_OUTPUT_SYSLOG,
4488 EXEC_OUTPUT_KMSG,
4489 EXEC_OUTPUT_JOURNAL,
4490 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4491 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4492 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 4493
5ce70e5b 4494 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 4495
837df140
YW
4496 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4497 if (r >= 0)
4498 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 4499
837df140
YW
4500 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4501 if (r >= 0)
4502 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 4503 }
94f04347 4504
d3070fbd
LP
4505 if (c->log_level_max >= 0) {
4506 _cleanup_free_ char *t = NULL;
4507
4508 (void) log_level_to_string_alloc(c->log_level_max, &t);
4509
4510 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4511 }
4512
90fc172e
AZ
4513 if (c->log_rate_limit_interval_usec > 0) {
4514 char buf_timespan[FORMAT_TIMESPAN_MAX];
4515
4516 fprintf(f,
4517 "%sLogRateLimitIntervalSec: %s\n",
4518 prefix, format_timespan(buf_timespan, sizeof(buf_timespan), c->log_rate_limit_interval_usec, USEC_PER_SEC));
4519 }
4520
4521 if (c->log_rate_limit_burst > 0)
4522 fprintf(f, "%sLogRateLimitBurst: %u\n", prefix, c->log_rate_limit_burst);
4523
d3070fbd
LP
4524 if (c->n_log_extra_fields > 0) {
4525 size_t j;
4526
4527 for (j = 0; j < c->n_log_extra_fields; j++) {
4528 fprintf(f, "%sLogExtraFields: ", prefix);
4529 fwrite(c->log_extra_fields[j].iov_base,
4530 1, c->log_extra_fields[j].iov_len,
4531 f);
4532 fputc('\n', f);
4533 }
4534 }
4535
07d46372
YW
4536 if (c->secure_bits) {
4537 _cleanup_free_ char *str = NULL;
4538
4539 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4540 if (r >= 0)
4541 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4542 }
94f04347 4543
a103496c 4544 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 4545 _cleanup_free_ char *str = NULL;
94f04347 4546
dd1f5bd0
YW
4547 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4548 if (r >= 0)
4549 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
4550 }
4551
4552 if (c->capability_ambient_set != 0) {
dd1f5bd0 4553 _cleanup_free_ char *str = NULL;
755d4b67 4554
dd1f5bd0
YW
4555 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4556 if (r >= 0)
4557 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
4558 }
4559
4560 if (c->user)
f2d3769a 4561 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 4562 if (c->group)
f2d3769a 4563 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 4564
29206d46
LP
4565 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4566
ac6e8be6 4567 if (!strv_isempty(c->supplementary_groups)) {
94f04347 4568 fprintf(f, "%sSupplementaryGroups:", prefix);
15ae422b
LP
4569 strv_fprintf(f, c->supplementary_groups);
4570 fputs("\n", f);
4571 }
94f04347 4572
5b6319dc 4573 if (c->pam_name)
f2d3769a 4574 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 4575
58629001 4576 if (!strv_isempty(c->read_write_paths)) {
2a624c36
AP
4577 fprintf(f, "%sReadWritePaths:", prefix);
4578 strv_fprintf(f, c->read_write_paths);
15ae422b
LP
4579 fputs("\n", f);
4580 }
4581
58629001 4582 if (!strv_isempty(c->read_only_paths)) {
2a624c36
AP
4583 fprintf(f, "%sReadOnlyPaths:", prefix);
4584 strv_fprintf(f, c->read_only_paths);
15ae422b
LP
4585 fputs("\n", f);
4586 }
94f04347 4587
58629001 4588 if (!strv_isempty(c->inaccessible_paths)) {
2a624c36
AP
4589 fprintf(f, "%sInaccessiblePaths:", prefix);
4590 strv_fprintf(f, c->inaccessible_paths);
94f04347
LP
4591 fputs("\n", f);
4592 }
2e22afe9 4593
d2d6c096 4594 if (c->n_bind_mounts > 0)
4ca763a9
YW
4595 for (i = 0; i < c->n_bind_mounts; i++)
4596 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
d2d6c096 4597 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4ca763a9 4598 c->bind_mounts[i].ignore_enoent ? "-": "",
d2d6c096
LP
4599 c->bind_mounts[i].source,
4600 c->bind_mounts[i].destination,
4601 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 4602
2abd4e38
YW
4603 if (c->n_temporary_filesystems > 0)
4604 for (i = 0; i < c->n_temporary_filesystems; i++) {
4605 TemporaryFileSystem *t = c->temporary_filesystems + i;
4606
4607 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4608 t->path,
4609 isempty(t->options) ? "" : ":",
4610 strempty(t->options));
4611 }
4612
169c1bda
LP
4613 if (c->utmp_id)
4614 fprintf(f,
4615 "%sUtmpIdentifier: %s\n",
4616 prefix, c->utmp_id);
7b52a628
MS
4617
4618 if (c->selinux_context)
4619 fprintf(f,
5f8640fb
LP
4620 "%sSELinuxContext: %s%s\n",
4621 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 4622
80c21aea
WC
4623 if (c->apparmor_profile)
4624 fprintf(f,
4625 "%sAppArmorProfile: %s%s\n",
4626 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4627
4628 if (c->smack_process_label)
4629 fprintf(f,
4630 "%sSmackProcessLabel: %s%s\n",
4631 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4632
050f7277 4633 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
4634 fprintf(f,
4635 "%sPersonality: %s\n",
4636 prefix, strna(personality_to_string(c->personality)));
4637
78e864e5
TM
4638 fprintf(f,
4639 "%sLockPersonality: %s\n",
4640 prefix, yes_no(c->lock_personality));
4641
17df7223 4642 if (c->syscall_filter) {
349cc4a5 4643#if HAVE_SECCOMP
17df7223 4644 Iterator j;
8cfa775f 4645 void *id, *val;
17df7223 4646 bool first = true;
351a19b1 4647#endif
17df7223
LP
4648
4649 fprintf(f,
57183d11 4650 "%sSystemCallFilter: ",
17df7223
LP
4651 prefix);
4652
4653 if (!c->syscall_whitelist)
4654 fputc('~', f);
4655
349cc4a5 4656#if HAVE_SECCOMP
8cfa775f 4657 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
17df7223 4658 _cleanup_free_ char *name = NULL;
8cfa775f
YW
4659 const char *errno_name = NULL;
4660 int num = PTR_TO_INT(val);
17df7223
LP
4661
4662 if (first)
4663 first = false;
4664 else
4665 fputc(' ', f);
4666
57183d11 4667 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 4668 fputs(strna(name), f);
8cfa775f
YW
4669
4670 if (num >= 0) {
4671 errno_name = errno_to_name(num);
4672 if (errno_name)
4673 fprintf(f, ":%s", errno_name);
4674 else
4675 fprintf(f, ":%d", num);
4676 }
17df7223 4677 }
351a19b1 4678#endif
17df7223
LP
4679
4680 fputc('\n', f);
4681 }
4682
57183d11 4683 if (c->syscall_archs) {
349cc4a5 4684#if HAVE_SECCOMP
57183d11
LP
4685 Iterator j;
4686 void *id;
4687#endif
4688
4689 fprintf(f,
4690 "%sSystemCallArchitectures:",
4691 prefix);
4692
349cc4a5 4693#if HAVE_SECCOMP
57183d11
LP
4694 SET_FOREACH(id, c->syscall_archs, j)
4695 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4696#endif
4697 fputc('\n', f);
4698 }
4699
add00535
LP
4700 if (exec_context_restrict_namespaces_set(c)) {
4701 _cleanup_free_ char *s = NULL;
4702
86c2a9f1 4703 r = namespace_flags_to_string(c->restrict_namespaces, &s);
add00535
LP
4704 if (r >= 0)
4705 fprintf(f, "%sRestrictNamespaces: %s\n",
4706 prefix, s);
4707 }
4708
a8d08f39
LP
4709 if (c->network_namespace_path)
4710 fprintf(f,
4711 "%sNetworkNamespacePath: %s\n",
4712 prefix, c->network_namespace_path);
4713
3df90f24
YW
4714 if (c->syscall_errno > 0) {
4715 const char *errno_name;
4716
4717 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4718
4719 errno_name = errno_to_name(c->syscall_errno);
4720 if (errno_name)
4721 fprintf(f, "%s\n", errno_name);
4722 else
4723 fprintf(f, "%d\n", c->syscall_errno);
4724 }
5cb5a6ff
LP
4725}
4726
34cf6c43 4727bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
4728 assert(c);
4729
61233823 4730 /* Returns true if the process forked off would run under
a931ad47
LP
4731 * an unchanged UID or as root. */
4732
4733 if (!c->user)
4734 return true;
4735
4736 if (streq(c->user, "root") || streq(c->user, "0"))
4737 return true;
4738
4739 return false;
4740}
4741
34cf6c43 4742int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
4743 int p;
4744
4745 assert(c);
4746
4747 if (c->ioprio_set)
4748 return c->ioprio;
4749
4750 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4751 if (p < 0)
4752 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4753
4754 return p;
4755}
4756
d3070fbd
LP
4757void exec_context_free_log_extra_fields(ExecContext *c) {
4758 size_t l;
4759
4760 assert(c);
4761
4762 for (l = 0; l < c->n_log_extra_fields; l++)
4763 free(c->log_extra_fields[l].iov_base);
4764 c->log_extra_fields = mfree(c->log_extra_fields);
4765 c->n_log_extra_fields = 0;
4766}
4767
6f765baf
LP
4768void exec_context_revert_tty(ExecContext *c) {
4769 int r;
4770
4771 assert(c);
4772
4773 /* First, reset the TTY (possibly kicking everybody else from the TTY) */
4774 exec_context_tty_reset(c, NULL);
4775
4776 /* And then undo what chown_terminal() did earlier. Note that we only do this if we have a path
4777 * configured. If the TTY was passed to us as file descriptor we assume the TTY is opened and managed
4778 * by whoever passed it to us and thus knows better when and how to chmod()/chown() it back. */
4779
4780 if (exec_context_may_touch_tty(c)) {
4781 const char *path;
4782
4783 path = exec_context_tty_path(c);
4784 if (path) {
4785 r = chmod_and_chown(path, TTY_MODE, 0, TTY_GID);
4786 if (r < 0 && r != -ENOENT)
4787 log_warning_errno(r, "Failed to reset TTY ownership/access mode of %s, ignoring: %m", path);
4788 }
4789 }
4790}
4791
4c2f5842
LP
4792int exec_context_get_clean_directories(
4793 ExecContext *c,
4794 char **prefix,
4795 ExecCleanMask mask,
4796 char ***ret) {
4797
4798 _cleanup_strv_free_ char **l = NULL;
4799 ExecDirectoryType t;
4800 int r;
4801
4802 assert(c);
4803 assert(prefix);
4804 assert(ret);
4805
4806 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
4807 char **i;
4808
4809 if (!FLAGS_SET(mask, 1U << t))
4810 continue;
4811
4812 if (!prefix[t])
4813 continue;
4814
4815 STRV_FOREACH(i, c->directories[t].paths) {
4816 char *j;
4817
4818 j = path_join(prefix[t], *i);
4819 if (!j)
4820 return -ENOMEM;
4821
4822 r = strv_consume(&l, j);
4823 if (r < 0)
4824 return r;
4825 }
4826 }
4827
4828 *ret = TAKE_PTR(l);
4829 return 0;
4830}
4831
4832int exec_context_get_clean_mask(ExecContext *c, ExecCleanMask *ret) {
4833 ExecCleanMask mask = 0;
4834
4835 assert(c);
4836 assert(ret);
4837
4838 for (ExecDirectoryType t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++)
4839 if (!strv_isempty(c->directories[t].paths))
4840 mask |= 1U << t;
4841
4842 *ret = mask;
4843 return 0;
4844}
4845
b58b4116 4846void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 4847 assert(s);
5cb5a6ff 4848
2ed26ed0
LP
4849 *s = (ExecStatus) {
4850 .pid = pid,
4851 };
4852
b58b4116
LP
4853 dual_timestamp_get(&s->start_timestamp);
4854}
4855
34cf6c43 4856void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
4857 assert(s);
4858
2ed26ed0
LP
4859 if (s->pid != pid) {
4860 *s = (ExecStatus) {
4861 .pid = pid,
4862 };
4863 }
b58b4116 4864
63983207 4865 dual_timestamp_get(&s->exit_timestamp);
9fb86720 4866
034c6ed7
LP
4867 s->code = code;
4868 s->status = status;
169c1bda 4869
6f765baf
LP
4870 if (context && context->utmp_id)
4871 (void) utmp_put_dead_process(context->utmp_id, pid, code, status);
9fb86720
LP
4872}
4873
6a1d4d9f
LP
4874void exec_status_reset(ExecStatus *s) {
4875 assert(s);
4876
4877 *s = (ExecStatus) {};
4878}
4879
34cf6c43 4880void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
4881 char buf[FORMAT_TIMESTAMP_MAX];
4882
4883 assert(s);
4884 assert(f);
4885
9fb86720
LP
4886 if (s->pid <= 0)
4887 return;
4888
4c940960
LP
4889 prefix = strempty(prefix);
4890
9fb86720 4891 fprintf(f,
ccd06097
ZJS
4892 "%sPID: "PID_FMT"\n",
4893 prefix, s->pid);
9fb86720 4894
af9d16e1 4895 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
4896 fprintf(f,
4897 "%sStart Timestamp: %s\n",
63983207 4898 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
9fb86720 4899
af9d16e1 4900 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
4901 fprintf(f,
4902 "%sExit Timestamp: %s\n"
4903 "%sExit Code: %s\n"
4904 "%sExit Status: %i\n",
63983207 4905 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
9fb86720
LP
4906 prefix, sigchld_code_to_string(s->code),
4907 prefix, s->status);
5cb5a6ff 4908}
44d8db9e 4909
34cf6c43 4910static char *exec_command_line(char **argv) {
44d8db9e
LP
4911 size_t k;
4912 char *n, *p, **a;
4913 bool first = true;
4914
9e2f7c11 4915 assert(argv);
44d8db9e 4916
9164977d 4917 k = 1;
9e2f7c11 4918 STRV_FOREACH(a, argv)
44d8db9e
LP
4919 k += strlen(*a)+3;
4920
5cd9cd35
LP
4921 n = new(char, k);
4922 if (!n)
44d8db9e
LP
4923 return NULL;
4924
4925 p = n;
9e2f7c11 4926 STRV_FOREACH(a, argv) {
44d8db9e
LP
4927
4928 if (!first)
4929 *(p++) = ' ';
4930 else
4931 first = false;
4932
4933 if (strpbrk(*a, WHITESPACE)) {
4934 *(p++) = '\'';
4935 p = stpcpy(p, *a);
4936 *(p++) = '\'';
4937 } else
4938 p = stpcpy(p, *a);
4939
4940 }
4941
9164977d
LP
4942 *p = 0;
4943
44d8db9e
LP
4944 /* FIXME: this doesn't really handle arguments that have
4945 * spaces and ticks in them */
4946
4947 return n;
4948}
4949
34cf6c43 4950static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 4951 _cleanup_free_ char *cmd = NULL;
4c940960 4952 const char *prefix2;
44d8db9e
LP
4953
4954 assert(c);
4955 assert(f);
4956
4c940960 4957 prefix = strempty(prefix);
63c372cb 4958 prefix2 = strjoina(prefix, "\t");
44d8db9e 4959
9e2f7c11 4960 cmd = exec_command_line(c->argv);
44d8db9e
LP
4961 fprintf(f,
4962 "%sCommand Line: %s\n",
4bbccb02 4963 prefix, cmd ? cmd : strerror_safe(ENOMEM));
44d8db9e 4964
9fb86720 4965 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
4966}
4967
4968void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4969 assert(f);
4970
4c940960 4971 prefix = strempty(prefix);
44d8db9e
LP
4972
4973 LIST_FOREACH(command, c, c)
4974 exec_command_dump(c, f, prefix);
4975}
94f04347 4976
a6a80b4f
LP
4977void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4978 ExecCommand *end;
4979
4980 assert(l);
4981 assert(e);
4982
4983 if (*l) {
35b8ca3a 4984 /* It's kind of important, that we keep the order here */
71fda00f
LP
4985 LIST_FIND_TAIL(command, *l, end);
4986 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
4987 } else
4988 *l = e;
4989}
4990
26fd040d
LP
4991int exec_command_set(ExecCommand *c, const char *path, ...) {
4992 va_list ap;
4993 char **l, *p;
4994
4995 assert(c);
4996 assert(path);
4997
4998 va_start(ap, path);
4999 l = strv_new_ap(path, ap);
5000 va_end(ap);
5001
5002 if (!l)
5003 return -ENOMEM;
5004
250a918d
LP
5005 p = strdup(path);
5006 if (!p) {
26fd040d
LP
5007 strv_free(l);
5008 return -ENOMEM;
5009 }
5010
6897dfe8 5011 free_and_replace(c->path, p);
26fd040d 5012
130d3d22 5013 return strv_free_and_replace(c->argv, l);
26fd040d
LP
5014}
5015
86b23b07 5016int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 5017 _cleanup_strv_free_ char **l = NULL;
86b23b07 5018 va_list ap;
86b23b07
JS
5019 int r;
5020
5021 assert(c);
5022 assert(path);
5023
5024 va_start(ap, path);
5025 l = strv_new_ap(path, ap);
5026 va_end(ap);
5027
5028 if (!l)
5029 return -ENOMEM;
5030
e287086b 5031 r = strv_extend_strv(&c->argv, l, false);
e63ff941 5032 if (r < 0)
86b23b07 5033 return r;
86b23b07
JS
5034
5035 return 0;
5036}
5037
e8a565cb
YW
5038static void *remove_tmpdir_thread(void *p) {
5039 _cleanup_free_ char *path = p;
86b23b07 5040
e8a565cb
YW
5041 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
5042 return NULL;
5043}
5044
5045static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
5046 int r;
5047
5048 if (!rt)
5049 return NULL;
5050
5051 if (rt->manager)
5052 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
5053
5054 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
5055 if (destroy && rt->tmp_dir) {
5056 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
5057
5058 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
5059 if (r < 0) {
5060 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
5061 free(rt->tmp_dir);
5062 }
5063
5064 rt->tmp_dir = NULL;
5065 }
613b411c 5066
e8a565cb
YW
5067 if (destroy && rt->var_tmp_dir) {
5068 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
5069
5070 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
5071 if (r < 0) {
5072 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
5073 free(rt->var_tmp_dir);
5074 }
5075
5076 rt->var_tmp_dir = NULL;
5077 }
5078
5079 rt->id = mfree(rt->id);
5080 rt->tmp_dir = mfree(rt->tmp_dir);
5081 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
5082 safe_close_pair(rt->netns_storage_socket);
5083 return mfree(rt);
5084}
5085
5086static void exec_runtime_freep(ExecRuntime **rt) {
da6bc6ed 5087 (void) exec_runtime_free(*rt, false);
e8a565cb
YW
5088}
5089
8e8009dc
LP
5090static int exec_runtime_allocate(ExecRuntime **ret) {
5091 ExecRuntime *n;
613b411c 5092
8e8009dc 5093 assert(ret);
613b411c 5094
8e8009dc
LP
5095 n = new(ExecRuntime, 1);
5096 if (!n)
613b411c
LP
5097 return -ENOMEM;
5098
8e8009dc
LP
5099 *n = (ExecRuntime) {
5100 .netns_storage_socket = { -1, -1 },
5101 };
5102
5103 *ret = n;
613b411c
LP
5104 return 0;
5105}
5106
e8a565cb
YW
5107static int exec_runtime_add(
5108 Manager *m,
5109 const char *id,
5110 const char *tmp_dir,
5111 const char *var_tmp_dir,
5112 const int netns_storage_socket[2],
5113 ExecRuntime **ret) {
5114
5115 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
5116 int r;
5117
e8a565cb 5118 assert(m);
613b411c
LP
5119 assert(id);
5120
e8a565cb
YW
5121 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
5122 if (r < 0)
5123 return r;
613b411c 5124
e8a565cb 5125 r = exec_runtime_allocate(&rt);
613b411c
LP
5126 if (r < 0)
5127 return r;
5128
e8a565cb
YW
5129 rt->id = strdup(id);
5130 if (!rt->id)
5131 return -ENOMEM;
5132
5133 if (tmp_dir) {
5134 rt->tmp_dir = strdup(tmp_dir);
5135 if (!rt->tmp_dir)
5136 return -ENOMEM;
5137
5138 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
5139 assert(var_tmp_dir);
5140 rt->var_tmp_dir = strdup(var_tmp_dir);
5141 if (!rt->var_tmp_dir)
5142 return -ENOMEM;
5143 }
5144
5145 if (netns_storage_socket) {
5146 rt->netns_storage_socket[0] = netns_storage_socket[0];
5147 rt->netns_storage_socket[1] = netns_storage_socket[1];
613b411c
LP
5148 }
5149
e8a565cb
YW
5150 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
5151 if (r < 0)
5152 return r;
5153
5154 rt->manager = m;
5155
5156 if (ret)
5157 *ret = rt;
5158
5159 /* do not remove created ExecRuntime object when the operation succeeds. */
5160 rt = NULL;
5161 return 0;
5162}
5163
5164static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
5165 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
2fa3742d 5166 _cleanup_close_pair_ int netns_storage_socket[2] = { -1, -1 };
e8a565cb
YW
5167 int r;
5168
5169 assert(m);
5170 assert(c);
5171 assert(id);
5172
5173 /* It is not necessary to create ExecRuntime object. */
a8d08f39 5174 if (!c->private_network && !c->private_tmp && !c->network_namespace_path)
e8a565cb
YW
5175 return 0;
5176
5177 if (c->private_tmp) {
5178 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
5179 if (r < 0)
5180 return r;
5181 }
5182
a8d08f39 5183 if (c->private_network || c->network_namespace_path) {
e8a565cb
YW
5184 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
5185 return -errno;
5186 }
5187
5188 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
5189 if (r < 0)
5190 return r;
5191
5192 /* Avoid cleanup */
2fa3742d 5193 netns_storage_socket[0] = netns_storage_socket[1] = -1;
613b411c
LP
5194 return 1;
5195}
5196
e8a565cb
YW
5197int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
5198 ExecRuntime *rt;
5199 int r;
613b411c 5200
e8a565cb
YW
5201 assert(m);
5202 assert(id);
5203 assert(ret);
5204
5205 rt = hashmap_get(m->exec_runtime_by_id, id);
5206 if (rt)
5207 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
5208 goto ref;
5209
5210 if (!create)
5211 return 0;
5212
5213 /* If not found, then create a new object. */
5214 r = exec_runtime_make(m, c, id, &rt);
5215 if (r <= 0)
5216 /* When r == 0, it is not necessary to create ExecRuntime object. */
5217 return r;
613b411c 5218
e8a565cb
YW
5219ref:
5220 /* increment reference counter. */
5221 rt->n_ref++;
5222 *ret = rt;
5223 return 1;
5224}
613b411c 5225
e8a565cb
YW
5226ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
5227 if (!rt)
613b411c
LP
5228 return NULL;
5229
e8a565cb 5230 assert(rt->n_ref > 0);
613b411c 5231
e8a565cb
YW
5232 rt->n_ref--;
5233 if (rt->n_ref > 0)
f2341e0a
LP
5234 return NULL;
5235
e8a565cb 5236 return exec_runtime_free(rt, destroy);
613b411c
LP
5237}
5238
e8a565cb
YW
5239int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
5240 ExecRuntime *rt;
5241 Iterator i;
5242
5243 assert(m);
613b411c
LP
5244 assert(f);
5245 assert(fds);
5246
e8a565cb
YW
5247 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5248 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 5249
e8a565cb
YW
5250 if (rt->tmp_dir)
5251 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 5252
e8a565cb
YW
5253 if (rt->var_tmp_dir)
5254 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 5255
e8a565cb
YW
5256 if (rt->netns_storage_socket[0] >= 0) {
5257 int copy;
613b411c 5258
e8a565cb
YW
5259 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
5260 if (copy < 0)
5261 return copy;
613b411c 5262
e8a565cb
YW
5263 fprintf(f, " netns-socket-0=%i", copy);
5264 }
613b411c 5265
e8a565cb
YW
5266 if (rt->netns_storage_socket[1] >= 0) {
5267 int copy;
613b411c 5268
e8a565cb
YW
5269 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
5270 if (copy < 0)
5271 return copy;
613b411c 5272
e8a565cb
YW
5273 fprintf(f, " netns-socket-1=%i", copy);
5274 }
5275
5276 fputc('\n', f);
613b411c
LP
5277 }
5278
5279 return 0;
5280}
5281
e8a565cb
YW
5282int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
5283 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
5284 ExecRuntime *rt;
613b411c
LP
5285 int r;
5286
e8a565cb
YW
5287 /* This is for the migration from old (v237 or earlier) deserialization text.
5288 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
5289 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
5290 * so or not from the serialized text, then we always creates a new object owned by this. */
5291
5292 assert(u);
613b411c
LP
5293 assert(key);
5294 assert(value);
5295
e8a565cb
YW
5296 /* Manager manages ExecRuntime objects by the unit id.
5297 * So, we omit the serialized text when the unit does not have id (yet?)... */
5298 if (isempty(u->id)) {
5299 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
5300 return 0;
5301 }
613b411c 5302
e8a565cb
YW
5303 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
5304 if (r < 0) {
5305 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
5306 return 0;
5307 }
5308
5309 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
5310 if (!rt) {
5311 r = exec_runtime_allocate(&rt_create);
613b411c 5312 if (r < 0)
f2341e0a 5313 return log_oom();
613b411c 5314
e8a565cb
YW
5315 rt_create->id = strdup(u->id);
5316 if (!rt_create->id)
5317 return log_oom();
5318
5319 rt = rt_create;
5320 }
5321
5322 if (streq(key, "tmp-dir")) {
5323 char *copy;
5324
613b411c
LP
5325 copy = strdup(value);
5326 if (!copy)
5327 return log_oom();
5328
e8a565cb 5329 free_and_replace(rt->tmp_dir, copy);
613b411c
LP
5330
5331 } else if (streq(key, "var-tmp-dir")) {
5332 char *copy;
5333
613b411c
LP
5334 copy = strdup(value);
5335 if (!copy)
5336 return log_oom();
5337
e8a565cb 5338 free_and_replace(rt->var_tmp_dir, copy);
613b411c
LP
5339
5340 } else if (streq(key, "netns-socket-0")) {
5341 int fd;
5342
e8a565cb 5343 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 5344 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 5345 return 0;
613b411c 5346 }
e8a565cb
YW
5347
5348 safe_close(rt->netns_storage_socket[0]);
5349 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
5350
613b411c
LP
5351 } else if (streq(key, "netns-socket-1")) {
5352 int fd;
5353
e8a565cb 5354 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 5355 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 5356 return 0;
613b411c 5357 }
e8a565cb
YW
5358
5359 safe_close(rt->netns_storage_socket[1]);
5360 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
613b411c
LP
5361 } else
5362 return 0;
5363
e8a565cb
YW
5364 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
5365 if (rt_create) {
5366 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
5367 if (r < 0) {
3fe91079 5368 log_unit_debug_errno(u, r, "Failed to put runtime parameter to manager's storage: %m");
e8a565cb
YW
5369 return 0;
5370 }
613b411c 5371
e8a565cb 5372 rt_create->manager = u->manager;
613b411c 5373
e8a565cb
YW
5374 /* Avoid cleanup */
5375 rt_create = NULL;
5376 }
98b47d54 5377
e8a565cb
YW
5378 return 1;
5379}
613b411c 5380
e8a565cb
YW
5381void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
5382 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
5383 int r, fd0 = -1, fd1 = -1;
5384 const char *p, *v = value;
5385 size_t n;
613b411c 5386
e8a565cb
YW
5387 assert(m);
5388 assert(value);
5389 assert(fds);
98b47d54 5390
e8a565cb
YW
5391 n = strcspn(v, " ");
5392 id = strndupa(v, n);
5393 if (v[n] != ' ')
5394 goto finalize;
5395 p = v + n + 1;
5396
5397 v = startswith(p, "tmp-dir=");
5398 if (v) {
5399 n = strcspn(v, " ");
5400 tmp_dir = strndupa(v, n);
5401 if (v[n] != ' ')
5402 goto finalize;
5403 p = v + n + 1;
5404 }
5405
5406 v = startswith(p, "var-tmp-dir=");
5407 if (v) {
5408 n = strcspn(v, " ");
5409 var_tmp_dir = strndupa(v, n);
5410 if (v[n] != ' ')
5411 goto finalize;
5412 p = v + n + 1;
5413 }
5414
5415 v = startswith(p, "netns-socket-0=");
5416 if (v) {
5417 char *buf;
5418
5419 n = strcspn(v, " ");
5420 buf = strndupa(v, n);
5421 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
5422 log_debug("Unable to process exec-runtime netns fd specification.");
5423 return;
98b47d54 5424 }
e8a565cb
YW
5425 fd0 = fdset_remove(fds, fd0);
5426 if (v[n] != ' ')
5427 goto finalize;
5428 p = v + n + 1;
613b411c
LP
5429 }
5430
e8a565cb
YW
5431 v = startswith(p, "netns-socket-1=");
5432 if (v) {
5433 char *buf;
98b47d54 5434
e8a565cb
YW
5435 n = strcspn(v, " ");
5436 buf = strndupa(v, n);
5437 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
5438 log_debug("Unable to process exec-runtime netns fd specification.");
5439 return;
98b47d54 5440 }
e8a565cb
YW
5441 fd1 = fdset_remove(fds, fd1);
5442 }
98b47d54 5443
e8a565cb
YW
5444finalize:
5445
5446 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
7d853ca6 5447 if (r < 0)
e8a565cb 5448 log_debug_errno(r, "Failed to add exec-runtime: %m");
e8a565cb 5449}
613b411c 5450
e8a565cb
YW
5451void exec_runtime_vacuum(Manager *m) {
5452 ExecRuntime *rt;
5453 Iterator i;
5454
5455 assert(m);
5456
5457 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
5458
5459 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
5460 if (rt->n_ref > 0)
5461 continue;
5462
5463 (void) exec_runtime_free(rt, false);
5464 }
613b411c
LP
5465}
5466
b9c04eaf
YW
5467void exec_params_clear(ExecParameters *p) {
5468 if (!p)
5469 return;
5470
5471 strv_free(p->environment);
5472}
5473
80876c20
LP
5474static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
5475 [EXEC_INPUT_NULL] = "null",
5476 [EXEC_INPUT_TTY] = "tty",
5477 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 5478 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
5479 [EXEC_INPUT_SOCKET] = "socket",
5480 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 5481 [EXEC_INPUT_DATA] = "data",
2038c3f5 5482 [EXEC_INPUT_FILE] = "file",
80876c20
LP
5483};
5484
8a0867d6
LP
5485DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
5486
94f04347 5487static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 5488 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 5489 [EXEC_OUTPUT_NULL] = "null",
80876c20 5490 [EXEC_OUTPUT_TTY] = "tty",
94f04347 5491 [EXEC_OUTPUT_SYSLOG] = "syslog",
28dbc1e8 5492 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
9a6bca7a 5493 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 5494 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
5495 [EXEC_OUTPUT_JOURNAL] = "journal",
5496 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
5497 [EXEC_OUTPUT_SOCKET] = "socket",
5498 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 5499 [EXEC_OUTPUT_FILE] = "file",
566b7d23 5500 [EXEC_OUTPUT_FILE_APPEND] = "append",
94f04347
LP
5501};
5502
5503DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
5504
5505static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
5506 [EXEC_UTMP_INIT] = "init",
5507 [EXEC_UTMP_LOGIN] = "login",
5508 [EXEC_UTMP_USER] = "user",
5509};
5510
5511DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
5512
5513static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
5514 [EXEC_PRESERVE_NO] = "no",
5515 [EXEC_PRESERVE_YES] = "yes",
5516 [EXEC_PRESERVE_RESTART] = "restart",
5517};
5518
5519DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 5520
6b7b2ed9 5521/* This table maps ExecDirectoryType to the setting it is configured with in the unit */
72fd1768 5522static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
5523 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
5524 [EXEC_DIRECTORY_STATE] = "StateDirectory",
5525 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
5526 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
5527 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5528};
5529
5530DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445 5531
6b7b2ed9
LP
5532/* And this table maps ExecDirectoryType too, but to a generic term identifying the type of resource. This
5533 * one is supposed to be generic enough to be used for unit types that don't use ExecContext and per-unit
5534 * directories, specifically .timer units with their timestamp touch file. */
5535static const char* const exec_resource_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5536 [EXEC_DIRECTORY_RUNTIME] = "runtime",
5537 [EXEC_DIRECTORY_STATE] = "state",
5538 [EXEC_DIRECTORY_CACHE] = "cache",
5539 [EXEC_DIRECTORY_LOGS] = "logs",
5540 [EXEC_DIRECTORY_CONFIGURATION] = "configuration",
5541};
5542
5543DEFINE_STRING_TABLE_LOOKUP(exec_resource_type, ExecDirectoryType);
5544
5545/* And this table also maps ExecDirectoryType, to the environment variable we pass the selected directory to
5546 * the service payload in. */
fb2042dd
YW
5547static const char* const exec_directory_env_name_table[_EXEC_DIRECTORY_TYPE_MAX] = {
5548 [EXEC_DIRECTORY_RUNTIME] = "RUNTIME_DIRECTORY",
5549 [EXEC_DIRECTORY_STATE] = "STATE_DIRECTORY",
5550 [EXEC_DIRECTORY_CACHE] = "CACHE_DIRECTORY",
5551 [EXEC_DIRECTORY_LOGS] = "LOGS_DIRECTORY",
5552 [EXEC_DIRECTORY_CONFIGURATION] = "CONFIGURATION_DIRECTORY",
5553};
5554
5555DEFINE_PRIVATE_STRING_TABLE_LOOKUP_TO_STRING(exec_directory_env_name, ExecDirectoryType);
5556
b1edf445
LP
5557static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5558 [EXEC_KEYRING_INHERIT] = "inherit",
5559 [EXEC_KEYRING_PRIVATE] = "private",
5560 [EXEC_KEYRING_SHARED] = "shared",
5561};
5562
5563DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);