]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/execute.c
tree-wide: drop license boilerplate
[thirdparty/systemd.git] / src / core / execute.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
a7334b09
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2010 Lennart Poettering
a7334b09
LP
6***/
7
034c6ed7
LP
8#include <errno.h>
9#include <fcntl.h>
8dd4c05b
LP
10#include <glob.h>
11#include <grp.h>
12#include <poll.h>
309bff19 13#include <signal.h>
8dd4c05b 14#include <string.h>
19c0b0b9 15#include <sys/capability.h>
d251207d 16#include <sys/eventfd.h>
f3e43635 17#include <sys/mman.h>
8dd4c05b 18#include <sys/personality.h>
94f04347 19#include <sys/prctl.h>
d2ffa389 20#include <sys/shm.h>
8dd4c05b 21#include <sys/socket.h>
451a074f 22#include <sys/stat.h>
d2ffa389 23#include <sys/types.h>
8dd4c05b
LP
24#include <sys/un.h>
25#include <unistd.h>
023a4f67 26#include <utmpx.h>
5cb5a6ff 27
349cc4a5 28#if HAVE_PAM
5b6319dc
LP
29#include <security/pam_appl.h>
30#endif
31
349cc4a5 32#if HAVE_SELINUX
7b52a628
MS
33#include <selinux/selinux.h>
34#endif
35
349cc4a5 36#if HAVE_SECCOMP
17df7223
LP
37#include <seccomp.h>
38#endif
39
349cc4a5 40#if HAVE_APPARMOR
eef65bf3
MS
41#include <sys/apparmor.h>
42#endif
43
24882e06 44#include "sd-messages.h"
8dd4c05b
LP
45
46#include "af-list.h"
b5efdb8a 47#include "alloc-util.h"
349cc4a5 48#if HAVE_APPARMOR
3ffd4af2
LP
49#include "apparmor-util.h"
50#endif
8dd4c05b
LP
51#include "async.h"
52#include "barrier.h"
8dd4c05b 53#include "cap-list.h"
430f0182 54#include "capability-util.h"
a1164ae3 55#include "chown-recursive.h"
da681e1b 56#include "cpu-set-util.h"
f6a6225e 57#include "def.h"
4d1a6904 58#include "env-util.h"
17df7223 59#include "errno-list.h"
3ffd4af2 60#include "execute.h"
8dd4c05b 61#include "exit-status.h"
3ffd4af2 62#include "fd-util.h"
8dd4c05b 63#include "fileio.h"
f97b34a6 64#include "format-util.h"
f4f15635 65#include "fs-util.h"
7d50b32a 66#include "glob-util.h"
c004493c 67#include "io-util.h"
8dd4c05b 68#include "ioprio.h"
a1164ae3 69#include "label.h"
8dd4c05b
LP
70#include "log.h"
71#include "macro.h"
e8a565cb 72#include "manager.h"
8dd4c05b
LP
73#include "missing.h"
74#include "mkdir.h"
75#include "namespace.h"
6bedfcbb 76#include "parse-util.h"
8dd4c05b 77#include "path-util.h"
0b452006 78#include "process-util.h"
78f22b97 79#include "rlimit-util.h"
8dd4c05b 80#include "rm-rf.h"
349cc4a5 81#if HAVE_SECCOMP
3ffd4af2
LP
82#include "seccomp-util.h"
83#endif
8dd4c05b 84#include "securebits.h"
07d46372 85#include "securebits-util.h"
8dd4c05b 86#include "selinux-util.h"
24882e06 87#include "signal-util.h"
8dd4c05b 88#include "smack-util.h"
fd63e712 89#include "special.h"
949befd3 90#include "stat-util.h"
8b43440b 91#include "string-table.h"
07630cea 92#include "string-util.h"
8dd4c05b 93#include "strv.h"
7ccbd1ae 94#include "syslog-util.h"
8dd4c05b
LP
95#include "terminal-util.h"
96#include "unit.h"
b1d4f8e1 97#include "user-util.h"
8dd4c05b
LP
98#include "util.h"
99#include "utmp-wtmp.h"
5cb5a6ff 100
e056b01d 101#define IDLE_TIMEOUT_USEC (5*USEC_PER_SEC)
31a7eb86 102#define IDLE_TIMEOUT2_USEC (1*USEC_PER_SEC)
e6a26745 103
02a51aba
LP
104/* This assumes there is a 'tty' group */
105#define TTY_MODE 0620
106
531dca78
LP
107#define SNDBUF_SIZE (8*1024*1024)
108
034c6ed7
LP
/* Moves the n_fds descriptors in fds[] so that entry k ends up at fd number k+3.
 * Note that this modifies the fds array in place. Returns 0 on success or a
 * negative errno value if duplicating a descriptor fails. */
static int shift_fds(int fds[], unsigned n_fds) {
        int first = 0;

        if (n_fds == 0)
                return 0;

        assert(fds);

        while (first >= 0) {
                int retry_at = -1, idx;

                for (idx = first; idx < (int) n_fds; idx++) {
                        int wanted = idx + 3, got;

                        /* Nothing to do if this one already sits at its slot */
                        if (fds[idx] == wanted)
                                continue;

                        got = fcntl(fds[idx], F_DUPFD, wanted);
                        if (got < 0)
                                return -errno;

                        safe_close(fds[idx]);
                        fds[idx] = got;

                        /* F_DUPFD handed us a different number than requested, i.e. the
                         * wanted slot is still taken. Remember the first such index so
                         * that another pass starts from there. */
                        if (got != wanted && retry_at < 0)
                                retry_at = idx;
                }

                /* A negative value means the last pass placed everything and ends the loop */
                first = retry_at;
        }

        return 0;
}
153
4c47affc
FB
/* Adjusts the descriptor flags of the n_storage_fds + n_socket_fds entries in fds[]:
 * O_NONBLOCK is set/cleared according to 'nonblock' on the first n_socket_fds entries
 * only, and FD_CLOEXEC is dropped from all of them. Returns 0 on success or the
 * negative error of the first helper that failed. */
static int flags_fds(const int fds[], unsigned n_storage_fds, unsigned n_socket_fds, bool nonblock) {
        unsigned total = n_storage_fds + n_socket_fds, k;

        if (total == 0)
                return 0;

        assert(fds);

        for (k = 0; k < total; k++) {
                int q;

                /* O_NONBLOCK only applies to socket activation fds */
                if (k < n_socket_fds) {
                        q = fd_nonblock(fds[k], nonblock);
                        if (q < 0)
                                return q;
                }

                /* FD_CLOEXEC is always turned off, as these fds are meant to be
                 * passed on to our children */
                q = fd_cloexec(fds[k], false);
                if (q < 0)
                        return q;
        }

        return 0;
}
186
1e22b5cd 187static const char *exec_context_tty_path(const ExecContext *context) {
80876c20
LP
188 assert(context);
189
1e22b5cd
LP
190 if (context->stdio_as_fds)
191 return NULL;
192
80876c20
LP
193 if (context->tty_path)
194 return context->tty_path;
195
196 return "/dev/console";
197}
198
1e22b5cd
LP
199static void exec_context_tty_reset(const ExecContext *context, const ExecParameters *p) {
200 const char *path;
201
6ea832a2
LP
202 assert(context);
203
1e22b5cd 204 path = exec_context_tty_path(context);
6ea832a2 205
1e22b5cd
LP
206 if (context->tty_vhangup) {
207 if (p && p->stdin_fd >= 0)
208 (void) terminal_vhangup_fd(p->stdin_fd);
209 else if (path)
210 (void) terminal_vhangup(path);
211 }
6ea832a2 212
1e22b5cd
LP
213 if (context->tty_reset) {
214 if (p && p->stdin_fd >= 0)
215 (void) reset_terminal_fd(p->stdin_fd, true);
216 else if (path)
217 (void) reset_terminal(path);
218 }
219
220 if (context->tty_vt_disallocate && path)
221 (void) vt_disallocate(path);
6ea832a2
LP
222}
223
6af760f3
LP
224static bool is_terminal_input(ExecInput i) {
225 return IN_SET(i,
226 EXEC_INPUT_TTY,
227 EXEC_INPUT_TTY_FORCE,
228 EXEC_INPUT_TTY_FAIL);
229}
230
3a1286b6 231static bool is_terminal_output(ExecOutput o) {
6af760f3
LP
232 return IN_SET(o,
233 EXEC_OUTPUT_TTY,
234 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
235 EXEC_OUTPUT_KMSG_AND_CONSOLE,
236 EXEC_OUTPUT_JOURNAL_AND_CONSOLE);
237}
238
aac8c0c3
LP
239static bool is_syslog_output(ExecOutput o) {
240 return IN_SET(o,
241 EXEC_OUTPUT_SYSLOG,
242 EXEC_OUTPUT_SYSLOG_AND_CONSOLE);
243}
244
245static bool is_kmsg_output(ExecOutput o) {
246 return IN_SET(o,
247 EXEC_OUTPUT_KMSG,
248 EXEC_OUTPUT_KMSG_AND_CONSOLE);
249}
250
6af760f3
LP
251static bool exec_context_needs_term(const ExecContext *c) {
252 assert(c);
253
254 /* Return true if the execution context suggests we should set $TERM to something useful. */
255
256 if (is_terminal_input(c->std_input))
257 return true;
258
259 if (is_terminal_output(c->std_output))
260 return true;
261
262 if (is_terminal_output(c->std_error))
263 return true;
264
265 return !!c->tty_path;
3a1286b6
MS
266}
267
/* Opens /dev/null with the given access flags (plus O_NOCTTY) and hands the resulting
 * fd to move_fd() so that it ends up as 'nfd'. Returns -errno if the open fails,
 * otherwise whatever move_fd() returns. */
static int open_null_as(int flags, int nfd) {
        int null_fd;

        assert(nfd >= 0);

        null_fd = open("/dev/null", flags|O_NOCTTY);

        return null_fd < 0 ? -errno : move_fd(null_fd, nfd, false);
}
279
524daa8c 280static int connect_journal_socket(int fd, uid_t uid, gid_t gid) {
92a17af9 281 static const union sockaddr_union sa = {
b92bea5d
ZJS
282 .un.sun_family = AF_UNIX,
283 .un.sun_path = "/run/systemd/journal/stdout",
284 };
524daa8c
ZJS
285 uid_t olduid = UID_INVALID;
286 gid_t oldgid = GID_INVALID;
287 int r;
288
cad93f29 289 if (gid_is_valid(gid)) {
524daa8c
ZJS
290 oldgid = getgid();
291
92a17af9 292 if (setegid(gid) < 0)
524daa8c
ZJS
293 return -errno;
294 }
295
cad93f29 296 if (uid_is_valid(uid)) {
524daa8c
ZJS
297 olduid = getuid();
298
92a17af9 299 if (seteuid(uid) < 0) {
524daa8c
ZJS
300 r = -errno;
301 goto restore_gid;
302 }
303 }
304
92a17af9 305 r = connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0 ? -errno : 0;
524daa8c
ZJS
306
307 /* If we fail to restore the uid or gid, things will likely
308 fail later on. This should only happen if an LSM interferes. */
309
cad93f29 310 if (uid_is_valid(uid))
524daa8c
ZJS
311 (void) seteuid(olduid);
312
313 restore_gid:
cad93f29 314 if (gid_is_valid(gid))
524daa8c
ZJS
315 (void) setegid(oldgid);
316
317 return r;
318}
319
fd1f9c89 320static int connect_logger_as(
34cf6c43 321 const Unit *unit,
fd1f9c89 322 const ExecContext *context,
af635cf3 323 const ExecParameters *params,
fd1f9c89
LP
324 ExecOutput output,
325 const char *ident,
fd1f9c89
LP
326 int nfd,
327 uid_t uid,
328 gid_t gid) {
329
524daa8c 330 int fd, r;
071830ff
LP
331
332 assert(context);
af635cf3 333 assert(params);
80876c20
LP
334 assert(output < _EXEC_OUTPUT_MAX);
335 assert(ident);
336 assert(nfd >= 0);
071830ff 337
54fe0cdb
LP
338 fd = socket(AF_UNIX, SOCK_STREAM, 0);
339 if (fd < 0)
80876c20 340 return -errno;
071830ff 341
524daa8c
ZJS
342 r = connect_journal_socket(fd, uid, gid);
343 if (r < 0)
344 return r;
071830ff 345
80876c20 346 if (shutdown(fd, SHUT_RD) < 0) {
03e334a1 347 safe_close(fd);
80876c20
LP
348 return -errno;
349 }
071830ff 350
fd1f9c89 351 (void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
531dca78 352
80876c20 353 dprintf(fd,
62bca2c6 354 "%s\n"
80876c20
LP
355 "%s\n"
356 "%i\n"
54fe0cdb
LP
357 "%i\n"
358 "%i\n"
359 "%i\n"
4f4a1dbf 360 "%i\n",
c867611e 361 context->syslog_identifier ?: ident,
af635cf3 362 params->flags & EXEC_PASS_LOG_UNIT ? unit->id : "",
54fe0cdb
LP
363 context->syslog_priority,
364 !!context->syslog_level_prefix,
aac8c0c3
LP
365 is_syslog_output(output),
366 is_kmsg_output(output),
3a1286b6 367 is_terminal_output(output));
80876c20 368
046a82c1 369 return move_fd(fd, nfd, false);
80876c20 370}
/* Opens the terminal at 'path' (with O_NOCTTY added to 'flags') and installs it as
 * fd 'nfd' via move_fd(). A negative result of open_terminal() is returned as-is. */
static int open_terminal_as(const char *path, int flags, int nfd) {
        int tty_fd;

        assert(path);
        assert(nfd >= 0);

        tty_fd = open_terminal(path, flags | O_NOCTTY);

        return tty_fd < 0 ? tty_fd : move_fd(tty_fd, nfd, false);
}
071830ff 383
2038c3f5
LP
384static int acquire_path(const char *path, int flags, mode_t mode) {
385 union sockaddr_union sa = {
386 .sa.sa_family = AF_UNIX,
387 };
80876c20 388 int fd, r;
071830ff 389
80876c20 390 assert(path);
071830ff 391
2038c3f5
LP
392 if (IN_SET(flags & O_ACCMODE, O_WRONLY, O_RDWR))
393 flags |= O_CREAT;
394
395 fd = open(path, flags|O_NOCTTY, mode);
396 if (fd >= 0)
80876c20 397 return fd;
071830ff 398
2038c3f5
LP
399 if (errno != ENXIO) /* ENXIO is returned when we try to open() an AF_UNIX file system socket on Linux */
400 return -errno;
401 if (strlen(path) > sizeof(sa.un.sun_path)) /* Too long, can't be a UNIX socket */
402 return -ENXIO;
403
404 /* So, it appears the specified path could be an AF_UNIX socket. Let's see if we can connect to it. */
405
406 fd = socket(AF_UNIX, SOCK_STREAM, 0);
407 if (fd < 0)
408 return -errno;
409
410 strncpy(sa.un.sun_path, path, sizeof(sa.un.sun_path));
411 if (connect(fd, &sa.sa, SOCKADDR_UN_LEN(sa.un)) < 0) {
03e334a1 412 safe_close(fd);
2038c3f5
LP
413 return errno == EINVAL ? -ENXIO : -errno; /* Propagate initial error if we get EINVAL, i.e. we have
414 * indication that his wasn't an AF_UNIX socket after all */
415 }
071830ff 416
2038c3f5
LP
417 if ((flags & O_ACCMODE) == O_RDONLY)
418 r = shutdown(fd, SHUT_WR);
419 else if ((flags & O_ACCMODE) == O_WRONLY)
420 r = shutdown(fd, SHUT_RD);
421 else
422 return fd;
423 if (r < 0) {
424 safe_close(fd);
425 return -errno;
426 }
427
428 return fd;
80876c20 429}
071830ff 430
08f3be7a
LP
431static int fixup_input(
432 const ExecContext *context,
433 int socket_fd,
434 bool apply_tty_stdin) {
435
436 ExecInput std_input;
437
438 assert(context);
439
440 std_input = context->std_input;
1e3ad081
LP
441
442 if (is_terminal_input(std_input) && !apply_tty_stdin)
443 return EXEC_INPUT_NULL;
071830ff 444
03fd9c49 445 if (std_input == EXEC_INPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
446 return EXEC_INPUT_NULL;
447
08f3be7a
LP
448 if (std_input == EXEC_INPUT_DATA && context->stdin_data_size == 0)
449 return EXEC_INPUT_NULL;
450
03fd9c49 451 return std_input;
4f2d528d
LP
452}
453
03fd9c49 454static int fixup_output(ExecOutput std_output, int socket_fd) {
4f2d528d 455
03fd9c49 456 if (std_output == EXEC_OUTPUT_SOCKET && socket_fd < 0)
4f2d528d
LP
457 return EXEC_OUTPUT_INHERIT;
458
03fd9c49 459 return std_output;
4f2d528d
LP
460}
461
a34ceba6
LP
462static int setup_input(
463 const ExecContext *context,
464 const ExecParameters *params,
52c239d7
LB
465 int socket_fd,
466 int named_iofds[3]) {
a34ceba6 467
4f2d528d
LP
468 ExecInput i;
469
470 assert(context);
a34ceba6
LP
471 assert(params);
472
473 if (params->stdin_fd >= 0) {
474 if (dup2(params->stdin_fd, STDIN_FILENO) < 0)
475 return -errno;
476
477 /* Try to make this the controlling tty, if it is a tty, and reset it */
1fb0682e
LP
478 if (isatty(STDIN_FILENO)) {
479 (void) ioctl(STDIN_FILENO, TIOCSCTTY, context->std_input == EXEC_INPUT_TTY_FORCE);
480 (void) reset_terminal_fd(STDIN_FILENO, true);
481 }
a34ceba6
LP
482
483 return STDIN_FILENO;
484 }
4f2d528d 485
08f3be7a 486 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
4f2d528d
LP
487
488 switch (i) {
071830ff 489
80876c20
LP
490 case EXEC_INPUT_NULL:
491 return open_null_as(O_RDONLY, STDIN_FILENO);
492
493 case EXEC_INPUT_TTY:
494 case EXEC_INPUT_TTY_FORCE:
495 case EXEC_INPUT_TTY_FAIL: {
046a82c1 496 int fd;
071830ff 497
1e22b5cd 498 fd = acquire_terminal(exec_context_tty_path(context),
8854d795
LP
499 i == EXEC_INPUT_TTY_FAIL ? ACQUIRE_TERMINAL_TRY :
500 i == EXEC_INPUT_TTY_FORCE ? ACQUIRE_TERMINAL_FORCE :
501 ACQUIRE_TERMINAL_WAIT,
3a43da28 502 USEC_INFINITY);
970edce6 503 if (fd < 0)
80876c20
LP
504 return fd;
505
046a82c1 506 return move_fd(fd, STDIN_FILENO, false);
80876c20
LP
507 }
508
4f2d528d 509 case EXEC_INPUT_SOCKET:
e75a9ed1
LP
510 assert(socket_fd >= 0);
511
4f2d528d
LP
512 return dup2(socket_fd, STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
513
52c239d7 514 case EXEC_INPUT_NAMED_FD:
e75a9ed1
LP
515 assert(named_iofds[STDIN_FILENO] >= 0);
516
52c239d7
LB
517 (void) fd_nonblock(named_iofds[STDIN_FILENO], false);
518 return dup2(named_iofds[STDIN_FILENO], STDIN_FILENO) < 0 ? -errno : STDIN_FILENO;
519
08f3be7a
LP
520 case EXEC_INPUT_DATA: {
521 int fd;
522
523 fd = acquire_data_fd(context->stdin_data, context->stdin_data_size, 0);
524 if (fd < 0)
525 return fd;
526
527 return move_fd(fd, STDIN_FILENO, false);
528 }
529
2038c3f5
LP
530 case EXEC_INPUT_FILE: {
531 bool rw;
532 int fd;
533
534 assert(context->stdio_file[STDIN_FILENO]);
535
536 rw = (context->std_output == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDOUT_FILENO])) ||
537 (context->std_error == EXEC_OUTPUT_FILE && streq_ptr(context->stdio_file[STDIN_FILENO], context->stdio_file[STDERR_FILENO]));
538
539 fd = acquire_path(context->stdio_file[STDIN_FILENO], rw ? O_RDWR : O_RDONLY, 0666 & ~context->umask);
540 if (fd < 0)
541 return fd;
542
543 return move_fd(fd, STDIN_FILENO, false);
544 }
545
80876c20
LP
546 default:
547 assert_not_reached("Unknown input type");
548 }
549}
550
a34ceba6 551static int setup_output(
34cf6c43 552 const Unit *unit,
a34ceba6
LP
553 const ExecContext *context,
554 const ExecParameters *params,
555 int fileno,
556 int socket_fd,
52c239d7 557 int named_iofds[3],
a34ceba6 558 const char *ident,
7bce046b
LP
559 uid_t uid,
560 gid_t gid,
561 dev_t *journal_stream_dev,
562 ino_t *journal_stream_ino) {
a34ceba6 563
4f2d528d
LP
564 ExecOutput o;
565 ExecInput i;
47c1d80d 566 int r;
4f2d528d 567
f2341e0a 568 assert(unit);
80876c20 569 assert(context);
a34ceba6 570 assert(params);
80876c20 571 assert(ident);
7bce046b
LP
572 assert(journal_stream_dev);
573 assert(journal_stream_ino);
80876c20 574
a34ceba6
LP
575 if (fileno == STDOUT_FILENO && params->stdout_fd >= 0) {
576
577 if (dup2(params->stdout_fd, STDOUT_FILENO) < 0)
578 return -errno;
579
580 return STDOUT_FILENO;
581 }
582
583 if (fileno == STDERR_FILENO && params->stderr_fd >= 0) {
584 if (dup2(params->stderr_fd, STDERR_FILENO) < 0)
585 return -errno;
586
587 return STDERR_FILENO;
588 }
589
08f3be7a 590 i = fixup_input(context, socket_fd, params->flags & EXEC_APPLY_TTY_STDIN);
03fd9c49 591 o = fixup_output(context->std_output, socket_fd);
4f2d528d 592
eb17e935
MS
593 if (fileno == STDERR_FILENO) {
594 ExecOutput e;
595 e = fixup_output(context->std_error, socket_fd);
80876c20 596
eb17e935
MS
597 /* This expects the input and output are already set up */
598
599 /* Don't change the stderr file descriptor if we inherit all
600 * the way and are not on a tty */
601 if (e == EXEC_OUTPUT_INHERIT &&
602 o == EXEC_OUTPUT_INHERIT &&
603 i == EXEC_INPUT_NULL &&
604 !is_terminal_input(context->std_input) &&
605 getppid () != 1)
606 return fileno;
607
608 /* Duplicate from stdout if possible */
52c239d7 609 if ((e == o && e != EXEC_OUTPUT_NAMED_FD) || e == EXEC_OUTPUT_INHERIT)
eb17e935 610 return dup2(STDOUT_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 611
eb17e935 612 o = e;
80876c20 613
eb17e935 614 } else if (o == EXEC_OUTPUT_INHERIT) {
21d21ea4
LP
615 /* If input got downgraded, inherit the original value */
616 if (i == EXEC_INPUT_NULL && is_terminal_input(context->std_input))
1e22b5cd 617 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
21d21ea4 618
08f3be7a
LP
619 /* If the input is connected to anything that's not a /dev/null or a data fd, inherit that... */
620 if (!IN_SET(i, EXEC_INPUT_NULL, EXEC_INPUT_DATA))
eb17e935 621 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
071830ff 622
acb591e4
LP
623 /* If we are not started from PID 1 we just inherit STDOUT from our parent process. */
624 if (getppid() != 1)
eb17e935 625 return fileno;
94f04347 626
eb17e935
MS
627 /* We need to open /dev/null here anew, to get the right access mode. */
628 return open_null_as(O_WRONLY, fileno);
071830ff 629 }
94f04347 630
eb17e935 631 switch (o) {
80876c20
LP
632
633 case EXEC_OUTPUT_NULL:
eb17e935 634 return open_null_as(O_WRONLY, fileno);
80876c20
LP
635
636 case EXEC_OUTPUT_TTY:
4f2d528d 637 if (is_terminal_input(i))
eb17e935 638 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
80876c20
LP
639
640 /* We don't reset the terminal if this is just about output */
1e22b5cd 641 return open_terminal_as(exec_context_tty_path(context), O_WRONLY, fileno);
80876c20
LP
642
643 case EXEC_OUTPUT_SYSLOG:
28dbc1e8 644 case EXEC_OUTPUT_SYSLOG_AND_CONSOLE:
9a6bca7a 645 case EXEC_OUTPUT_KMSG:
28dbc1e8 646 case EXEC_OUTPUT_KMSG_AND_CONSOLE:
706343f4
LP
647 case EXEC_OUTPUT_JOURNAL:
648 case EXEC_OUTPUT_JOURNAL_AND_CONSOLE:
af635cf3 649 r = connect_logger_as(unit, context, params, o, ident, fileno, uid, gid);
47c1d80d 650 if (r < 0) {
82677ae4 651 log_unit_warning_errno(unit, r, "Failed to connect %s to the journal socket, ignoring: %m", fileno == STDOUT_FILENO ? "stdout" : "stderr");
eb17e935 652 r = open_null_as(O_WRONLY, fileno);
7bce046b
LP
653 } else {
654 struct stat st;
655
656 /* If we connected this fd to the journal via a stream, patch the device/inode into the passed
657 * parameters, but only then. This is useful so that we can set $JOURNAL_STREAM that permits
ab2116b1
LP
658 * services to detect whether they are connected to the journal or not.
659 *
660 * If both stdout and stderr are connected to a stream then let's make sure to store the data
661 * about STDERR as that's usually the best way to do logging. */
7bce046b 662
ab2116b1
LP
663 if (fstat(fileno, &st) >= 0 &&
664 (*journal_stream_ino == 0 || fileno == STDERR_FILENO)) {
7bce046b
LP
665 *journal_stream_dev = st.st_dev;
666 *journal_stream_ino = st.st_ino;
667 }
47c1d80d
MS
668 }
669 return r;
4f2d528d
LP
670
671 case EXEC_OUTPUT_SOCKET:
672 assert(socket_fd >= 0);
e75a9ed1 673
eb17e935 674 return dup2(socket_fd, fileno) < 0 ? -errno : fileno;
94f04347 675
52c239d7 676 case EXEC_OUTPUT_NAMED_FD:
e75a9ed1
LP
677 assert(named_iofds[fileno] >= 0);
678
52c239d7
LB
679 (void) fd_nonblock(named_iofds[fileno], false);
680 return dup2(named_iofds[fileno], fileno) < 0 ? -errno : fileno;
681
2038c3f5
LP
682 case EXEC_OUTPUT_FILE: {
683 bool rw;
684 int fd;
685
686 assert(context->stdio_file[fileno]);
687
688 rw = context->std_input == EXEC_INPUT_FILE &&
689 streq_ptr(context->stdio_file[fileno], context->stdio_file[STDIN_FILENO]);
690
691 if (rw)
692 return dup2(STDIN_FILENO, fileno) < 0 ? -errno : fileno;
693
694 fd = acquire_path(context->stdio_file[fileno], O_WRONLY, 0666 & ~context->umask);
695 if (fd < 0)
696 return fd;
697
698 return move_fd(fd, fileno, false);
699 }
700
94f04347 701 default:
80876c20 702 assert_not_reached("Unknown error type");
94f04347 703 }
071830ff
LP
704}
705
02a51aba
LP
706static int chown_terminal(int fd, uid_t uid) {
707 struct stat st;
708
709 assert(fd >= 0);
02a51aba 710
1ff74fb6
LP
711 /* Before we chown/chmod the TTY, let's ensure this is actually a tty */
712 if (isatty(fd) < 1)
713 return 0;
714
02a51aba 715 /* This might fail. What matters are the results. */
bab45044
LP
716 (void) fchown(fd, uid, -1);
717 (void) fchmod(fd, TTY_MODE);
02a51aba
LP
718
719 if (fstat(fd, &st) < 0)
720 return -errno;
721
d8b4e2e9 722 if (st.st_uid != uid || (st.st_mode & 0777) != TTY_MODE)
02a51aba
LP
723 return -EPERM;
724
725 return 0;
726}
727
7d5ceb64 728static int setup_confirm_stdio(const char *vc, int *_saved_stdin, int *_saved_stdout) {
3d18b167
LP
729 _cleanup_close_ int fd = -1, saved_stdin = -1, saved_stdout = -1;
730 int r;
80876c20 731
80876c20
LP
732 assert(_saved_stdin);
733 assert(_saved_stdout);
734
af6da548
LP
735 saved_stdin = fcntl(STDIN_FILENO, F_DUPFD, 3);
736 if (saved_stdin < 0)
737 return -errno;
80876c20 738
af6da548 739 saved_stdout = fcntl(STDOUT_FILENO, F_DUPFD, 3);
3d18b167
LP
740 if (saved_stdout < 0)
741 return -errno;
80876c20 742
8854d795 743 fd = acquire_terminal(vc, ACQUIRE_TERMINAL_WAIT, DEFAULT_CONFIRM_USEC);
3d18b167
LP
744 if (fd < 0)
745 return fd;
80876c20 746
af6da548
LP
747 r = chown_terminal(fd, getuid());
748 if (r < 0)
3d18b167 749 return r;
02a51aba 750
3d18b167
LP
751 r = reset_terminal_fd(fd, true);
752 if (r < 0)
753 return r;
80876c20 754
2b33ab09 755 r = rearrange_stdio(fd, fd, STDERR_FILENO);
3d18b167 756 fd = -1;
2b33ab09
LP
757 if (r < 0)
758 return r;
80876c20
LP
759
760 *_saved_stdin = saved_stdin;
761 *_saved_stdout = saved_stdout;
762
3d18b167 763 saved_stdin = saved_stdout = -1;
80876c20 764
3d18b167 765 return 0;
80876c20
LP
766}
767
63d77c92 768static void write_confirm_error_fd(int err, int fd, const Unit *u) {
3b20f877
FB
769 assert(err < 0);
770
771 if (err == -ETIMEDOUT)
63d77c92 772 dprintf(fd, "Confirmation question timed out for %s, assuming positive response.\n", u->id);
3b20f877
FB
773 else {
774 errno = -err;
63d77c92 775 dprintf(fd, "Couldn't ask confirmation for %s: %m, assuming positive response.\n", u->id);
3b20f877
FB
776 }
777}
778
63d77c92 779static void write_confirm_error(int err, const char *vc, const Unit *u) {
03e334a1 780 _cleanup_close_ int fd = -1;
80876c20 781
3b20f877 782 assert(vc);
80876c20 783
7d5ceb64 784 fd = open_terminal(vc, O_WRONLY|O_NOCTTY|O_CLOEXEC);
af6da548 785 if (fd < 0)
3b20f877 786 return;
80876c20 787
63d77c92 788 write_confirm_error_fd(err, fd, u);
af6da548 789}
80876c20 790
/* Releases the terminal and puts the saved stdin/stdout duplicates back in place
 * (each only if the respective saved fd is valid). Both saved fds are closed and the
 * variables updated with safe_close()'s return value. Returns 0 on success, or the
 * negative errno of the last dup2() that failed. */
static int restore_confirm_stdio(int *saved_stdin, int *saved_stdout) {
        int ret = 0;

        assert(saved_stdin);
        assert(saved_stdout);

        release_terminal();

        if (*saved_stdin >= 0 && dup2(*saved_stdin, STDIN_FILENO) < 0)
                ret = -errno;

        if (*saved_stdout >= 0 && dup2(*saved_stdout, STDOUT_FILENO) < 0)
                ret = -errno;

        *saved_stdin = safe_close(*saved_stdin);
        *saved_stdout = safe_close(*saved_stdout);

        return ret;
}
812
3b20f877
FB
813enum {
814 CONFIRM_PRETEND_FAILURE = -1,
815 CONFIRM_PRETEND_SUCCESS = 0,
816 CONFIRM_EXECUTE = 1,
817};
818
eedf223a 819static int ask_for_confirmation(const char *vc, Unit *u, const char *cmdline) {
af6da548 820 int saved_stdout = -1, saved_stdin = -1, r;
2bcd3c26 821 _cleanup_free_ char *e = NULL;
3b20f877 822 char c;
af6da548 823
3b20f877 824 /* For any internal errors, assume a positive response. */
7d5ceb64 825 r = setup_confirm_stdio(vc, &saved_stdin, &saved_stdout);
3b20f877 826 if (r < 0) {
63d77c92 827 write_confirm_error(r, vc, u);
3b20f877
FB
828 return CONFIRM_EXECUTE;
829 }
af6da548 830
b0eb2944
FB
831 /* confirm_spawn might have been disabled while we were sleeping. */
832 if (manager_is_confirm_spawn_disabled(u->manager)) {
833 r = 1;
834 goto restore_stdio;
835 }
af6da548 836
2bcd3c26
FB
837 e = ellipsize(cmdline, 60, 100);
838 if (!e) {
839 log_oom();
840 r = CONFIRM_EXECUTE;
841 goto restore_stdio;
842 }
af6da548 843
d172b175 844 for (;;) {
539622bd 845 r = ask_char(&c, "yfshiDjcn", "Execute %s? [y, f, s – h for help] ", e);
d172b175 846 if (r < 0) {
63d77c92 847 write_confirm_error_fd(r, STDOUT_FILENO, u);
d172b175
FB
848 r = CONFIRM_EXECUTE;
849 goto restore_stdio;
850 }
af6da548 851
d172b175 852 switch (c) {
b0eb2944
FB
853 case 'c':
854 printf("Resuming normal execution.\n");
855 manager_disable_confirm_spawn();
856 r = 1;
857 break;
dd6f9ac0
FB
858 case 'D':
859 unit_dump(u, stdout, " ");
860 continue; /* ask again */
d172b175
FB
861 case 'f':
862 printf("Failing execution.\n");
863 r = CONFIRM_PRETEND_FAILURE;
864 break;
865 case 'h':
b0eb2944
FB
866 printf(" c - continue, proceed without asking anymore\n"
867 " D - dump, show the state of the unit\n"
dd6f9ac0 868 " f - fail, don't execute the command and pretend it failed\n"
d172b175 869 " h - help\n"
eedf223a 870 " i - info, show a short summary of the unit\n"
56fde33a 871 " j - jobs, show jobs that are in progress\n"
d172b175
FB
872 " s - skip, don't execute the command and pretend it succeeded\n"
873 " y - yes, execute the command\n");
dd6f9ac0 874 continue; /* ask again */
eedf223a
FB
875 case 'i':
876 printf(" Description: %s\n"
877 " Unit: %s\n"
878 " Command: %s\n",
879 u->id, u->description, cmdline);
880 continue; /* ask again */
56fde33a
FB
881 case 'j':
882 manager_dump_jobs(u->manager, stdout, " ");
883 continue; /* ask again */
539622bd
FB
884 case 'n':
885 /* 'n' was removed in favor of 'f'. */
886 printf("Didn't understand 'n', did you mean 'f'?\n");
887 continue; /* ask again */
d172b175
FB
888 case 's':
889 printf("Skipping execution.\n");
890 r = CONFIRM_PRETEND_SUCCESS;
891 break;
892 case 'y':
893 r = CONFIRM_EXECUTE;
894 break;
895 default:
896 assert_not_reached("Unhandled choice");
897 }
3b20f877 898 break;
3b20f877 899 }
af6da548 900
3b20f877 901restore_stdio:
af6da548 902 restore_confirm_stdio(&saved_stdin, &saved_stdout);
af6da548 903 return r;
80876c20
LP
904}
905
4d885bd3
DH
906static int get_fixed_user(const ExecContext *c, const char **user,
907 uid_t *uid, gid_t *gid,
908 const char **home, const char **shell) {
81a2b7ce 909 int r;
4d885bd3 910 const char *name;
81a2b7ce 911
4d885bd3 912 assert(c);
81a2b7ce 913
23deef88
LP
914 if (!c->user)
915 return 0;
916
4d885bd3
DH
917 /* Note that we don't set $HOME or $SHELL if they are not particularly enlightening anyway
918 * (i.e. are "/" or "/bin/nologin"). */
81a2b7ce 919
23deef88 920 name = c->user;
4d885bd3
DH
921 r = get_user_creds_clean(&name, uid, gid, home, shell);
922 if (r < 0)
923 return r;
81a2b7ce 924
4d885bd3
DH
925 *user = name;
926 return 0;
927}
928
929static int get_fixed_group(const ExecContext *c, const char **group, gid_t *gid) {
930 int r;
931 const char *name;
932
933 assert(c);
934
935 if (!c->group)
936 return 0;
937
938 name = c->group;
939 r = get_group_creds(&name, gid);
940 if (r < 0)
941 return r;
942
943 *group = name;
944 return 0;
945}
946
cdc5d5c5
DH
947static int get_supplementary_groups(const ExecContext *c, const char *user,
948 const char *group, gid_t gid,
949 gid_t **supplementary_gids, int *ngids) {
4d885bd3
DH
950 char **i;
951 int r, k = 0;
952 int ngroups_max;
953 bool keep_groups = false;
954 gid_t *groups = NULL;
955 _cleanup_free_ gid_t *l_gids = NULL;
956
957 assert(c);
958
bbeea271
DH
959 /*
960 * If user is given, then lookup GID and supplementary groups list.
961 * We avoid NSS lookups for gid=0. Also we have to initialize groups
cdc5d5c5
DH
962 * here and as early as possible so we keep the list of supplementary
963 * groups of the caller.
bbeea271
DH
964 */
965 if (user && gid_is_valid(gid) && gid != 0) {
966 /* First step, initialize groups from /etc/groups */
967 if (initgroups(user, gid) < 0)
968 return -errno;
969
970 keep_groups = true;
971 }
972
ac6e8be6 973 if (strv_isempty(c->supplementary_groups))
4d885bd3
DH
974 return 0;
975
366ddd25
DH
976 /*
977 * If SupplementaryGroups= was passed then NGROUPS_MAX has to
978 * be positive, otherwise fail.
979 */
980 errno = 0;
981 ngroups_max = (int) sysconf(_SC_NGROUPS_MAX);
982 if (ngroups_max <= 0) {
983 if (errno > 0)
984 return -errno;
985 else
986 return -EOPNOTSUPP; /* For all other values */
987 }
988
4d885bd3
DH
989 l_gids = new(gid_t, ngroups_max);
990 if (!l_gids)
991 return -ENOMEM;
81a2b7ce 992
4d885bd3
DH
993 if (keep_groups) {
994 /*
995 * Lookup the list of groups that the user belongs to, we
996 * avoid NSS lookups here too for gid=0.
997 */
998 k = ngroups_max;
999 if (getgrouplist(user, gid, l_gids, &k) < 0)
1000 return -EINVAL;
1001 } else
1002 k = 0;
81a2b7ce 1003
4d885bd3
DH
1004 STRV_FOREACH(i, c->supplementary_groups) {
1005 const char *g;
81a2b7ce 1006
4d885bd3
DH
1007 if (k >= ngroups_max)
1008 return -E2BIG;
81a2b7ce 1009
4d885bd3
DH
1010 g = *i;
1011 r = get_group_creds(&g, l_gids+k);
1012 if (r < 0)
1013 return r;
81a2b7ce 1014
4d885bd3
DH
1015 k++;
1016 }
81a2b7ce 1017
4d885bd3
DH
1018 /*
1019 * Sets ngids to zero to drop all supplementary groups, happens
1020 * when we are under root and SupplementaryGroups= is empty.
1021 */
1022 if (k == 0) {
1023 *ngids = 0;
1024 return 0;
1025 }
81a2b7ce 1026
4d885bd3
DH
1027 /* Otherwise get the final list of supplementary groups */
1028 groups = memdup(l_gids, sizeof(gid_t) * k);
1029 if (!groups)
1030 return -ENOMEM;
1031
1032 *supplementary_gids = groups;
1033 *ngids = k;
1034
1035 groups = NULL;
1036
1037 return 0;
1038}
1039
/* Applies the group credentials: installs the supplementary group list if it is
 * non-empty, then sets the real, effective and saved GID to 'gid' if that is valid.
 * Returns 0 on success or a negative errno-style value. */
static int enforce_groups(gid_t gid, const gid_t *supplementary_gids, int ngids) {

        /* SupplementaryGroups= is only handled if it is not empty */
        if (ngids > 0) {
                int q;

                q = maybe_setgroups(ngids, supplementary_gids);
                if (q < 0)
                        return q;
        }

        /* Then set our gids */
        if (gid_is_valid(gid) && setresgid(gid, gid, gid) < 0)
                return -errno;

        return 0;
}
1058
1059static int enforce_user(const ExecContext *context, uid_t uid) {
81a2b7ce
LP
1060 assert(context);
1061
4d885bd3
DH
1062 if (!uid_is_valid(uid))
1063 return 0;
1064
479050b3 1065 /* Sets (but doesn't look up) the uid and make sure we keep the
81a2b7ce
LP
1066 * capabilities while doing so. */
1067
479050b3 1068 if (context->capability_ambient_set != 0) {
81a2b7ce
LP
1069
1070 /* First step: If we need to keep capabilities but
1071 * drop privileges we need to make sure we keep our
cbb21cca 1072 * caps, while we drop privileges. */
693ced48 1073 if (uid != 0) {
cbb21cca 1074 int sb = context->secure_bits | 1<<SECURE_KEEP_CAPS;
693ced48
LP
1075
1076 if (prctl(PR_GET_SECUREBITS) != sb)
1077 if (prctl(PR_SET_SECUREBITS, sb) < 0)
1078 return -errno;
1079 }
81a2b7ce
LP
1080 }
1081
479050b3 1082 /* Second step: actually set the uids */
81a2b7ce
LP
1083 if (setresuid(uid, uid, uid) < 0)
1084 return -errno;
1085
1086 /* At this point we should have all necessary capabilities but
1087 are otherwise a normal user. However, the caps might got
1088 corrupted due to the setresuid() so we need clean them up
1089 later. This is done outside of this call. */
1090
1091 return 0;
1092}
1093
349cc4a5 1094#if HAVE_PAM
5b6319dc
LP
1095
1096static int null_conv(
1097 int num_msg,
1098 const struct pam_message **msg,
1099 struct pam_response **resp,
1100 void *appdata_ptr) {
1101
1102 /* We don't support conversations */
1103
1104 return PAM_CONV_ERR;
1105}
1106
cefc33ae
LP
1107#endif
1108
5b6319dc
LP
/* Opens a PAM session for the service and forks off a helper process that closes it again later.
 *
 * name  - PAM service name, passed to pam_start()
 * user  - user name, passed to pam_start()
 * uid   - UID the helper child switches to before waiting
 * gid   - GID the helper child switches to before waiting
 * tty   - if non-NULL, set as PAM_TTY on the handle
 * env   - environment block; every entry is pushed into PAM with pam_putenv(), and on success *env is freed
 *         and replaced by the list returned by pam_getenvlist()
 * fds/n_fds - file descriptors that are closed inside the helper child
 *
 * Returns 0 on success. On a PAM failure -EPERM is returned (PAM codes do not map to errno); if barrier creation
 * or the fork fails, the negative error of that call is returned. On failure *env is left untouched. When built
 * without HAVE_PAM the function does nothing and returns 0. */
static int setup_pam(
                const char *name,
                const char *user,
                uid_t uid,
                gid_t gid,
                const char *tty,
                char ***env,
                int fds[], unsigned n_fds) {

#if HAVE_PAM

        static const struct pam_conv conv = {
                .conv = null_conv,
                .appdata_ptr = NULL
        };

        _cleanup_(barrier_destroy) Barrier barrier = BARRIER_NULL;
        pam_handle_t *handle = NULL;
        sigset_t old_ss;
        int pam_code = PAM_SUCCESS, r;
        char **nv, **e = NULL;
        bool close_session = false;
        pid_t pam_pid = 0, parent_pid;
        int flags = 0;

        assert(name);
        assert(user);
        assert(env);

        /* We set up PAM in the parent process, then fork. The child
         * will then stay around until killed via PR_SET_PDEATHSIG or
         * systemd via the cgroup logic. It will then remove the PAM
         * session again. The parent process will exec() the actual
         * daemon. We do things this way to ensure that the main PID
         * of the daemon is the one we initially fork()ed. */

        r = barrier_create(&barrier);
        if (r < 0)
                goto fail;

        /* Keep PAM quiet unless debug logging is enabled */
        if (log_get_max_level() < LOG_DEBUG)
                flags |= PAM_SILENT;

        pam_code = pam_start(name, user, &conv, &handle);
        if (pam_code != PAM_SUCCESS) {
                handle = NULL;
                goto fail;
        }

        if (tty) {
                pam_code = pam_set_item(handle, PAM_TTY, tty);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        STRV_FOREACH(nv, *env) {
                pam_code = pam_putenv(handle, *nv);
                if (pam_code != PAM_SUCCESS)
                        goto fail;
        }

        pam_code = pam_acct_mgmt(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        pam_code = pam_open_session(handle, flags);
        if (pam_code != PAM_SUCCESS)
                goto fail;

        /* From here on the failure path has to close the session again */
        close_session = true;

        e = pam_getenvlist(handle);
        if (!e) {
                pam_code = PAM_BUF_ERR;
                goto fail;
        }

        /* Block SIGTERM, so that we know that it won't get lost in
         * the child */

        assert_se(sigprocmask_many(SIG_BLOCK, &old_ss, SIGTERM, -1) >= 0);

        parent_pid = getpid_cached();

        r = safe_fork("(sd-pam)", 0, &pam_pid);
        if (r < 0) {
                /* No child exists that could rely on SIGTERM being blocked, hence hand the caller back the
                 * signal mask it had when it called us instead of leaking the blocked SIGTERM. */
                assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);
                goto fail;
        }
        if (r == 0) {
                int sig, ret = EXIT_PAM;

                /* The child's job is to reset the PAM session on
                 * termination */
                barrier_set_role(&barrier, BARRIER_CHILD);

                /* Make sure we don't keep open the passed fds in this child. We assume that otherwise only those fds
                 * are open here that have been opened by PAM. */
                (void) close_many(fds, n_fds);

                /* Drop privileges - we don't need any to pam_close_session
                 * and this will make PR_SET_PDEATHSIG work in most cases.
                 * If this fails, ignore the error - but expect sd-pam threads
                 * to fail to exit normally */

                r = maybe_setgroups(0, NULL);
                if (r < 0)
                        log_warning_errno(r, "Failed to setgroups() in sd-pam: %m");
                if (setresgid(gid, gid, gid) < 0)
                        log_warning_errno(errno, "Failed to setresgid() in sd-pam: %m");
                if (setresuid(uid, uid, uid) < 0)
                        log_warning_errno(errno, "Failed to setresuid() in sd-pam: %m");

                (void) ignore_signals(SIGPIPE, -1);

                /* Wait until our parent died. This will only work if
                 * the above setresuid() succeeds, otherwise the kernel
                 * will not allow unprivileged parents kill their privileged
                 * children this way. We rely on the control groups kill logic
                 * to do the rest for us. */
                if (prctl(PR_SET_PDEATHSIG, SIGTERM) < 0)
                        goto child_finish;

                /* Tell the parent that our setup is done. This is especially
                 * important regarding dropping privileges. Otherwise, unit
                 * setup might race against our setresuid(2) call.
                 *
                 * If the parent aborted, we'll detect this below, hence ignore
                 * return failure here. */
                (void) barrier_place(&barrier);

                /* Check if our parent process might already have died? */
                if (getppid() == parent_pid) {
                        sigset_t ss;

                        assert_se(sigemptyset(&ss) >= 0);
                        assert_se(sigaddset(&ss, SIGTERM) >= 0);

                        /* Sleep until SIGTERM arrives, retrying when interrupted */
                        for (;;) {
                                if (sigwait(&ss, &sig) < 0) {
                                        if (errno == EINTR)
                                                continue;

                                        goto child_finish;
                                }

                                assert(sig == SIGTERM);
                                break;
                        }
                }

                /* If our parent died we'll end the session */
                if (getppid() != parent_pid) {
                        pam_code = pam_close_session(handle, flags);
                        if (pam_code != PAM_SUCCESS)
                                goto child_finish;
                }

                ret = 0;

        child_finish:
                pam_end(handle, pam_code | flags);
                _exit(ret);
        }

        barrier_set_role(&barrier, BARRIER_PARENT);

        /* If the child was forked off successfully it will do all the
         * cleanups, so forget about the handle here. */
        handle = NULL;

        /* Unblock SIGTERM again in the parent */
        assert_se(sigprocmask(SIG_SETMASK, &old_ss, NULL) >= 0);

        /* We close the log explicitly here, since the PAM modules
         * might have opened it, but we don't want this fd around. */
        closelog();

        /* Synchronously wait for the child to initialize. We don't care for
         * errors as we cannot recover. However, warn loudly if it happens. */
        if (!barrier_place_and_sync(&barrier))
                log_error("PAM initialization failed");

        /* Hand the environment as amended by PAM back to the caller */
        strv_free(*env);
        *env = e;

        return 0;

fail:
        if (pam_code != PAM_SUCCESS) {
                log_error("PAM failed: %s", pam_strerror(handle, pam_code));
                r = -EPERM; /* PAM errors do not map to errno */
        } else
                log_error_errno(r, "PAM failed: %m");

        if (handle) {
                if (close_session)
                        pam_code = pam_close_session(handle, flags);

                pam_end(handle, pam_code | flags);
        }

        strv_free(e);
        closelog();

        return r;
#else
        return 0;
#endif
}
5b6319dc 1317
5d6b1584
LP
/* Renames the calling process to "(<name>)", where <name> is derived from the last component of path.
 *
 * Only the final 8 characters of the component are used, so that the result including parentheses fits in 10
 * characters (the length of "/sbin/init"). If the component is empty the process is renamed to "(...)". */
static void rename_process_from_path(const char *path) {
        char buf[11];
        const char *leaf;
        size_t n;

        leaf = basename(path);
        if (isempty(leaf)) {
                rename_process("(...)");
                return;
        }

        /* Prefer the tail of a long name, since the front may be nothing more than a common prefix such as
         * "systemd-" */
        n = strlen(leaf);
        if (n > 8) {
                leaf += n - 8;
                n = 8;
        }

        buf[0] = '(';
        memcpy(buf + 1, leaf, n);
        buf[n + 1] = ')';
        buf[n + 2] = 0;

        rename_process(buf);
}
1348
469830d1
LP
1349static bool context_has_address_families(const ExecContext *c) {
1350 assert(c);
1351
1352 return c->address_families_whitelist ||
1353 !set_isempty(c->address_families);
1354}
1355
1356static bool context_has_syscall_filters(const ExecContext *c) {
1357 assert(c);
1358
1359 return c->syscall_whitelist ||
8cfa775f 1360 !hashmap_isempty(c->syscall_filter);
469830d1
LP
1361}
1362
/* Decides whether the no-new-privileges flag is needed for this context.
 *
 * True if it was requested explicitly. Otherwise false while we hold CAP_SYS_ADMIN, and true for an unprivileged
 * caller as soon as any of the seccomp-backed settings checked below is enabled. */
static bool context_has_no_new_privileges(const ExecContext *c) {
        assert(c);

        if (c->no_new_privileges)
                return true;

        /* A privileged caller does not need NNP */
        if (have_effective_cap(CAP_SYS_ADMIN))
                return false;

        /* Unprivileged: any form of seccomp requires NNP */
        if (context_has_address_families(c) ||
            c->memory_deny_write_execute ||
            c->restrict_realtime)
                return true;

        if (exec_context_restrict_namespaces_set(c) ||
            c->protect_kernel_tunables ||
            c->protect_kernel_modules ||
            c->private_devices)
                return true;

        if (context_has_syscall_filters(c) ||
            !set_isempty(c->syscall_archs))
                return true;

        return c->lock_personality;
}
1384
349cc4a5 1385#if HAVE_SECCOMP
17df7223 1386
83f12b27 1387static bool skip_seccomp_unavailable(const Unit* u, const char* msg) {
f673b62d
LP
1388
1389 if (is_seccomp_available())
1390 return false;
1391
f673b62d 1392 log_unit_debug(u, "SECCOMP features not detected in the kernel, skipping %s", msg);
f673b62d 1393 return true;
83f12b27
FS
1394}
1395
165a31c0 1396static int apply_syscall_filter(const Unit* u, const ExecContext *c, bool needs_ambient_hack) {
469830d1 1397 uint32_t negative_action, default_action, action;
165a31c0 1398 int r;
8351ceae 1399
469830d1 1400 assert(u);
c0467cf3 1401 assert(c);
8351ceae 1402
469830d1 1403 if (!context_has_syscall_filters(c))
83f12b27
FS
1404 return 0;
1405
469830d1
LP
1406 if (skip_seccomp_unavailable(u, "SystemCallFilter="))
1407 return 0;
e9642be2 1408
469830d1 1409 negative_action = c->syscall_errno == 0 ? SCMP_ACT_KILL : SCMP_ACT_ERRNO(c->syscall_errno);
e9642be2 1410
469830d1
LP
1411 if (c->syscall_whitelist) {
1412 default_action = negative_action;
1413 action = SCMP_ACT_ALLOW;
7c66bae2 1414 } else {
469830d1
LP
1415 default_action = SCMP_ACT_ALLOW;
1416 action = negative_action;
57183d11 1417 }
8351ceae 1418
165a31c0
LP
1419 if (needs_ambient_hack) {
1420 r = seccomp_filter_set_add(c->syscall_filter, c->syscall_whitelist, syscall_filter_sets + SYSCALL_FILTER_SET_SETUID);
1421 if (r < 0)
1422 return r;
1423 }
1424
469830d1 1425 return seccomp_load_syscall_filter_set_raw(default_action, c->syscall_filter, action);
4298d0b5
LP
1426}
1427
469830d1
LP
1428static int apply_syscall_archs(const Unit *u, const ExecContext *c) {
1429 assert(u);
4298d0b5
LP
1430 assert(c);
1431
469830d1 1432 if (set_isempty(c->syscall_archs))
83f12b27
FS
1433 return 0;
1434
469830d1
LP
1435 if (skip_seccomp_unavailable(u, "SystemCallArchitectures="))
1436 return 0;
4298d0b5 1437
469830d1
LP
1438 return seccomp_restrict_archs(c->syscall_archs);
1439}
4298d0b5 1440
469830d1
LP
1441static int apply_address_families(const Unit* u, const ExecContext *c) {
1442 assert(u);
1443 assert(c);
4298d0b5 1444
469830d1
LP
1445 if (!context_has_address_families(c))
1446 return 0;
4298d0b5 1447
469830d1
LP
1448 if (skip_seccomp_unavailable(u, "RestrictAddressFamilies="))
1449 return 0;
4298d0b5 1450
469830d1 1451 return seccomp_restrict_address_families(c->address_families, c->address_families_whitelist);
8351ceae 1452}
4298d0b5 1453
83f12b27 1454static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
469830d1 1455 assert(u);
f3e43635
TM
1456 assert(c);
1457
469830d1 1458 if (!c->memory_deny_write_execute)
83f12b27
FS
1459 return 0;
1460
469830d1
LP
1461 if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
1462 return 0;
f3e43635 1463
469830d1 1464 return seccomp_memory_deny_write_execute();
f3e43635
TM
1465}
1466
83f12b27 1467static int apply_restrict_realtime(const Unit* u, const ExecContext *c) {
469830d1 1468 assert(u);
f4170c67
LP
1469 assert(c);
1470
469830d1 1471 if (!c->restrict_realtime)
83f12b27
FS
1472 return 0;
1473
469830d1
LP
1474 if (skip_seccomp_unavailable(u, "RestrictRealtime="))
1475 return 0;
f4170c67 1476
469830d1 1477 return seccomp_restrict_realtime();
f4170c67
LP
1478}
1479
59e856c7 1480static int apply_protect_sysctl(const Unit *u, const ExecContext *c) {
469830d1 1481 assert(u);
59eeb84b
LP
1482 assert(c);
1483
1484 /* Turn off the legacy sysctl() system call. Many distributions turn this off while building the kernel, but
1485 * let's protect even those systems where this is left on in the kernel. */
1486
469830d1 1487 if (!c->protect_kernel_tunables)
59eeb84b
LP
1488 return 0;
1489
469830d1
LP
1490 if (skip_seccomp_unavailable(u, "ProtectKernelTunables="))
1491 return 0;
59eeb84b 1492
469830d1 1493 return seccomp_protect_sysctl();
59eeb84b
LP
1494}
1495
59e856c7 1496static int apply_protect_kernel_modules(const Unit *u, const ExecContext *c) {
469830d1 1497 assert(u);
502d704e
DH
1498 assert(c);
1499
25a8d8a0 1500 /* Turn off module syscalls on ProtectKernelModules=yes */
502d704e 1501
469830d1
LP
1502 if (!c->protect_kernel_modules)
1503 return 0;
1504
502d704e
DH
1505 if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
1506 return 0;
1507
469830d1 1508 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_MODULE, SCMP_ACT_ERRNO(EPERM));
502d704e
DH
1509}
1510
59e856c7 1511static int apply_private_devices(const Unit *u, const ExecContext *c) {
469830d1 1512 assert(u);
ba128bb8
LP
1513 assert(c);
1514
8f81a5f6 1515 /* If PrivateDevices= is set, also turn off iopl and all @raw-io syscalls. */
ba128bb8 1516
469830d1
LP
1517 if (!c->private_devices)
1518 return 0;
1519
ba128bb8
LP
1520 if (skip_seccomp_unavailable(u, "PrivateDevices="))
1521 return 0;
1522
469830d1 1523 return seccomp_load_syscall_filter_set(SCMP_ACT_ALLOW, syscall_filter_sets + SYSCALL_FILTER_SET_RAW_IO, SCMP_ACT_ERRNO(EPERM));
ba128bb8
LP
1524}
1525
34cf6c43 1526static int apply_restrict_namespaces(const Unit *u, const ExecContext *c) {
469830d1 1527 assert(u);
add00535
LP
1528 assert(c);
1529
1530 if (!exec_context_restrict_namespaces_set(c))
1531 return 0;
1532
1533 if (skip_seccomp_unavailable(u, "RestrictNamespaces="))
1534 return 0;
1535
1536 return seccomp_restrict_namespaces(c->restrict_namespaces);
1537}
1538
78e864e5 1539static int apply_lock_personality(const Unit* u, const ExecContext *c) {
e8132d63
LP
1540 unsigned long personality;
1541 int r;
78e864e5
TM
1542
1543 assert(u);
1544 assert(c);
1545
1546 if (!c->lock_personality)
1547 return 0;
1548
1549 if (skip_seccomp_unavailable(u, "LockPersonality="))
1550 return 0;
1551
e8132d63
LP
1552 personality = c->personality;
1553
1554 /* If personality is not specified, use either PER_LINUX or PER_LINUX32 depending on what is currently set. */
1555 if (personality == PERSONALITY_INVALID) {
1556
1557 r = opinionated_personality(&personality);
1558 if (r < 0)
1559 return r;
1560 }
78e864e5
TM
1561
1562 return seccomp_lock_personality(personality);
1563}
1564
c0467cf3 1565#endif
8351ceae 1566
31a7eb86
ZJS
1567static void do_idle_pipe_dance(int idle_pipe[4]) {
1568 assert(idle_pipe);
1569
54eb2300
LP
1570 idle_pipe[1] = safe_close(idle_pipe[1]);
1571 idle_pipe[2] = safe_close(idle_pipe[2]);
31a7eb86
ZJS
1572
1573 if (idle_pipe[0] >= 0) {
1574 int r;
1575
1576 r = fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT_USEC);
1577
1578 if (idle_pipe[3] >= 0 && r == 0 /* timeout */) {
c7cc737f
LP
1579 ssize_t n;
1580
31a7eb86 1581 /* Signal systemd that we are bored and want to continue. */
c7cc737f
LP
1582 n = write(idle_pipe[3], "x", 1);
1583 if (n > 0)
cd972d69
ZJS
1584 /* Wait for systemd to react to the signal above. */
1585 fd_wait_for_event(idle_pipe[0], POLLHUP, IDLE_TIMEOUT2_USEC);
31a7eb86
ZJS
1586 }
1587
54eb2300 1588 idle_pipe[0] = safe_close(idle_pipe[0]);
31a7eb86
ZJS
1589
1590 }
1591
54eb2300 1592 idle_pipe[3] = safe_close(idle_pipe[3]);
31a7eb86
ZJS
1593}
1594
7cae38c4 1595static int build_environment(
34cf6c43 1596 const Unit *u,
9fa95f85 1597 const ExecContext *c,
1e22b5cd 1598 const ExecParameters *p,
7cae38c4
LP
1599 unsigned n_fds,
1600 const char *home,
1601 const char *username,
1602 const char *shell,
7bce046b
LP
1603 dev_t journal_stream_dev,
1604 ino_t journal_stream_ino,
7cae38c4
LP
1605 char ***ret) {
1606
1607 _cleanup_strv_free_ char **our_env = NULL;
1608 unsigned n_env = 0;
1609 char *x;
1610
4b58153d 1611 assert(u);
7cae38c4
LP
1612 assert(c);
1613 assert(ret);
1614
4b58153d 1615 our_env = new0(char*, 14);
7cae38c4
LP
1616 if (!our_env)
1617 return -ENOMEM;
1618
1619 if (n_fds > 0) {
8dd4c05b
LP
1620 _cleanup_free_ char *joined = NULL;
1621
df0ff127 1622 if (asprintf(&x, "LISTEN_PID="PID_FMT, getpid_cached()) < 0)
7cae38c4
LP
1623 return -ENOMEM;
1624 our_env[n_env++] = x;
1625
1626 if (asprintf(&x, "LISTEN_FDS=%u", n_fds) < 0)
1627 return -ENOMEM;
1628 our_env[n_env++] = x;
8dd4c05b 1629
1e22b5cd 1630 joined = strv_join(p->fd_names, ":");
8dd4c05b
LP
1631 if (!joined)
1632 return -ENOMEM;
1633
605405c6 1634 x = strjoin("LISTEN_FDNAMES=", joined);
8dd4c05b
LP
1635 if (!x)
1636 return -ENOMEM;
1637 our_env[n_env++] = x;
7cae38c4
LP
1638 }
1639
b08af3b1 1640 if ((p->flags & EXEC_SET_WATCHDOG) && p->watchdog_usec > 0) {
df0ff127 1641 if (asprintf(&x, "WATCHDOG_PID="PID_FMT, getpid_cached()) < 0)
09812eb7
LP
1642 return -ENOMEM;
1643 our_env[n_env++] = x;
1644
1e22b5cd 1645 if (asprintf(&x, "WATCHDOG_USEC="USEC_FMT, p->watchdog_usec) < 0)
09812eb7
LP
1646 return -ENOMEM;
1647 our_env[n_env++] = x;
1648 }
1649
fd63e712
LP
1650 /* If this is D-Bus, tell the nss-systemd module, since it relies on being able to use D-Bus look up dynamic
1651 * users via PID 1, possibly dead-locking the dbus daemon. This way it will not use D-Bus to resolve names, but
1652 * check the database directly. */
ac647978 1653 if (p->flags & EXEC_NSS_BYPASS_BUS) {
fd63e712
LP
1654 x = strdup("SYSTEMD_NSS_BYPASS_BUS=1");
1655 if (!x)
1656 return -ENOMEM;
1657 our_env[n_env++] = x;
1658 }
1659
7cae38c4
LP
1660 if (home) {
1661 x = strappend("HOME=", home);
1662 if (!x)
1663 return -ENOMEM;
1664 our_env[n_env++] = x;
1665 }
1666
1667 if (username) {
1668 x = strappend("LOGNAME=", username);
1669 if (!x)
1670 return -ENOMEM;
1671 our_env[n_env++] = x;
1672
1673 x = strappend("USER=", username);
1674 if (!x)
1675 return -ENOMEM;
1676 our_env[n_env++] = x;
1677 }
1678
1679 if (shell) {
1680 x = strappend("SHELL=", shell);
1681 if (!x)
1682 return -ENOMEM;
1683 our_env[n_env++] = x;
1684 }
1685
4b58153d
LP
1686 if (!sd_id128_is_null(u->invocation_id)) {
1687 if (asprintf(&x, "INVOCATION_ID=" SD_ID128_FORMAT_STR, SD_ID128_FORMAT_VAL(u->invocation_id)) < 0)
1688 return -ENOMEM;
1689
1690 our_env[n_env++] = x;
1691 }
1692
6af760f3
LP
1693 if (exec_context_needs_term(c)) {
1694 const char *tty_path, *term = NULL;
1695
1696 tty_path = exec_context_tty_path(c);
1697
1698 /* If we are forked off PID 1 and we are supposed to operate on /dev/console, then let's try to inherit
1699 * the $TERM set for PID 1. This is useful for containers so that the $TERM the container manager
1700 * passes to PID 1 ends up all the way in the console login shown. */
1701
1702 if (path_equal(tty_path, "/dev/console") && getppid() == 1)
1703 term = getenv("TERM");
1704 if (!term)
1705 term = default_term_for_tty(tty_path);
7cae38c4 1706
6af760f3 1707 x = strappend("TERM=", term);
7cae38c4
LP
1708 if (!x)
1709 return -ENOMEM;
1710 our_env[n_env++] = x;
1711 }
1712
7bce046b
LP
1713 if (journal_stream_dev != 0 && journal_stream_ino != 0) {
1714 if (asprintf(&x, "JOURNAL_STREAM=" DEV_FMT ":" INO_FMT, journal_stream_dev, journal_stream_ino) < 0)
1715 return -ENOMEM;
1716
1717 our_env[n_env++] = x;
1718 }
1719
7cae38c4 1720 our_env[n_env++] = NULL;
7bce046b 1721 assert(n_env <= 12);
7cae38c4 1722
ae2a15bc 1723 *ret = TAKE_PTR(our_env);
7cae38c4
LP
1724
1725 return 0;
1726}
1727
b4c14404
FB
1728static int build_pass_environment(const ExecContext *c, char ***ret) {
1729 _cleanup_strv_free_ char **pass_env = NULL;
1730 size_t n_env = 0, n_bufsize = 0;
1731 char **i;
1732
1733 STRV_FOREACH(i, c->pass_environment) {
1734 _cleanup_free_ char *x = NULL;
1735 char *v;
1736
1737 v = getenv(*i);
1738 if (!v)
1739 continue;
605405c6 1740 x = strjoin(*i, "=", v);
b4c14404
FB
1741 if (!x)
1742 return -ENOMEM;
00819cc1 1743
b4c14404
FB
1744 if (!GREEDY_REALLOC(pass_env, n_bufsize, n_env + 2))
1745 return -ENOMEM;
00819cc1 1746
1cc6c93a 1747 pass_env[n_env++] = TAKE_PTR(x);
b4c14404 1748 pass_env[n_env] = NULL;
b4c14404
FB
1749 }
1750
ae2a15bc 1751 *ret = TAKE_PTR(pass_env);
b4c14404
FB
1752
1753 return 0;
1754}
1755
8b44a3d2
LP
1756static bool exec_needs_mount_namespace(
1757 const ExecContext *context,
1758 const ExecParameters *params,
4657abb5 1759 const ExecRuntime *runtime) {
8b44a3d2
LP
1760
1761 assert(context);
1762 assert(params);
1763
915e6d16
LP
1764 if (context->root_image)
1765 return true;
1766
2a624c36
AP
1767 if (!strv_isempty(context->read_write_paths) ||
1768 !strv_isempty(context->read_only_paths) ||
1769 !strv_isempty(context->inaccessible_paths))
8b44a3d2
LP
1770 return true;
1771
42b1d8e0 1772 if (context->n_bind_mounts > 0)
d2d6c096
LP
1773 return true;
1774
2abd4e38
YW
1775 if (context->n_temporary_filesystems > 0)
1776 return true;
1777
8b44a3d2
LP
1778 if (context->mount_flags != 0)
1779 return true;
1780
1781 if (context->private_tmp && runtime && (runtime->tmp_dir || runtime->var_tmp_dir))
1782 return true;
1783
8b44a3d2
LP
1784 if (context->private_devices ||
1785 context->protect_system != PROTECT_SYSTEM_NO ||
59eeb84b
LP
1786 context->protect_home != PROTECT_HOME_NO ||
1787 context->protect_kernel_tunables ||
c575770b 1788 context->protect_kernel_modules ||
59eeb84b 1789 context->protect_control_groups)
8b44a3d2
LP
1790 return true;
1791
9c988f93 1792 if (context->mount_apivfs && (context->root_image || context->root_directory))
5d997827
LP
1793 return true;
1794
42b1d8e0 1795 if (context->dynamic_user &&
b43ee82f 1796 (!strv_isempty(context->directories[EXEC_DIRECTORY_STATE].paths) ||
42b1d8e0
YW
1797 !strv_isempty(context->directories[EXEC_DIRECTORY_CACHE].paths) ||
1798 !strv_isempty(context->directories[EXEC_DIRECTORY_LOGS].paths)))
1799 return true;
1800
8b44a3d2
LP
1801 return false;
1802}
1803
d251207d
LP
1804static int setup_private_users(uid_t uid, gid_t gid) {
1805 _cleanup_free_ char *uid_map = NULL, *gid_map = NULL;
1806 _cleanup_close_pair_ int errno_pipe[2] = { -1, -1 };
1807 _cleanup_close_ int unshare_ready_fd = -1;
1808 _cleanup_(sigkill_waitp) pid_t pid = 0;
1809 uint64_t c = 1;
d251207d
LP
1810 ssize_t n;
1811 int r;
1812
1813 /* Set up a user namespace and map root to root, the selected UID/GID to itself, and everything else to
1814 * nobody. In order to be able to write this mapping we need CAP_SETUID in the original user namespace, which
1815 * we however lack after opening the user namespace. To work around this we fork() a temporary child process,
1816 * which waits for the parent to create the new user namespace while staying in the original namespace. The
1817 * child then writes the UID mapping, under full privileges. The parent waits for the child to finish and
1818 * continues execution normally. */
1819
587ab01b
ZJS
1820 if (uid != 0 && uid_is_valid(uid)) {
1821 r = asprintf(&uid_map,
1822 "0 0 1\n" /* Map root → root */
1823 UID_FMT " " UID_FMT " 1\n", /* Map $UID → $UID */
1824 uid, uid);
1825 if (r < 0)
1826 return -ENOMEM;
1827 } else {
e0f3720e 1828 uid_map = strdup("0 0 1\n"); /* The case where the above is the same */
587ab01b
ZJS
1829 if (!uid_map)
1830 return -ENOMEM;
1831 }
d251207d 1832
587ab01b
ZJS
1833 if (gid != 0 && gid_is_valid(gid)) {
1834 r = asprintf(&gid_map,
1835 "0 0 1\n" /* Map root → root */
1836 GID_FMT " " GID_FMT " 1\n", /* Map $GID → $GID */
1837 gid, gid);
1838 if (r < 0)
1839 return -ENOMEM;
1840 } else {
d251207d 1841 gid_map = strdup("0 0 1\n"); /* The case where the above is the same */
587ab01b
ZJS
1842 if (!gid_map)
1843 return -ENOMEM;
1844 }
d251207d
LP
1845
1846 /* Create a communication channel so that the parent can tell the child when it finished creating the user
1847 * namespace. */
1848 unshare_ready_fd = eventfd(0, EFD_CLOEXEC);
1849 if (unshare_ready_fd < 0)
1850 return -errno;
1851
1852 /* Create a communication channel so that the child can tell the parent a proper error code in case it
1853 * failed. */
1854 if (pipe2(errno_pipe, O_CLOEXEC) < 0)
1855 return -errno;
1856
4c253ed1
LP
1857 r = safe_fork("(sd-userns)", FORK_RESET_SIGNALS|FORK_DEATHSIG, &pid);
1858 if (r < 0)
1859 return r;
1860 if (r == 0) {
d251207d
LP
1861 _cleanup_close_ int fd = -1;
1862 const char *a;
1863 pid_t ppid;
1864
1865 /* Child process, running in the original user namespace. Let's update the parent's UID/GID map from
1866 * here, after the parent opened its own user namespace. */
1867
1868 ppid = getppid();
1869 errno_pipe[0] = safe_close(errno_pipe[0]);
1870
1871 /* Wait until the parent unshared the user namespace */
1872 if (read(unshare_ready_fd, &c, sizeof(c)) < 0) {
1873 r = -errno;
1874 goto child_fail;
1875 }
1876
1877 /* Disable the setgroups() system call in the child user namespace, for good. */
1878 a = procfs_file_alloca(ppid, "setgroups");
1879 fd = open(a, O_WRONLY|O_CLOEXEC);
1880 if (fd < 0) {
1881 if (errno != ENOENT) {
1882 r = -errno;
1883 goto child_fail;
1884 }
1885
1886 /* If the file is missing the kernel is too old, let's continue anyway. */
1887 } else {
1888 if (write(fd, "deny\n", 5) < 0) {
1889 r = -errno;
1890 goto child_fail;
1891 }
1892
1893 fd = safe_close(fd);
1894 }
1895
1896 /* First write the GID map */
1897 a = procfs_file_alloca(ppid, "gid_map");
1898 fd = open(a, O_WRONLY|O_CLOEXEC);
1899 if (fd < 0) {
1900 r = -errno;
1901 goto child_fail;
1902 }
1903 if (write(fd, gid_map, strlen(gid_map)) < 0) {
1904 r = -errno;
1905 goto child_fail;
1906 }
1907 fd = safe_close(fd);
1908
1909 /* The write the UID map */
1910 a = procfs_file_alloca(ppid, "uid_map");
1911 fd = open(a, O_WRONLY|O_CLOEXEC);
1912 if (fd < 0) {
1913 r = -errno;
1914 goto child_fail;
1915 }
1916 if (write(fd, uid_map, strlen(uid_map)) < 0) {
1917 r = -errno;
1918 goto child_fail;
1919 }
1920
1921 _exit(EXIT_SUCCESS);
1922
1923 child_fail:
1924 (void) write(errno_pipe[1], &r, sizeof(r));
1925 _exit(EXIT_FAILURE);
1926 }
1927
1928 errno_pipe[1] = safe_close(errno_pipe[1]);
1929
1930 if (unshare(CLONE_NEWUSER) < 0)
1931 return -errno;
1932
1933 /* Let the child know that the namespace is ready now */
1934 if (write(unshare_ready_fd, &c, sizeof(c)) < 0)
1935 return -errno;
1936
1937 /* Try to read an error code from the child */
1938 n = read(errno_pipe[0], &r, sizeof(r));
1939 if (n < 0)
1940 return -errno;
1941 if (n == sizeof(r)) { /* an error code was sent to us */
1942 if (r < 0)
1943 return r;
1944 return -EIO;
1945 }
1946 if (n != 0) /* on success we should have read 0 bytes */
1947 return -EIO;
1948
2e87a1fd
LP
1949 r = wait_for_terminate_and_check("(sd-userns)", pid, 0);
1950 pid = 0;
d251207d
LP
1951 if (r < 0)
1952 return r;
2e87a1fd 1953 if (r != EXIT_SUCCESS) /* If something strange happened with the child, let's consider this fatal, too */
d251207d
LP
1954 return -EIO;
1955
1956 return 0;
1957}
1958
3536f49e 1959static int setup_exec_directory(
07689d5d
LP
1960 const ExecContext *context,
1961 const ExecParameters *params,
1962 uid_t uid,
3536f49e 1963 gid_t gid,
3536f49e
YW
1964 ExecDirectoryType type,
1965 int *exit_status) {
07689d5d 1966
72fd1768 1967 static const int exit_status_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
1968 [EXEC_DIRECTORY_RUNTIME] = EXIT_RUNTIME_DIRECTORY,
1969 [EXEC_DIRECTORY_STATE] = EXIT_STATE_DIRECTORY,
1970 [EXEC_DIRECTORY_CACHE] = EXIT_CACHE_DIRECTORY,
1971 [EXEC_DIRECTORY_LOGS] = EXIT_LOGS_DIRECTORY,
1972 [EXEC_DIRECTORY_CONFIGURATION] = EXIT_CONFIGURATION_DIRECTORY,
1973 };
07689d5d
LP
1974 char **rt;
1975 int r;
1976
1977 assert(context);
1978 assert(params);
72fd1768 1979 assert(type >= 0 && type < _EXEC_DIRECTORY_TYPE_MAX);
3536f49e 1980 assert(exit_status);
07689d5d 1981
3536f49e
YW
1982 if (!params->prefix[type])
1983 return 0;
1984
8679efde 1985 if (params->flags & EXEC_CHOWN_DIRECTORIES) {
3536f49e
YW
1986 if (!uid_is_valid(uid))
1987 uid = 0;
1988 if (!gid_is_valid(gid))
1989 gid = 0;
1990 }
1991
1992 STRV_FOREACH(rt, context->directories[type].paths) {
6c47cd7d 1993 _cleanup_free_ char *p = NULL, *pp = NULL;
07689d5d 1994
3536f49e
YW
1995 p = strjoin(params->prefix[type], "/", *rt);
1996 if (!p) {
1997 r = -ENOMEM;
1998 goto fail;
1999 }
07689d5d 2000
23a7448e
YW
2001 r = mkdir_parents_label(p, 0755);
2002 if (r < 0)
3536f49e 2003 goto fail;
23a7448e 2004
8092a48c
YW
2005 if (context->dynamic_user &&
2006 !IN_SET(type, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
6c47cd7d
LP
2007 _cleanup_free_ char *private_root = NULL, *relative = NULL, *parent = NULL;
2008
2009 /* So, here's one extra complication when dealing with DynamicUser=1 units. In that case we
2010 * want to avoid leaving a directory around fully accessible that is owned by a dynamic user
2011 * whose UID is later on reused. To lock this down we use the same trick used by container
2012 * managers to prohibit host users to get access to files of the same UID in containers: we
2013 * place everything inside a directory that has an access mode of 0700 and is owned root:root,
2014 * so that it acts as security boundary for unprivileged host code. We then use fs namespacing
2015 * to make this directory permeable for the service itself.
2016 *
2017 * Specifically: for a service which wants a special directory "foo/" we first create a
2018 * directory "private/" with access mode 0700 owned by root:root. Then we place "foo" inside of
2019 * that directory (i.e. "private/foo/"), and make "foo" a symlink to "private/foo". This way,
2020 * privileged host users can access "foo/" as usual, but unprivileged host users can't look
2021 * into it. Inside of the namespaceof the container "private/" is replaced by a more liberally
2022 * accessible tmpfs, into which the host's "private/foo/" is mounted under the same name, thus
2023 * disabling the access boundary for the service and making sure it only gets access to the
2024 * dirs it needs but no others. Tricky? Yes, absolutely, but it works!
2025 *
2026 * Note that we don't do this for EXEC_DIRECTORY_CONFIGURATION as that's assumed not to be
8092a48c
YW
2027 * owned by the service itself.
2028 * Also, note that we don't do this for EXEC_DIRECTORY_RUNTIME as that's often used for sharing
2029 * files or sockets with other services. */
6c47cd7d
LP
2030
2031 private_root = strjoin(params->prefix[type], "/private");
2032 if (!private_root) {
2033 r = -ENOMEM;
2034 goto fail;
2035 }
2036
2037 /* First set up private root if it doesn't exist yet, with access mode 0700 and owned by root:root */
37c1d5e9 2038 r = mkdir_safe_label(private_root, 0700, 0, 0, MKDIR_WARN_MODE);
6c47cd7d
LP
2039 if (r < 0)
2040 goto fail;
2041
2042 pp = strjoin(private_root, "/", *rt);
2043 if (!pp) {
2044 r = -ENOMEM;
2045 goto fail;
2046 }
2047
2048 /* Create all directories between the configured directory and this private root, and mark them 0755 */
2049 r = mkdir_parents_label(pp, 0755);
2050 if (r < 0)
2051 goto fail;
2052
949befd3
LP
2053 if (is_dir(p, false) > 0 &&
2054 (laccess(pp, F_OK) < 0 && errno == ENOENT)) {
2055
2056 /* Hmm, the private directory doesn't exist yet, but the normal one exists? If so, move
2057 * it over. Most likely the service has been upgraded from one that didn't use
2058 * DynamicUser=1, to one that does. */
2059
2060 if (rename(p, pp) < 0) {
2061 r = -errno;
2062 goto fail;
2063 }
2064 } else {
2065 /* Otherwise, create the actual directory for the service */
2066
2067 r = mkdir_label(pp, context->directories[type].mode);
2068 if (r < 0 && r != -EEXIST)
2069 goto fail;
2070 }
6c47cd7d
LP
2071
2072 parent = dirname_malloc(p);
2073 if (!parent) {
2074 r = -ENOMEM;
2075 goto fail;
2076 }
2077
2078 r = path_make_relative(parent, pp, &relative);
2079 if (r < 0)
2080 goto fail;
2081
2082 /* And link it up from the original place */
2083 r = symlink_idempotent(relative, p);
2084 if (r < 0)
2085 goto fail;
2086
30c81ce2
ZJS
2087 /* Lock down the access mode */
2088 if (chmod(pp, context->directories[type].mode) < 0) {
2089 r = -errno;
2090 goto fail;
2091 }
6c47cd7d
LP
2092 } else {
2093 r = mkdir_label(p, context->directories[type].mode);
30c81ce2
ZJS
2094 if (r == -EEXIST)
2095 continue;
2096 if (r < 0)
6c47cd7d 2097 goto fail;
a1164ae3 2098 }
07689d5d 2099
c71b2eb7
LP
2100 /* Don't change the owner of the configuration directory, as in the common case it is not written to by
2101 * a service, and shall not be writable. */
2102 if (type == EXEC_DIRECTORY_CONFIGURATION)
2103 continue;
2104
a1164ae3 2105 /* Then, change the ownership of the whole tree, if necessary */
30c81ce2 2106 r = path_chown_recursive(pp ?: p, uid, gid);
07689d5d 2107 if (r < 0)
3536f49e 2108 goto fail;
07689d5d
LP
2109 }
2110
2111 return 0;
3536f49e
YW
2112
2113fail:
2114 *exit_status = exit_status_table[type];
3536f49e 2115 return r;
07689d5d
LP
2116}
2117
#if ENABLE_SMACK
/* Applies a SMACK process label for the command that is about to be executed.
 *
 * If the context carries an explicit smack_process_label, that label is applied via
 * mac_smack_apply_pid() with PID 0. Otherwise, and only when the build defines
 * SMACK_DEFAULT_PROCESS_LABEL, the SMACK_ATTR_EXEC label of the command's binary is read and applied,
 * falling back to SMACK_DEFAULT_PROCESS_LABEL when the binary carries none.
 *
 * Returns 0 on success, or the negative error reported by the SMACK helpers. */
static int setup_smack(
                const ExecContext *context,
                const ExecCommand *command) {

        assert(context);
        assert(command);

        if (context->smack_process_label) {
                int k;

                k = mac_smack_apply_pid(0, context->smack_process_label);
                return k < 0 ? k : 0;
        }

#ifdef SMACK_DEFAULT_PROCESS_LABEL
        {
                _cleanup_free_ char *binary_label = NULL;
                int k;

                /* A missing attribute or a file system without xattr support is not an error here, we
                 * simply use the compiled-in default label in that case. */
                k = mac_smack_read(command->path, SMACK_ATTR_EXEC, &binary_label);
                if (k < 0 && k != -ENODATA && k != -EOPNOTSUPP)
                        return k;

                k = mac_smack_apply_pid(0, binary_label ?: SMACK_DEFAULT_PROCESS_LABEL);
                if (k < 0)
                        return k;
        }
#endif

        return 0;
}
#endif
cefc33ae 2150
6c47cd7d
LP
2151static int compile_bind_mounts(
2152 const ExecContext *context,
2153 const ExecParameters *params,
2154 BindMount **ret_bind_mounts,
2155 unsigned *ret_n_bind_mounts,
2156 char ***ret_empty_directories) {
2157
2158 _cleanup_strv_free_ char **empty_directories = NULL;
2159 BindMount *bind_mounts;
2160 unsigned n, h = 0, i;
2161 ExecDirectoryType t;
2162 int r;
2163
2164 assert(context);
2165 assert(params);
2166 assert(ret_bind_mounts);
2167 assert(ret_n_bind_mounts);
2168 assert(ret_empty_directories);
2169
2170 n = context->n_bind_mounts;
2171 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2172 if (!params->prefix[t])
2173 continue;
2174
2175 n += strv_length(context->directories[t].paths);
2176 }
2177
2178 if (n <= 0) {
2179 *ret_bind_mounts = NULL;
2180 *ret_n_bind_mounts = 0;
2181 *ret_empty_directories = NULL;
2182 return 0;
2183 }
2184
2185 bind_mounts = new(BindMount, n);
2186 if (!bind_mounts)
2187 return -ENOMEM;
2188
a8cabc61 2189 for (i = 0; i < context->n_bind_mounts; i++) {
6c47cd7d
LP
2190 BindMount *item = context->bind_mounts + i;
2191 char *s, *d;
2192
2193 s = strdup(item->source);
2194 if (!s) {
2195 r = -ENOMEM;
2196 goto finish;
2197 }
2198
2199 d = strdup(item->destination);
2200 if (!d) {
2201 free(s);
2202 r = -ENOMEM;
2203 goto finish;
2204 }
2205
2206 bind_mounts[h++] = (BindMount) {
2207 .source = s,
2208 .destination = d,
2209 .read_only = item->read_only,
2210 .recursive = item->recursive,
2211 .ignore_enoent = item->ignore_enoent,
2212 };
2213 }
2214
2215 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2216 char **suffix;
2217
2218 if (!params->prefix[t])
2219 continue;
2220
2221 if (strv_isempty(context->directories[t].paths))
2222 continue;
2223
8092a48c
YW
2224 if (context->dynamic_user &&
2225 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION)) {
6c47cd7d
LP
2226 char *private_root;
2227
2228 /* So this is for a dynamic user, and we need to make sure the process can access its own
2229 * directory. For that we overmount the usually inaccessible "private" subdirectory with a
2230 * tmpfs that makes it accessible and is empty except for the submounts we do this for. */
2231
2232 private_root = strjoin(params->prefix[t], "/private");
2233 if (!private_root) {
2234 r = -ENOMEM;
2235 goto finish;
2236 }
2237
2238 r = strv_consume(&empty_directories, private_root);
a635a7ae 2239 if (r < 0)
6c47cd7d 2240 goto finish;
6c47cd7d
LP
2241 }
2242
2243 STRV_FOREACH(suffix, context->directories[t].paths) {
2244 char *s, *d;
2245
8092a48c
YW
2246 if (context->dynamic_user &&
2247 !IN_SET(t, EXEC_DIRECTORY_RUNTIME, EXEC_DIRECTORY_CONFIGURATION))
6c47cd7d
LP
2248 s = strjoin(params->prefix[t], "/private/", *suffix);
2249 else
2250 s = strjoin(params->prefix[t], "/", *suffix);
2251 if (!s) {
2252 r = -ENOMEM;
2253 goto finish;
2254 }
2255
2256 d = strdup(s);
2257 if (!d) {
2258 free(s);
2259 r = -ENOMEM;
2260 goto finish;
2261 }
2262
2263 bind_mounts[h++] = (BindMount) {
2264 .source = s,
2265 .destination = d,
2266 .read_only = false,
2267 .recursive = true,
2268 .ignore_enoent = false,
2269 };
2270 }
2271 }
2272
2273 assert(h == n);
2274
2275 *ret_bind_mounts = bind_mounts;
2276 *ret_n_bind_mounts = n;
ae2a15bc 2277 *ret_empty_directories = TAKE_PTR(empty_directories);
6c47cd7d
LP
2278
2279 return (int) n;
2280
2281finish:
2282 bind_mount_free_many(bind_mounts, h);
2283 return r;
2284}
2285
6818c54c 2286static int apply_mount_namespace(
34cf6c43
YW
2287 const Unit *u,
2288 const ExecCommand *command,
6818c54c
LP
2289 const ExecContext *context,
2290 const ExecParameters *params,
34cf6c43 2291 const ExecRuntime *runtime) {
6818c54c 2292
7bcef4ef 2293 _cleanup_strv_free_ char **empty_directories = NULL;
93c6bb51 2294 char *tmp = NULL, *var = NULL;
915e6d16 2295 const char *root_dir = NULL, *root_image = NULL;
bb0ff3fb 2296 NamespaceInfo ns_info = {
af964954 2297 .ignore_protect_paths = false,
93c6bb51
DH
2298 .private_dev = context->private_devices,
2299 .protect_control_groups = context->protect_control_groups,
2300 .protect_kernel_tunables = context->protect_kernel_tunables,
2301 .protect_kernel_modules = context->protect_kernel_modules,
5d997827 2302 .mount_apivfs = context->mount_apivfs,
93c6bb51 2303 };
165a31c0 2304 bool needs_sandboxing;
6c47cd7d
LP
2305 BindMount *bind_mounts = NULL;
2306 unsigned n_bind_mounts = 0;
6818c54c 2307 int r;
93c6bb51 2308
2b3c1b9e
DH
2309 assert(context);
2310
93c6bb51
DH
2311 /* The runtime struct only contains the parent of the private /tmp,
2312 * which is non-accessible to world users. Inside of it there's a /tmp
2313 * that is sticky, and that's the one we want to use here. */
2314
2315 if (context->private_tmp && runtime) {
2316 if (runtime->tmp_dir)
2317 tmp = strjoina(runtime->tmp_dir, "/tmp");
2318 if (runtime->var_tmp_dir)
2319 var = strjoina(runtime->var_tmp_dir, "/tmp");
2320 }
2321
915e6d16
LP
2322 if (params->flags & EXEC_APPLY_CHROOT) {
2323 root_image = context->root_image;
2324
2325 if (!root_image)
2326 root_dir = context->root_directory;
2327 }
93c6bb51 2328
6c47cd7d
LP
2329 r = compile_bind_mounts(context, params, &bind_mounts, &n_bind_mounts, &empty_directories);
2330 if (r < 0)
2331 return r;
2332
af964954
DH
2333 /*
2334 * If DynamicUser=no and RootDirectory= is set then lets pass a relaxed
2335 * sandbox info, otherwise enforce it, don't ignore protected paths and
2336 * fail if we are enable to apply the sandbox inside the mount namespace.
2337 */
2338 if (!context->dynamic_user && root_dir)
2339 ns_info.ignore_protect_paths = true;
2340
165a31c0 2341 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
6818c54c 2342
915e6d16 2343 r = setup_namespace(root_dir, root_image,
7bcef4ef 2344 &ns_info, context->read_write_paths,
165a31c0
LP
2345 needs_sandboxing ? context->read_only_paths : NULL,
2346 needs_sandboxing ? context->inaccessible_paths : NULL,
6c47cd7d
LP
2347 empty_directories,
2348 bind_mounts,
2349 n_bind_mounts,
2abd4e38
YW
2350 context->temporary_filesystems,
2351 context->n_temporary_filesystems,
93c6bb51
DH
2352 tmp,
2353 var,
165a31c0
LP
2354 needs_sandboxing ? context->protect_home : PROTECT_HOME_NO,
2355 needs_sandboxing ? context->protect_system : PROTECT_SYSTEM_NO,
915e6d16
LP
2356 context->mount_flags,
2357 DISSECT_IMAGE_DISCARD_ON_LOOP);
93c6bb51 2358
6c47cd7d
LP
2359 bind_mount_free_many(bind_mounts, n_bind_mounts);
2360
93c6bb51
DH
2361 /* If we couldn't set up the namespace this is probably due to a
2362 * missing capability. In this case, silently proceeed. */
2363 if (IN_SET(r, -EPERM, -EACCES)) {
93c6bb51 2364 log_unit_debug_errno(u, r, "Failed to set up namespace, assuming containerized execution, ignoring: %m");
86ffb325 2365 return 0;
93c6bb51
DH
2366 }
2367
2368 return r;
2369}
2370
915e6d16
LP
2371static int apply_working_directory(
2372 const ExecContext *context,
2373 const ExecParameters *params,
2374 const char *home,
376fecf6
LP
2375 const bool needs_mount_ns,
2376 int *exit_status) {
915e6d16 2377
6732edab 2378 const char *d, *wd;
2b3c1b9e
DH
2379
2380 assert(context);
376fecf6 2381 assert(exit_status);
2b3c1b9e 2382
6732edab
LP
2383 if (context->working_directory_home) {
2384
376fecf6
LP
2385 if (!home) {
2386 *exit_status = EXIT_CHDIR;
6732edab 2387 return -ENXIO;
376fecf6 2388 }
6732edab 2389
2b3c1b9e 2390 wd = home;
6732edab
LP
2391
2392 } else if (context->working_directory)
2b3c1b9e
DH
2393 wd = context->working_directory;
2394 else
2395 wd = "/";
e7f1e7c6
DH
2396
2397 if (params->flags & EXEC_APPLY_CHROOT) {
2398 if (!needs_mount_ns && context->root_directory)
376fecf6
LP
2399 if (chroot(context->root_directory) < 0) {
2400 *exit_status = EXIT_CHROOT;
e7f1e7c6 2401 return -errno;
376fecf6 2402 }
e7f1e7c6 2403
2b3c1b9e
DH
2404 d = wd;
2405 } else
3b0e5bb5 2406 d = prefix_roota(context->root_directory, wd);
e7f1e7c6 2407
376fecf6
LP
2408 if (chdir(d) < 0 && !context->working_directory_missing_ok) {
2409 *exit_status = EXIT_CHDIR;
2b3c1b9e 2410 return -errno;
376fecf6 2411 }
e7f1e7c6
DH
2412
2413 return 0;
2414}
2415
b1edf445 2416static int setup_keyring(
34cf6c43 2417 const Unit *u,
b1edf445
LP
2418 const ExecContext *context,
2419 const ExecParameters *p,
2420 uid_t uid, gid_t gid) {
2421
74dd6b51 2422 key_serial_t keyring;
e64c2d0b
DJL
2423 int r = 0;
2424 uid_t saved_uid;
2425 gid_t saved_gid;
74dd6b51
LP
2426
2427 assert(u);
b1edf445 2428 assert(context);
74dd6b51
LP
2429 assert(p);
2430
2431 /* Let's set up a new per-service "session" kernel keyring for each system service. This has the benefit that
2432 * each service runs with its own keyring shared among all processes of the service, but with no hook-up beyond
2433 * that scope, and in particular no link to the per-UID keyring. If we don't do this the keyring will be
2434 * automatically created on-demand and then linked to the per-UID keyring, by the kernel. The kernel's built-in
2435 * on-demand behaviour is very appropriate for login users, but probably not so much for system services, where
2436 * UIDs are not necessarily specific to a service but reused (at least in the case of UID 0). */
2437
2438 if (!(p->flags & EXEC_NEW_KEYRING))
2439 return 0;
2440
b1edf445
LP
2441 if (context->keyring_mode == EXEC_KEYRING_INHERIT)
2442 return 0;
2443
e64c2d0b
DJL
2444 /* Acquiring a reference to the user keyring is nasty. We briefly change identity in order to get things set up
2445 * properly by the kernel. If we don't do that then we can't create it atomically, and that sucks for parallel
2446 * execution. This mimics what pam_keyinit does, too. Setting up session keyring, to be owned by the right user
2447 * & group is just as nasty as acquiring a reference to the user keyring. */
2448
2449 saved_uid = getuid();
2450 saved_gid = getgid();
2451
2452 if (gid_is_valid(gid) && gid != saved_gid) {
2453 if (setregid(gid, -1) < 0)
2454 return log_unit_error_errno(u, errno, "Failed to change GID for user keyring: %m");
2455 }
2456
2457 if (uid_is_valid(uid) && uid != saved_uid) {
2458 if (setreuid(uid, -1) < 0) {
2459 r = log_unit_error_errno(u, errno, "Failed to change UID for user keyring: %m");
2460 goto out;
2461 }
2462 }
2463
74dd6b51
LP
2464 keyring = keyctl(KEYCTL_JOIN_SESSION_KEYRING, 0, 0, 0, 0);
2465 if (keyring == -1) {
2466 if (errno == ENOSYS)
8002fb97 2467 log_unit_debug_errno(u, errno, "Kernel keyring not supported, ignoring.");
74dd6b51 2468 else if (IN_SET(errno, EACCES, EPERM))
8002fb97 2469 log_unit_debug_errno(u, errno, "Kernel keyring access prohibited, ignoring.");
74dd6b51 2470 else if (errno == EDQUOT)
8002fb97 2471 log_unit_debug_errno(u, errno, "Out of kernel keyrings to allocate, ignoring.");
74dd6b51 2472 else
e64c2d0b 2473 r = log_unit_error_errno(u, errno, "Setting up kernel keyring failed: %m");
74dd6b51 2474
e64c2d0b 2475 goto out;
74dd6b51
LP
2476 }
2477
e64c2d0b
DJL
2478 /* When requested link the user keyring into the session keyring. */
2479 if (context->keyring_mode == EXEC_KEYRING_SHARED) {
2480
2481 if (keyctl(KEYCTL_LINK,
2482 KEY_SPEC_USER_KEYRING,
2483 KEY_SPEC_SESSION_KEYRING, 0, 0) < 0) {
2484 r = log_unit_error_errno(u, errno, "Failed to link user keyring into session keyring: %m");
2485 goto out;
2486 }
2487 }
2488
2489 /* Restore uid/gid back */
2490 if (uid_is_valid(uid) && uid != saved_uid) {
2491 if (setreuid(saved_uid, -1) < 0) {
2492 r = log_unit_error_errno(u, errno, "Failed to change UID back for user keyring: %m");
2493 goto out;
2494 }
2495 }
2496
2497 if (gid_is_valid(gid) && gid != saved_gid) {
2498 if (setregid(saved_gid, -1) < 0)
2499 return log_unit_error_errno(u, errno, "Failed to change GID back for user keyring: %m");
2500 }
2501
2502 /* Populate they keyring with the invocation ID by default, as original saved_uid. */
b3415f5d
LP
2503 if (!sd_id128_is_null(u->invocation_id)) {
2504 key_serial_t key;
2505
2506 key = add_key("user", "invocation_id", &u->invocation_id, sizeof(u->invocation_id), KEY_SPEC_SESSION_KEYRING);
2507 if (key == -1)
8002fb97 2508 log_unit_debug_errno(u, errno, "Failed to add invocation ID to keyring, ignoring: %m");
b3415f5d
LP
2509 else {
2510 if (keyctl(KEYCTL_SETPERM, key,
2511 KEY_POS_VIEW|KEY_POS_READ|KEY_POS_SEARCH|
2512 KEY_USR_VIEW|KEY_USR_READ|KEY_USR_SEARCH, 0, 0) < 0)
e64c2d0b 2513 r = log_unit_error_errno(u, errno, "Failed to restrict invocation ID permission: %m");
b3415f5d
LP
2514 }
2515 }
2516
e64c2d0b
DJL
2517out:
2518 /* Revert back uid & gid for the the last time, and exit */
2519 /* no extra logging, as only the first already reported error matters */
2520 if (getuid() != saved_uid)
2521 (void) setreuid(saved_uid, -1);
b1edf445 2522
e64c2d0b
DJL
2523 if (getgid() != saved_gid)
2524 (void) setregid(saved_gid, -1);
b1edf445 2525
e64c2d0b 2526 return r;
74dd6b51
LP
2527}
2528
/* Appends the valid (i.e. non-negative) file descriptors of a socket pair to an fd array, advancing
 * the element counter *n for each one added. A NULL pair is ignored. The caller must make sure the
 * array has room for up to two more entries. */
static void append_socket_pair(int *array, unsigned *n, const int pair[2]) {
        unsigned side;

        assert(array);
        assert(n);

        if (!pair)
                return;

        for (side = 0; side < 2; side++) {
                if (pair[side] < 0)
                        continue;

                array[*n] = pair[side];
                (*n)++;
        }
}
2541
a34ceba6
LP
2542static int close_remaining_fds(
2543 const ExecParameters *params,
34cf6c43
YW
2544 const ExecRuntime *runtime,
2545 const DynamicCreds *dcreds,
00d9ef85 2546 int user_lookup_fd,
a34ceba6
LP
2547 int socket_fd,
2548 int *fds, unsigned n_fds) {
2549
2550 unsigned n_dont_close = 0;
00d9ef85 2551 int dont_close[n_fds + 12];
a34ceba6
LP
2552
2553 assert(params);
2554
2555 if (params->stdin_fd >= 0)
2556 dont_close[n_dont_close++] = params->stdin_fd;
2557 if (params->stdout_fd >= 0)
2558 dont_close[n_dont_close++] = params->stdout_fd;
2559 if (params->stderr_fd >= 0)
2560 dont_close[n_dont_close++] = params->stderr_fd;
2561
2562 if (socket_fd >= 0)
2563 dont_close[n_dont_close++] = socket_fd;
2564 if (n_fds > 0) {
2565 memcpy(dont_close + n_dont_close, fds, sizeof(int) * n_fds);
2566 n_dont_close += n_fds;
2567 }
2568
29206d46
LP
2569 if (runtime)
2570 append_socket_pair(dont_close, &n_dont_close, runtime->netns_storage_socket);
2571
2572 if (dcreds) {
2573 if (dcreds->user)
2574 append_socket_pair(dont_close, &n_dont_close, dcreds->user->storage_socket);
2575 if (dcreds->group)
2576 append_socket_pair(dont_close, &n_dont_close, dcreds->group->storage_socket);
a34ceba6
LP
2577 }
2578
00d9ef85
LP
2579 if (user_lookup_fd >= 0)
2580 dont_close[n_dont_close++] = user_lookup_fd;
2581
a34ceba6
LP
2582 return close_all_fds(dont_close, n_dont_close);
2583}
2584
00d9ef85
LP
2585static int send_user_lookup(
2586 Unit *unit,
2587 int user_lookup_fd,
2588 uid_t uid,
2589 gid_t gid) {
2590
2591 assert(unit);
2592
2593 /* Send the resolved UID/GID to PID 1 after we learnt it. We send a single datagram, containing the UID/GID
2594 * data as well as the unit name. Note that we suppress sending this if no user/group to resolve was
2595 * specified. */
2596
2597 if (user_lookup_fd < 0)
2598 return 0;
2599
2600 if (!uid_is_valid(uid) && !gid_is_valid(gid))
2601 return 0;
2602
2603 if (writev(user_lookup_fd,
2604 (struct iovec[]) {
e6a7ec4b
LP
2605 IOVEC_INIT(&uid, sizeof(uid)),
2606 IOVEC_INIT(&gid, sizeof(gid)),
2607 IOVEC_INIT_STRING(unit->id) }, 3) < 0)
00d9ef85
LP
2608 return -errno;
2609
2610 return 0;
2611}
2612
6732edab
LP
2613static int acquire_home(const ExecContext *c, uid_t uid, const char** home, char **buf) {
2614 int r;
2615
2616 assert(c);
2617 assert(home);
2618 assert(buf);
2619
2620 /* If WorkingDirectory=~ is set, try to acquire a usable home directory. */
2621
2622 if (*home)
2623 return 0;
2624
2625 if (!c->working_directory_home)
2626 return 0;
2627
2628 if (uid == 0) {
2629 /* Hardcode /root as home directory for UID 0 */
2630 *home = "/root";
2631 return 1;
2632 }
2633
2634 r = get_home_dir(buf);
2635 if (r < 0)
2636 return r;
2637
2638 *home = *buf;
2639 return 1;
2640}
2641
da50b85a
LP
2642static int compile_suggested_paths(const ExecContext *c, const ExecParameters *p, char ***ret) {
2643 _cleanup_strv_free_ char ** list = NULL;
2644 ExecDirectoryType t;
2645 int r;
2646
2647 assert(c);
2648 assert(p);
2649 assert(ret);
2650
2651 assert(c->dynamic_user);
2652
2653 /* Compile a list of paths that it might make sense to read the owning UID from to use as initial candidate for
2654 * dynamic UID allocation, in order to save us from doing costly recursive chown()s of the special
2655 * directories. */
2656
2657 for (t = 0; t < _EXEC_DIRECTORY_TYPE_MAX; t++) {
2658 char **i;
2659
2660 if (t == EXEC_DIRECTORY_CONFIGURATION)
2661 continue;
2662
2663 if (!p->prefix[t])
2664 continue;
2665
2666 STRV_FOREACH(i, c->directories[t].paths) {
2667 char *e;
2668
8092a48c
YW
2669 if (t == EXEC_DIRECTORY_RUNTIME)
2670 e = strjoin(p->prefix[t], "/", *i);
2671 else
2672 e = strjoin(p->prefix[t], "/private/", *i);
da50b85a
LP
2673 if (!e)
2674 return -ENOMEM;
2675
2676 r = strv_consume(&list, e);
2677 if (r < 0)
2678 return r;
2679 }
2680 }
2681
ae2a15bc 2682 *ret = TAKE_PTR(list);
da50b85a
LP
2683
2684 return 0;
2685}
2686
34cf6c43
YW
2687static char *exec_command_line(char **argv);
2688
ff0af2a1 2689static int exec_child(
f2341e0a 2690 Unit *unit,
34cf6c43 2691 const ExecCommand *command,
ff0af2a1
LP
2692 const ExecContext *context,
2693 const ExecParameters *params,
2694 ExecRuntime *runtime,
29206d46 2695 DynamicCreds *dcreds,
ff0af2a1
LP
2696 char **argv,
2697 int socket_fd,
52c239d7 2698 int named_iofds[3],
4c47affc
FB
2699 int *fds,
2700 unsigned n_storage_fds,
9b141911 2701 unsigned n_socket_fds,
ff0af2a1 2702 char **files_env,
00d9ef85 2703 int user_lookup_fd,
12145637 2704 int *exit_status) {
d35fbf6b 2705
2065ca69 2706 _cleanup_strv_free_ char **our_env = NULL, **pass_env = NULL, **accum_env = NULL, **final_argv = NULL;
7f59dd35 2707 _cleanup_free_ char *home_buffer = NULL;
4d885bd3
DH
2708 _cleanup_free_ gid_t *supplementary_gids = NULL;
2709 const char *username = NULL, *groupname = NULL;
2b3c1b9e 2710 const char *home = NULL, *shell = NULL;
7bce046b
LP
2711 dev_t journal_stream_dev = 0;
2712 ino_t journal_stream_ino = 0;
165a31c0
LP
2713 bool needs_sandboxing, /* Do we need to set up full sandboxing? (i.e. all namespacing, all MAC stuff, caps, yadda yadda */
2714 needs_setuid, /* Do we need to do the actual setresuid()/setresgid() calls? */
2715 needs_mount_namespace, /* Do we need to set up a mount namespace for this kernel? */
2716 needs_ambient_hack; /* Do we need to apply the ambient capabilities hack? */
349cc4a5 2717#if HAVE_SELINUX
7f59dd35 2718 _cleanup_free_ char *mac_selinux_context_net = NULL;
43b1f709 2719 bool use_selinux = false;
ecfbc84f 2720#endif
f9fa32f0 2721#if ENABLE_SMACK
43b1f709 2722 bool use_smack = false;
ecfbc84f 2723#endif
349cc4a5 2724#if HAVE_APPARMOR
43b1f709 2725 bool use_apparmor = false;
ecfbc84f 2726#endif
fed1e721
LP
2727 uid_t uid = UID_INVALID;
2728 gid_t gid = GID_INVALID;
4d885bd3 2729 int i, r, ngids = 0;
4c47affc 2730 unsigned n_fds;
3536f49e 2731 ExecDirectoryType dt;
165a31c0 2732 int secure_bits;
034c6ed7 2733
f2341e0a 2734 assert(unit);
5cb5a6ff
LP
2735 assert(command);
2736 assert(context);
d35fbf6b 2737 assert(params);
ff0af2a1 2738 assert(exit_status);
d35fbf6b
DM
2739
2740 rename_process_from_path(command->path);
2741
2742 /* We reset exactly these signals, since they are the
2743 * only ones we set to SIG_IGN in the main daemon. All
2744 * others we leave untouched because we set them to
2745 * SIG_DFL or a valid handler initially, both of which
2746 * will be demoted to SIG_DFL. */
ce30c8dc
LP
2747 (void) default_signals(SIGNALS_CRASH_HANDLER,
2748 SIGNALS_IGNORE, -1);
d35fbf6b
DM
2749
2750 if (context->ignore_sigpipe)
ce30c8dc 2751 (void) ignore_signals(SIGPIPE, -1);
d35fbf6b 2752
ff0af2a1
LP
2753 r = reset_signal_mask();
2754 if (r < 0) {
2755 *exit_status = EXIT_SIGNAL_MASK;
12145637 2756 return log_unit_error_errno(unit, r, "Failed to set process signal mask: %m");
d35fbf6b 2757 }
034c6ed7 2758
d35fbf6b
DM
2759 if (params->idle_pipe)
2760 do_idle_pipe_dance(params->idle_pipe);
4f2d528d 2761
2c027c62
LP
2762 /* Close fds we don't need very early to make sure we don't block init reexecution because it cannot bind its
2763 * sockets. Among the fds we close are the logging fds, and we want to keep them closed, so that we don't have
2764 * any fds open we don't really want open during the transition. In order to make logging work, we switch the
2765 * log subsystem into open_when_needed mode, so that it reopens the logs on every single log call. */
ff0af2a1 2766
d35fbf6b 2767 log_forget_fds();
2c027c62 2768 log_set_open_when_needed(true);
4f2d528d 2769
40a80078
LP
2770 /* In case anything used libc syslog(), close this here, too */
2771 closelog();
2772
4c47affc 2773 n_fds = n_storage_fds + n_socket_fds;
00d9ef85 2774 r = close_remaining_fds(params, runtime, dcreds, user_lookup_fd, socket_fd, fds, n_fds);
ff0af2a1
LP
2775 if (r < 0) {
2776 *exit_status = EXIT_FDS;
12145637 2777 return log_unit_error_errno(unit, r, "Failed to close unwanted file descriptors: %m");
8c7be95e
LP
2778 }
2779
d35fbf6b
DM
2780 if (!context->same_pgrp)
2781 if (setsid() < 0) {
ff0af2a1 2782 *exit_status = EXIT_SETSID;
12145637 2783 return log_unit_error_errno(unit, errno, "Failed to create new process session: %m");
d35fbf6b 2784 }
9e2f7c11 2785
1e22b5cd 2786 exec_context_tty_reset(context, params);
d35fbf6b 2787
c891efaf 2788 if (unit_shall_confirm_spawn(unit)) {
7d5ceb64 2789 const char *vc = params->confirm_spawn;
3b20f877
FB
2790 _cleanup_free_ char *cmdline = NULL;
2791
2792 cmdline = exec_command_line(argv);
2793 if (!cmdline) {
0460aa5c 2794 *exit_status = EXIT_MEMORY;
12145637 2795 return log_oom();
3b20f877 2796 }
d35fbf6b 2797
eedf223a 2798 r = ask_for_confirmation(vc, unit, cmdline);
3b20f877
FB
2799 if (r != CONFIRM_EXECUTE) {
2800 if (r == CONFIRM_PRETEND_SUCCESS) {
2801 *exit_status = EXIT_SUCCESS;
2802 return 0;
2803 }
ff0af2a1 2804 *exit_status = EXIT_CONFIRM;
12145637 2805 log_unit_error(unit, "Execution cancelled by the user");
d35fbf6b 2806 return -ECANCELED;
d35fbf6b
DM
2807 }
2808 }
1a63a750 2809
29206d46 2810 if (context->dynamic_user && dcreds) {
da50b85a 2811 _cleanup_strv_free_ char **suggested_paths = NULL;
29206d46 2812
409093fe
LP
2813 /* Make sure we bypass our own NSS module for any NSS checks */
2814 if (putenv((char*) "SYSTEMD_NSS_DYNAMIC_BYPASS=1") != 0) {
2815 *exit_status = EXIT_USER;
12145637 2816 return log_unit_error_errno(unit, errno, "Failed to update environment: %m");
409093fe
LP
2817 }
2818
da50b85a
LP
2819 r = compile_suggested_paths(context, params, &suggested_paths);
2820 if (r < 0) {
2821 *exit_status = EXIT_MEMORY;
2822 return log_oom();
2823 }
2824
2825 r = dynamic_creds_realize(dcreds, suggested_paths, &uid, &gid);
ff0af2a1
LP
2826 if (r < 0) {
2827 *exit_status = EXIT_USER;
e2b0cc34
YW
2828 if (r == -EILSEQ) {
2829 log_unit_error(unit, "Failed to update dynamic user credentials: User or group with specified name already exists.");
2830 return -EOPNOTSUPP;
2831 }
12145637 2832 return log_unit_error_errno(unit, r, "Failed to update dynamic user credentials: %m");
524daa8c 2833 }
524daa8c 2834
70dd455c 2835 if (!uid_is_valid(uid)) {
29206d46 2836 *exit_status = EXIT_USER;
12145637 2837 log_unit_error(unit, "UID validation failed for \""UID_FMT"\"", uid);
70dd455c
ZJS
2838 return -ESRCH;
2839 }
2840
2841 if (!gid_is_valid(gid)) {
2842 *exit_status = EXIT_USER;
12145637 2843 log_unit_error(unit, "GID validation failed for \""GID_FMT"\"", gid);
29206d46
LP
2844 return -ESRCH;
2845 }
5bc7452b 2846
29206d46
LP
2847 if (dcreds->user)
2848 username = dcreds->user->name;
2849
2850 } else {
4d885bd3
DH
2851 r = get_fixed_user(context, &username, &uid, &gid, &home, &shell);
2852 if (r < 0) {
2853 *exit_status = EXIT_USER;
12145637 2854 return log_unit_error_errno(unit, r, "Failed to determine user credentials: %m");
5bc7452b 2855 }
5bc7452b 2856
4d885bd3
DH
2857 r = get_fixed_group(context, &groupname, &gid);
2858 if (r < 0) {
2859 *exit_status = EXIT_GROUP;
12145637 2860 return log_unit_error_errno(unit, r, "Failed to determine group credentials: %m");
4d885bd3 2861 }
cdc5d5c5 2862 }
29206d46 2863
cdc5d5c5
DH
2864 /* Initialize user supplementary groups and get SupplementaryGroups= ones */
2865 r = get_supplementary_groups(context, username, groupname, gid,
2866 &supplementary_gids, &ngids);
2867 if (r < 0) {
2868 *exit_status = EXIT_GROUP;
12145637 2869 return log_unit_error_errno(unit, r, "Failed to determine supplementary groups: %m");
29206d46 2870 }
5bc7452b 2871
00d9ef85
LP
2872 r = send_user_lookup(unit, user_lookup_fd, uid, gid);
2873 if (r < 0) {
2874 *exit_status = EXIT_USER;
12145637 2875 return log_unit_error_errno(unit, r, "Failed to send user credentials to PID1: %m");
00d9ef85
LP
2876 }
2877
2878 user_lookup_fd = safe_close(user_lookup_fd);
2879
6732edab
LP
2880 r = acquire_home(context, uid, &home, &home_buffer);
2881 if (r < 0) {
2882 *exit_status = EXIT_CHDIR;
12145637 2883 return log_unit_error_errno(unit, r, "Failed to determine $HOME for user: %m");
6732edab
LP
2884 }
2885
d35fbf6b
DM
2886 /* If a socket is connected to STDIN/STDOUT/STDERR, we
2887 * must be sure to drop O_NONBLOCK */
2888 if (socket_fd >= 0)
a34ceba6 2889 (void) fd_nonblock(socket_fd, false);
acbb0225 2890
52c239d7 2891 r = setup_input(context, params, socket_fd, named_iofds);
ff0af2a1
LP
2892 if (r < 0) {
2893 *exit_status = EXIT_STDIN;
12145637 2894 return log_unit_error_errno(unit, r, "Failed to set up standard input: %m");
d35fbf6b 2895 }
034c6ed7 2896
52c239d7 2897 r = setup_output(unit, context, params, STDOUT_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
2898 if (r < 0) {
2899 *exit_status = EXIT_STDOUT;
12145637 2900 return log_unit_error_errno(unit, r, "Failed to set up standard output: %m");
d35fbf6b
DM
2901 }
2902
52c239d7 2903 r = setup_output(unit, context, params, STDERR_FILENO, socket_fd, named_iofds, basename(command->path), uid, gid, &journal_stream_dev, &journal_stream_ino);
ff0af2a1
LP
2904 if (r < 0) {
2905 *exit_status = EXIT_STDERR;
12145637 2906 return log_unit_error_errno(unit, r, "Failed to set up standard error output: %m");
d35fbf6b
DM
2907 }
2908
2909 if (params->cgroup_path) {
ff0af2a1
LP
2910 r = cg_attach_everywhere(params->cgroup_supported, params->cgroup_path, 0, NULL, NULL);
2911 if (r < 0) {
2912 *exit_status = EXIT_CGROUP;
12145637 2913 return log_unit_error_errno(unit, r, "Failed to attach to cgroup %s: %m", params->cgroup_path);
309bff19 2914 }
d35fbf6b 2915 }
309bff19 2916
d35fbf6b 2917 if (context->oom_score_adjust_set) {
d5243d62 2918 char t[DECIMAL_STR_MAX(context->oom_score_adjust)];
f2b68789 2919
d5243d62
LP
2920 /* When we can't make this change due to EPERM, then
2921 * let's silently skip over it. User namespaces
2922 * prohibit write access to this file, and we
2923 * shouldn't trip up over that. */
613b411c 2924
d5243d62 2925 sprintf(t, "%i", context->oom_score_adjust);
ad118bda 2926 r = write_string_file("/proc/self/oom_score_adj", t, 0);
12145637 2927 if (IN_SET(r, -EPERM, -EACCES))
f2341e0a 2928 log_unit_debug_errno(unit, r, "Failed to adjust OOM setting, assuming containerized execution, ignoring: %m");
12145637 2929 else if (r < 0) {
ff0af2a1 2930 *exit_status = EXIT_OOM_ADJUST;
12145637 2931 return log_unit_error_errno(unit, r, "Failed to adjust OOM setting: %m");
613b411c 2932 }
d35fbf6b
DM
2933 }
2934
2935 if (context->nice_set)
2936 if (setpriority(PRIO_PROCESS, 0, context->nice) < 0) {
ff0af2a1 2937 *exit_status = EXIT_NICE;
12145637 2938 return log_unit_error_errno(unit, errno, "Failed to set up process scheduling priority (nice level): %m");
613b411c
LP
2939 }
2940
d35fbf6b
DM
2941 if (context->cpu_sched_set) {
2942 struct sched_param param = {
2943 .sched_priority = context->cpu_sched_priority,
2944 };
2945
ff0af2a1
LP
2946 r = sched_setscheduler(0,
2947 context->cpu_sched_policy |
2948 (context->cpu_sched_reset_on_fork ?
2949 SCHED_RESET_ON_FORK : 0),
2950 &param);
2951 if (r < 0) {
2952 *exit_status = EXIT_SETSCHEDULER;
12145637 2953 return log_unit_error_errno(unit, errno, "Failed to set up CPU scheduling: %m");
fc9b2a84 2954 }
d35fbf6b 2955 }
fc9b2a84 2956
d35fbf6b
DM
2957 if (context->cpuset)
2958 if (sched_setaffinity(0, CPU_ALLOC_SIZE(context->cpuset_ncpus), context->cpuset) < 0) {
ff0af2a1 2959 *exit_status = EXIT_CPUAFFINITY;
12145637 2960 return log_unit_error_errno(unit, errno, "Failed to set up CPU affinity: %m");
034c6ed7
LP
2961 }
2962
d35fbf6b
DM
2963 if (context->ioprio_set)
2964 if (ioprio_set(IOPRIO_WHO_PROCESS, 0, context->ioprio) < 0) {
ff0af2a1 2965 *exit_status = EXIT_IOPRIO;
12145637 2966 return log_unit_error_errno(unit, errno, "Failed to set up IO scheduling priority: %m");
d35fbf6b 2967 }
da726a4d 2968
d35fbf6b
DM
2969 if (context->timer_slack_nsec != NSEC_INFINITY)
2970 if (prctl(PR_SET_TIMERSLACK, context->timer_slack_nsec) < 0) {
ff0af2a1 2971 *exit_status = EXIT_TIMERSLACK;
12145637 2972 return log_unit_error_errno(unit, errno, "Failed to set up timer slack: %m");
4c2630eb 2973 }
9eba9da4 2974
21022b9d
LP
2975 if (context->personality != PERSONALITY_INVALID) {
2976 r = safe_personality(context->personality);
2977 if (r < 0) {
ff0af2a1 2978 *exit_status = EXIT_PERSONALITY;
12145637 2979 return log_unit_error_errno(unit, r, "Failed to set up execution domain (personality): %m");
4c2630eb 2980 }
21022b9d 2981 }
94f04347 2982
d35fbf6b 2983 if (context->utmp_id)
df0ff127 2984 utmp_put_init_process(context->utmp_id, getpid_cached(), getsid(0),
6a93917d 2985 context->tty_path,
023a4f67
LP
2986 context->utmp_mode == EXEC_UTMP_INIT ? INIT_PROCESS :
2987 context->utmp_mode == EXEC_UTMP_LOGIN ? LOGIN_PROCESS :
2988 USER_PROCESS,
6a93917d 2989 username);
d35fbf6b 2990
e0d2adfd 2991 if (context->user) {
ff0af2a1
LP
2992 r = chown_terminal(STDIN_FILENO, uid);
2993 if (r < 0) {
2994 *exit_status = EXIT_STDIN;
12145637 2995 return log_unit_error_errno(unit, r, "Failed to change ownership of terminal: %m");
071830ff 2996 }
d35fbf6b 2997 }
8e274523 2998
62b9bb26
LP
2999 /* If delegation is enabled we'll pass ownership of the cgroup to the user of the new process. On cgroupsv1
3000 * this is only about systemd's own hierarchy, i.e. not the controller hierarchies, simply because that's not
3001 * safe. On cgroupsv2 there's only one hierarchy anyway, and delegation is safe there, hence in that case only
3002 * touch a single hierarchy too. */
584b8688 3003 if (params->cgroup_path && context->user && (params->flags & EXEC_CGROUP_DELEGATE)) {
62b9bb26 3004 r = cg_set_access(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, uid, gid);
ff0af2a1
LP
3005 if (r < 0) {
3006 *exit_status = EXIT_CGROUP;
12145637 3007 return log_unit_error_errno(unit, r, "Failed to adjust control group access: %m");
034c6ed7 3008 }
d35fbf6b 3009 }
034c6ed7 3010
72fd1768 3011 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
8679efde 3012 r = setup_exec_directory(context, params, uid, gid, dt, exit_status);
12145637
LP
3013 if (r < 0)
3014 return log_unit_error_errno(unit, r, "Failed to set up special execution directory in %s: %m", params->prefix[dt]);
d35fbf6b 3015 }
94f04347 3016
7bce046b 3017 r = build_environment(
fd63e712 3018 unit,
7bce046b
LP
3019 context,
3020 params,
3021 n_fds,
3022 home,
3023 username,
3024 shell,
3025 journal_stream_dev,
3026 journal_stream_ino,
3027 &our_env);
2065ca69
JW
3028 if (r < 0) {
3029 *exit_status = EXIT_MEMORY;
12145637 3030 return log_oom();
2065ca69
JW
3031 }
3032
3033 r = build_pass_environment(context, &pass_env);
3034 if (r < 0) {
3035 *exit_status = EXIT_MEMORY;
12145637 3036 return log_oom();
2065ca69
JW
3037 }
3038
3039 accum_env = strv_env_merge(5,
3040 params->environment,
3041 our_env,
3042 pass_env,
3043 context->environment,
3044 files_env,
3045 NULL);
3046 if (!accum_env) {
3047 *exit_status = EXIT_MEMORY;
12145637 3048 return log_oom();
2065ca69 3049 }
1280503b 3050 accum_env = strv_env_clean(accum_env);
2065ca69 3051
096424d1 3052 (void) umask(context->umask);
b213e1c1 3053
b1edf445 3054 r = setup_keyring(unit, context, params, uid, gid);
74dd6b51
LP
3055 if (r < 0) {
3056 *exit_status = EXIT_KEYRING;
12145637 3057 return log_unit_error_errno(unit, r, "Failed to set up kernel keyring: %m");
74dd6b51
LP
3058 }
3059
165a31c0 3060 /* We need sandboxing if the caller asked us to apply it and the command isn't explicitly excepted from it */
1703fa41 3061 needs_sandboxing = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & EXEC_COMMAND_FULLY_PRIVILEGED);
7f18ef0a 3062
165a31c0
LP
3063 /* We need the ambient capability hack, if the caller asked us to apply it and the command is marked for it, and the kernel doesn't actually support ambient caps */
3064 needs_ambient_hack = (params->flags & EXEC_APPLY_SANDBOXING) && (command->flags & EXEC_COMMAND_AMBIENT_MAGIC) && !ambient_capabilities_supported();
7f18ef0a 3065
165a31c0
LP
3066 /* We need setresuid() if the caller asked us to apply sandboxing and the command isn't explicitly excepted from either whole sandboxing or just setresuid() itself, and the ambient hack is not desired */
3067 if (needs_ambient_hack)
3068 needs_setuid = false;
3069 else
3070 needs_setuid = (params->flags & EXEC_APPLY_SANDBOXING) && !(command->flags & (EXEC_COMMAND_FULLY_PRIVILEGED|EXEC_COMMAND_NO_SETUID));
3071
3072 if (needs_sandboxing) {
7f18ef0a
FK
3073 /* MAC enablement checks need to be done before a new mount ns is created, as they rely on /sys being
3074 * present. The actual MAC context application will happen later, as late as possible, to avoid
3075 * impacting our own code paths. */
3076
349cc4a5 3077#if HAVE_SELINUX
43b1f709 3078 use_selinux = mac_selinux_use();
7f18ef0a 3079#endif
f9fa32f0 3080#if ENABLE_SMACK
43b1f709 3081 use_smack = mac_smack_use();
7f18ef0a 3082#endif
349cc4a5 3083#if HAVE_APPARMOR
43b1f709 3084 use_apparmor = mac_apparmor_use();
7f18ef0a 3085#endif
165a31c0 3086 }
7f18ef0a 3087
165a31c0
LP
3088 if (needs_setuid) {
3089 if (context->pam_name && username) {
3090 r = setup_pam(context->pam_name, username, uid, gid, context->tty_path, &accum_env, fds, n_fds);
3091 if (r < 0) {
3092 *exit_status = EXIT_PAM;
12145637 3093 return log_unit_error_errno(unit, r, "Failed to set up PAM session: %m");
165a31c0
LP
3094 }
3095 }
b213e1c1 3096 }
ac45f971 3097
d35fbf6b 3098 if (context->private_network && runtime && runtime->netns_storage_socket[0] >= 0) {
6e2d7c4f
MS
3099 if (ns_type_supported(NAMESPACE_NET)) {
3100 r = setup_netns(runtime->netns_storage_socket);
3101 if (r < 0) {
3102 *exit_status = EXIT_NETWORK;
3103 return log_unit_error_errno(unit, r, "Failed to set up network namespacing: %m");
3104 }
3105 } else
3106 log_unit_warning(unit, "PrivateNetwork=yes is configured, but the kernel does not support network namespaces, ignoring.");
d35fbf6b 3107 }
169c1bda 3108
ee818b89 3109 needs_mount_namespace = exec_needs_mount_namespace(context, params, runtime);
ee818b89 3110 if (needs_mount_namespace) {
6818c54c 3111 r = apply_mount_namespace(unit, command, context, params, runtime);
3fbe8dbe
LP
3112 if (r < 0) {
3113 *exit_status = EXIT_NAMESPACE;
12145637 3114 return log_unit_error_errno(unit, r, "Failed to set up mount namespacing: %m");
3fbe8dbe 3115 }
d35fbf6b 3116 }
81a2b7ce 3117
50b3dfb9 3118 /* Apply just after mount namespace setup */
376fecf6 3119 r = apply_working_directory(context, params, home, needs_mount_namespace, exit_status);
12145637
LP
3120 if (r < 0)
3121 return log_unit_error_errno(unit, r, "Changing to the requested working directory failed: %m");
50b3dfb9 3122
bbeea271 3123 /* Drop groups as early as possible */
165a31c0 3124 if (needs_setuid) {
709dbeac 3125 r = enforce_groups(gid, supplementary_gids, ngids);
096424d1
LP
3126 if (r < 0) {
3127 *exit_status = EXIT_GROUP;
12145637 3128 return log_unit_error_errno(unit, r, "Changing group credentials failed: %m");
096424d1 3129 }
165a31c0 3130 }
096424d1 3131
165a31c0 3132 if (needs_sandboxing) {
349cc4a5 3133#if HAVE_SELINUX
43b1f709 3134 if (use_selinux && params->selinux_context_net && socket_fd >= 0) {
937ccce9
LP
3135 r = mac_selinux_get_child_mls_label(socket_fd, command->path, context->selinux_context, &mac_selinux_context_net);
3136 if (r < 0) {
3137 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 3138 return log_unit_error_errno(unit, r, "Failed to determine SELinux context: %m");
937ccce9 3139 }
9008e1ac 3140 }
9008e1ac
MS
3141#endif
3142
937ccce9
LP
3143 if (context->private_users) {
3144 r = setup_private_users(uid, gid);
3145 if (r < 0) {
3146 *exit_status = EXIT_USER;
12145637 3147 return log_unit_error_errno(unit, r, "Failed to set up user namespacing: %m");
937ccce9 3148 }
d251207d
LP
3149 }
3150 }
3151
165a31c0
LP
3152 /* We repeat the fd closing here, to make sure that nothing is leaked from the PAM modules. Note that we are
3153 * more aggressive this time since socket_fd and the netns fds we don't need anymore. The custom endpoint fd
3154 * was needed to upload the policy and can now be closed as well. */
ff0af2a1
LP
3155 r = close_all_fds(fds, n_fds);
3156 if (r >= 0)
3157 r = shift_fds(fds, n_fds);
3158 if (r >= 0)
4c47affc 3159 r = flags_fds(fds, n_storage_fds, n_socket_fds, context->non_blocking);
ff0af2a1
LP
3160 if (r < 0) {
3161 *exit_status = EXIT_FDS;
12145637 3162 return log_unit_error_errno(unit, r, "Failed to adjust passed file descriptors: %m");
d35fbf6b 3163 }
e66cf1a3 3164
165a31c0 3165 secure_bits = context->secure_bits;
e66cf1a3 3166
165a31c0
LP
3167 if (needs_sandboxing) {
3168 uint64_t bset;
755d4b67 3169
d35fbf6b 3170 for (i = 0; i < _RLIMIT_MAX; i++) {
03857c43 3171
d35fbf6b
DM
3172 if (!context->rlimit[i])
3173 continue;
3174
03857c43
LP
3175 r = setrlimit_closest(i, context->rlimit[i]);
3176 if (r < 0) {
ff0af2a1 3177 *exit_status = EXIT_LIMITS;
12145637 3178 return log_unit_error_errno(unit, r, "Failed to adjust resource limit %s: %m", rlimit_to_string(i));
e66cf1a3
LP
3179 }
3180 }
3181
f4170c67
LP
3182 /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
3183 if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
3184 if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
3185 *exit_status = EXIT_LIMITS;
12145637 3186 return log_unit_error_errno(unit, errno, "Failed to adjust RLIMIT_RTPRIO resource limit: %m");
f4170c67
LP
3187 }
3188 }
3189
37ac2744
JB
3190#if ENABLE_SMACK
3191 /* LSM Smack needs the capability CAP_MAC_ADMIN to change the current execution security context of the
3192 * process. This is the latest place before dropping capabilities. Other MAC context are set later. */
3193 if (use_smack) {
3194 r = setup_smack(context, command);
3195 if (r < 0) {
3196 *exit_status = EXIT_SMACK_PROCESS_LABEL;
3197 return log_unit_error_errno(unit, r, "Failed to set SMACK process label: %m");
3198 }
3199 }
3200#endif
3201
165a31c0
LP
3202 bset = context->capability_bounding_set;
3203 /* If the ambient caps hack is enabled (which means the kernel can't do them, and the user asked for
3204 * our magic fallback), then let's add some extra caps, so that the service can drop privs of its own,
3205 * instead of us doing that */
3206 if (needs_ambient_hack)
3207 bset |= (UINT64_C(1) << CAP_SETPCAP) |
3208 (UINT64_C(1) << CAP_SETUID) |
3209 (UINT64_C(1) << CAP_SETGID);
3210
3211 if (!cap_test_all(bset)) {
3212 r = capability_bounding_set_drop(bset, false);
ff0af2a1
LP
3213 if (r < 0) {
3214 *exit_status = EXIT_CAPABILITIES;
12145637 3215 return log_unit_error_errno(unit, r, "Failed to drop capabilities: %m");
3b8bddde 3216 }
4c2630eb 3217 }
3b8bddde 3218
755d4b67
IP
3219 /* This is done before enforce_user, but ambient set
3220 * does not survive over setresuid() if keep_caps is not set. */
165a31c0
LP
3221 if (!needs_ambient_hack &&
3222 context->capability_ambient_set != 0) {
755d4b67
IP
3223 r = capability_ambient_set_apply(context->capability_ambient_set, true);
3224 if (r < 0) {
3225 *exit_status = EXIT_CAPABILITIES;
12145637 3226 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (before UID change): %m");
755d4b67 3227 }
755d4b67 3228 }
165a31c0 3229 }
755d4b67 3230
165a31c0 3231 if (needs_setuid) {
d35fbf6b 3232 if (context->user) {
ff0af2a1
LP
3233 r = enforce_user(context, uid);
3234 if (r < 0) {
3235 *exit_status = EXIT_USER;
12145637 3236 return log_unit_error_errno(unit, r, "Failed to change UID to " UID_FMT ": %m", uid);
5b6319dc 3237 }
165a31c0
LP
3238
3239 if (!needs_ambient_hack &&
3240 context->capability_ambient_set != 0) {
755d4b67
IP
3241
3242 /* Fix the ambient capabilities after user change. */
3243 r = capability_ambient_set_apply(context->capability_ambient_set, false);
3244 if (r < 0) {
3245 *exit_status = EXIT_CAPABILITIES;
12145637 3246 return log_unit_error_errno(unit, r, "Failed to apply ambient capabilities (after UID change): %m");
755d4b67
IP
3247 }
3248
3249 /* If we were asked to change user and ambient capabilities
3250 * were requested, we had to add keep-caps to the securebits
3251 * so that we would maintain the inherited capability set
3252 * through the setresuid(). Make sure that the bit is added
3253 * also to the context secure_bits so that we don't try to
3254 * drop the bit away next. */
3255
7f508f2c 3256 secure_bits |= 1<<SECURE_KEEP_CAPS;
755d4b67 3257 }
5b6319dc 3258 }
165a31c0 3259 }
d35fbf6b 3260
165a31c0 3261 if (needs_sandboxing) {
37ac2744 3262 /* Apply other MAC contexts late, but before seccomp syscall filtering, as those should really be last to
5cd9cd35
LP
3263 * influence our own codepaths as little as possible. Moreover, applying MAC contexts usually requires
3264 * syscalls that are subject to seccomp filtering, hence should probably be applied before the syscalls
3265 * are restricted. */
3266
349cc4a5 3267#if HAVE_SELINUX
43b1f709 3268 if (use_selinux) {
5cd9cd35
LP
3269 char *exec_context = mac_selinux_context_net ?: context->selinux_context;
3270
3271 if (exec_context) {
3272 r = setexeccon(exec_context);
3273 if (r < 0) {
3274 *exit_status = EXIT_SELINUX_CONTEXT;
12145637 3275 return log_unit_error_errno(unit, r, "Failed to change SELinux context to %s: %m", exec_context);
5cd9cd35
LP
3276 }
3277 }
3278 }
3279#endif
3280
349cc4a5 3281#if HAVE_APPARMOR
43b1f709 3282 if (use_apparmor && context->apparmor_profile) {
5cd9cd35
LP
3283 r = aa_change_onexec(context->apparmor_profile);
3284 if (r < 0 && !context->apparmor_profile_ignore) {
3285 *exit_status = EXIT_APPARMOR_PROFILE;
12145637 3286 return log_unit_error_errno(unit, errno, "Failed to prepare AppArmor profile change to %s: %m", context->apparmor_profile);
5cd9cd35
LP
3287 }
3288 }
3289#endif
3290
165a31c0
LP
3291 /* PR_GET_SECUREBITS is not privileged, while PR_SET_SECUREBITS is. So to suppress potential EPERMs
3292 * we'll try not to call PR_SET_SECUREBITS unless necessary. */
755d4b67
IP
3293 if (prctl(PR_GET_SECUREBITS) != secure_bits)
3294 if (prctl(PR_SET_SECUREBITS, secure_bits) < 0) {
ff0af2a1 3295 *exit_status = EXIT_SECUREBITS;
12145637 3296 return log_unit_error_errno(unit, errno, "Failed to set process secure bits: %m");
ff01d048 3297 }
5b6319dc 3298
59eeb84b 3299 if (context_has_no_new_privileges(context))
d35fbf6b 3300 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
ff0af2a1 3301 *exit_status = EXIT_NO_NEW_PRIVILEGES;
12145637 3302 return log_unit_error_errno(unit, errno, "Failed to disable new privileges: %m");
d35fbf6b
DM
3303 }
3304
349cc4a5 3305#if HAVE_SECCOMP
469830d1
LP
3306 r = apply_address_families(unit, context);
3307 if (r < 0) {
3308 *exit_status = EXIT_ADDRESS_FAMILIES;
12145637 3309 return log_unit_error_errno(unit, r, "Failed to restrict address families: %m");
4c2630eb 3310 }
04aa0cb9 3311
469830d1
LP
3312 r = apply_memory_deny_write_execute(unit, context);
3313 if (r < 0) {
3314 *exit_status = EXIT_SECCOMP;
12145637 3315 return log_unit_error_errno(unit, r, "Failed to disable writing to executable memory: %m");
f3e43635 3316 }
f4170c67 3317
469830d1
LP
3318 r = apply_restrict_realtime(unit, context);
3319 if (r < 0) {
3320 *exit_status = EXIT_SECCOMP;
12145637 3321 return log_unit_error_errno(unit, r, "Failed to apply realtime restrictions: %m");
f4170c67
LP
3322 }
3323
add00535
LP
3324 r = apply_restrict_namespaces(unit, context);
3325 if (r < 0) {
3326 *exit_status = EXIT_SECCOMP;
12145637 3327 return log_unit_error_errno(unit, r, "Failed to apply namespace restrictions: %m");
add00535
LP
3328 }
3329
469830d1
LP
3330 r = apply_protect_sysctl(unit, context);
3331 if (r < 0) {
3332 *exit_status = EXIT_SECCOMP;
12145637 3333 return log_unit_error_errno(unit, r, "Failed to apply sysctl restrictions: %m");
502d704e
DH
3334 }
3335
469830d1
LP
3336 r = apply_protect_kernel_modules(unit, context);
3337 if (r < 0) {
3338 *exit_status = EXIT_SECCOMP;
12145637 3339 return log_unit_error_errno(unit, r, "Failed to apply module loading restrictions: %m");
59eeb84b
LP
3340 }
3341
469830d1
LP
3342 r = apply_private_devices(unit, context);
3343 if (r < 0) {
3344 *exit_status = EXIT_SECCOMP;
12145637 3345 return log_unit_error_errno(unit, r, "Failed to set up private devices: %m");
469830d1
LP
3346 }
3347
3348 r = apply_syscall_archs(unit, context);
3349 if (r < 0) {
3350 *exit_status = EXIT_SECCOMP;
12145637 3351 return log_unit_error_errno(unit, r, "Failed to apply syscall architecture restrictions: %m");
ba128bb8
LP
3352 }
3353
78e864e5
TM
3354 r = apply_lock_personality(unit, context);
3355 if (r < 0) {
3356 *exit_status = EXIT_SECCOMP;
12145637 3357 return log_unit_error_errno(unit, r, "Failed to lock personalities: %m");
78e864e5
TM
3358 }
3359
5cd9cd35
LP
3360 /* This really should remain the last step before the execve(), to make sure our own code is unaffected
3361 * by the filter as little as possible. */
165a31c0 3362 r = apply_syscall_filter(unit, context, needs_ambient_hack);
469830d1
LP
3363 if (r < 0) {
3364 *exit_status = EXIT_SECCOMP;
12145637 3365 return log_unit_error_errno(unit, r, "Failed to apply system call filters: %m");
d35fbf6b
DM
3366 }
3367#endif
d35fbf6b 3368 }
034c6ed7 3369
00819cc1
LP
3370 if (!strv_isempty(context->unset_environment)) {
3371 char **ee = NULL;
3372
3373 ee = strv_env_delete(accum_env, 1, context->unset_environment);
3374 if (!ee) {
3375 *exit_status = EXIT_MEMORY;
12145637 3376 return log_oom();
00819cc1
LP
3377 }
3378
3379 strv_free(accum_env);
3380 accum_env = ee;
3381 }
3382
2065ca69 3383 final_argv = replace_env_argv(argv, accum_env);
d35fbf6b 3384 if (!final_argv) {
ff0af2a1 3385 *exit_status = EXIT_MEMORY;
12145637 3386 return log_oom();
d35fbf6b 3387 }
034c6ed7 3388
f1d34068 3389 if (DEBUG_LOGGING) {
d35fbf6b 3390 _cleanup_free_ char *line;
81a2b7ce 3391
d35fbf6b
DM
3392 line = exec_command_line(final_argv);
3393 if (line) {
f2341e0a 3394 log_struct(LOG_DEBUG,
f2341e0a
LP
3395 "EXECUTABLE=%s", command->path,
3396 LOG_UNIT_MESSAGE(unit, "Executing: %s", line),
ba360bb0 3397 LOG_UNIT_ID(unit),
f1c50bec 3398 LOG_UNIT_INVOCATION_ID(unit),
f2341e0a 3399 NULL);
d35fbf6b
DM
3400 }
3401 }
dd305ec9 3402
2065ca69 3403 execve(command->path, final_argv, accum_env);
12145637
LP
3404
3405 if (errno == ENOENT && (command->flags & EXEC_COMMAND_IGNORE_FAILURE)) {
3406
3407 log_struct_errno(LOG_INFO, errno,
3408 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3409 LOG_UNIT_ID(unit),
3410 LOG_UNIT_INVOCATION_ID(unit),
3411 LOG_UNIT_MESSAGE(unit, "Executable %s missing, skipping: %m",
3412 command->path),
3413 "EXECUTABLE=%s", command->path,
3414 NULL);
3415
3416 return 0;
3417 }
3418
ff0af2a1 3419 *exit_status = EXIT_EXEC;
12145637 3420 return log_unit_error_errno(unit, errno, "Failed to execute command: %m");
d35fbf6b 3421}
81a2b7ce 3422
34cf6c43
YW
/* Forward declarations: both helpers are defined further down in this file, but exec_spawn() below already calls them. */
3423static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l);
3424static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]);
3425
f2341e0a
LP
3426int exec_spawn(Unit *unit,
3427 ExecCommand *command,
d35fbf6b
DM
3428 const ExecContext *context,
3429 const ExecParameters *params,
3430 ExecRuntime *runtime,
29206d46 3431 DynamicCreds *dcreds,
d35fbf6b 3432 pid_t *ret) {
8351ceae 3433
d35fbf6b 3434 _cleanup_strv_free_ char **files_env = NULL;
9b141911 3435 int *fds = NULL;
4c47affc 3436 unsigned n_storage_fds = 0, n_socket_fds = 0;
ff0af2a1
LP
3437 _cleanup_free_ char *line = NULL;
3438 int socket_fd, r;
52c239d7 3439 int named_iofds[3] = { -1, -1, -1 };
ff0af2a1 3440 char **argv;
d35fbf6b 3441 pid_t pid;
8351ceae 3442
f2341e0a 3443 assert(unit);
d35fbf6b
DM
3444 assert(command);
3445 assert(context);
3446 assert(ret);
3447 assert(params);
4c47affc 3448 assert(params->fds || (params->n_storage_fds + params->n_socket_fds <= 0));
4298d0b5 3449
d35fbf6b
DM
3450 if (context->std_input == EXEC_INPUT_SOCKET ||
3451 context->std_output == EXEC_OUTPUT_SOCKET ||
3452 context->std_error == EXEC_OUTPUT_SOCKET) {
17df7223 3453
4c47affc 3454 if (params->n_socket_fds > 1) {
f2341e0a 3455 log_unit_error(unit, "Got more than one socket.");
d35fbf6b 3456 return -EINVAL;
ff0af2a1 3457 }
eef65bf3 3458
4c47affc 3459 if (params->n_socket_fds == 0) {
488ab41c
AA
3460 log_unit_error(unit, "Got no socket.");
3461 return -EINVAL;
3462 }
3463
d35fbf6b
DM
3464 socket_fd = params->fds[0];
3465 } else {
3466 socket_fd = -1;
3467 fds = params->fds;
4c47affc 3468 n_storage_fds = params->n_storage_fds;
9b141911 3469 n_socket_fds = params->n_socket_fds;
d35fbf6b 3470 }
94f04347 3471
34cf6c43 3472 r = exec_context_named_iofds(context, params, named_iofds);
52c239d7
LB
3473 if (r < 0)
3474 return log_unit_error_errno(unit, r, "Failed to load a named file descriptor: %m");
3475
f2341e0a 3476 r = exec_context_load_environment(unit, context, &files_env);
ff0af2a1 3477 if (r < 0)
f2341e0a 3478 return log_unit_error_errno(unit, r, "Failed to load environment files: %m");
034c6ed7 3479
d35fbf6b 3480 argv = params->argv ?: command->argv;
d35fbf6b
DM
3481 line = exec_command_line(argv);
3482 if (!line)
3483 return log_oom();
fab56fc5 3484
f2341e0a 3485 log_struct(LOG_DEBUG,
f2341e0a
LP
3486 LOG_UNIT_MESSAGE(unit, "About to execute: %s", line),
3487 "EXECUTABLE=%s", command->path,
ba360bb0 3488 LOG_UNIT_ID(unit),
f1c50bec 3489 LOG_UNIT_INVOCATION_ID(unit),
f2341e0a 3490 NULL);
12145637 3491
d35fbf6b
DM
3492 pid = fork();
3493 if (pid < 0)
74129a12 3494 return log_unit_error_errno(unit, errno, "Failed to fork: %m");
d35fbf6b
DM
3495
3496 if (pid == 0) {
12145637 3497 int exit_status = EXIT_SUCCESS;
ff0af2a1 3498
f2341e0a
LP
3499 r = exec_child(unit,
3500 command,
ff0af2a1
LP
3501 context,
3502 params,
3503 runtime,
29206d46 3504 dcreds,
ff0af2a1
LP
3505 argv,
3506 socket_fd,
52c239d7 3507 named_iofds,
4c47affc
FB
3508 fds,
3509 n_storage_fds,
9b141911 3510 n_socket_fds,
ff0af2a1 3511 files_env,
00d9ef85 3512 unit->manager->user_lookup_fds[1],
12145637
LP
3513 &exit_status);
3514
ff0af2a1 3515 if (r < 0) {
12145637
LP
3516 log_struct_errno(LOG_ERR, r,
3517 "MESSAGE_ID=" SD_MESSAGE_SPAWN_FAILED_STR,
3518 LOG_UNIT_ID(unit),
3519 LOG_UNIT_INVOCATION_ID(unit),
3520 LOG_UNIT_MESSAGE(unit, "Failed at step %s spawning %s: %m",
3521 exit_status_to_string(exit_status, EXIT_STATUS_SYSTEMD),
3522 command->path),
3523 "EXECUTABLE=%s", command->path,
3524 NULL);
4c2630eb
MS
3525 }
3526
ff0af2a1 3527 _exit(exit_status);
034c6ed7
LP
3528 }
3529
f2341e0a 3530 log_unit_debug(unit, "Forked %s as "PID_FMT, command->path, pid);
23635a85 3531
80876c20
LP
3532 /* We add the new process to the cgroup both in the child (so
3533 * that we can be sure that no user code is ever executed
3534 * outside of the cgroup) and in the parent (so that we can be
3535 * sure that when we kill the cgroup the process will be
3536 * killed too). */
d35fbf6b 3537 if (params->cgroup_path)
dd305ec9 3538 (void) cg_attach(SYSTEMD_CGROUP_CONTROLLER, params->cgroup_path, pid);
2da3263a 3539
b58b4116 3540 exec_status_start(&command->exec_status, pid);
9fb86720 3541
034c6ed7 3542 *ret = pid;
5cb5a6ff
LP
3543 return 0;
3544}
3545
034c6ed7 3546void exec_context_init(ExecContext *c) {
3536f49e
YW
3547 ExecDirectoryType i;
3548
034c6ed7
LP
3549 assert(c);
3550
4c12626c 3551 c->umask = 0022;
9eba9da4 3552 c->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0);
94f04347 3553 c->cpu_sched_policy = SCHED_OTHER;
071830ff 3554 c->syslog_priority = LOG_DAEMON|LOG_INFO;
74922904 3555 c->syslog_level_prefix = true;
353e12c2 3556 c->ignore_sigpipe = true;
3a43da28 3557 c->timer_slack_nsec = NSEC_INFINITY;
050f7277 3558 c->personality = PERSONALITY_INVALID;
72fd1768 3559 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3536f49e 3560 c->directories[i].mode = 0755;
a103496c 3561 c->capability_bounding_set = CAP_ALL;
add00535 3562 c->restrict_namespaces = NAMESPACE_FLAGS_ALL;
d3070fbd 3563 c->log_level_max = -1;
034c6ed7
LP
3564}
3565
613b411c 3566void exec_context_done(ExecContext *c) {
3536f49e 3567 ExecDirectoryType i;
d3070fbd 3568 size_t l;
5cb5a6ff
LP
3569
3570 assert(c);
3571
6796073e
LP
3572 c->environment = strv_free(c->environment);
3573 c->environment_files = strv_free(c->environment_files);
b4c14404 3574 c->pass_environment = strv_free(c->pass_environment);
00819cc1 3575 c->unset_environment = strv_free(c->unset_environment);
8c7be95e 3576
1f6b4113 3577 for (l = 0; l < ELEMENTSOF(c->rlimit); l++)
a1e58e8e 3578 c->rlimit[l] = mfree(c->rlimit[l]);
034c6ed7 3579
2038c3f5 3580 for (l = 0; l < 3; l++) {
52c239d7 3581 c->stdio_fdname[l] = mfree(c->stdio_fdname[l]);
2038c3f5
LP
3582 c->stdio_file[l] = mfree(c->stdio_file[l]);
3583 }
52c239d7 3584
a1e58e8e
LP
3585 c->working_directory = mfree(c->working_directory);
3586 c->root_directory = mfree(c->root_directory);
915e6d16 3587 c->root_image = mfree(c->root_image);
a1e58e8e
LP
3588 c->tty_path = mfree(c->tty_path);
3589 c->syslog_identifier = mfree(c->syslog_identifier);
3590 c->user = mfree(c->user);
3591 c->group = mfree(c->group);
034c6ed7 3592
6796073e 3593 c->supplementary_groups = strv_free(c->supplementary_groups);
94f04347 3594
a1e58e8e 3595 c->pam_name = mfree(c->pam_name);
5b6319dc 3596
2a624c36
AP
3597 c->read_only_paths = strv_free(c->read_only_paths);
3598 c->read_write_paths = strv_free(c->read_write_paths);
3599 c->inaccessible_paths = strv_free(c->inaccessible_paths);
82c121a4 3600
d2d6c096 3601 bind_mount_free_many(c->bind_mounts, c->n_bind_mounts);
8e06d57c
YW
3602 c->bind_mounts = NULL;
3603 c->n_bind_mounts = 0;
2abd4e38
YW
3604 temporary_filesystem_free_many(c->temporary_filesystems, c->n_temporary_filesystems);
3605 c->temporary_filesystems = NULL;
3606 c->n_temporary_filesystems = 0;
d2d6c096 3607
da681e1b 3608 c->cpuset = cpu_set_mfree(c->cpuset);
86a3475b 3609
a1e58e8e
LP
3610 c->utmp_id = mfree(c->utmp_id);
3611 c->selinux_context = mfree(c->selinux_context);
3612 c->apparmor_profile = mfree(c->apparmor_profile);
5b8e1b77 3613 c->smack_process_label = mfree(c->smack_process_label);
eef65bf3 3614
8cfa775f 3615 c->syscall_filter = hashmap_free(c->syscall_filter);
525d3cc7
LP
3616 c->syscall_archs = set_free(c->syscall_archs);
3617 c->address_families = set_free(c->address_families);
e66cf1a3 3618
72fd1768 3619 for (i = 0; i < _EXEC_DIRECTORY_TYPE_MAX; i++)
3536f49e 3620 c->directories[i].paths = strv_free(c->directories[i].paths);
d3070fbd
LP
3621
3622 c->log_level_max = -1;
3623
3624 exec_context_free_log_extra_fields(c);
08f3be7a
LP
3625
3626 c->stdin_data = mfree(c->stdin_data);
3627 c->stdin_data_size = 0;
e66cf1a3
LP
3628}
3629
34cf6c43 3630int exec_context_destroy_runtime_directory(const ExecContext *c, const char *runtime_prefix) {
e66cf1a3
LP
3631 char **i;
3632
3633 assert(c);
3634
3635 if (!runtime_prefix)
3636 return 0;
3637
3536f49e 3638 STRV_FOREACH(i, c->directories[EXEC_DIRECTORY_RUNTIME].paths) {
e66cf1a3
LP
3639 _cleanup_free_ char *p;
3640
605405c6 3641 p = strjoin(runtime_prefix, "/", *i);
e66cf1a3
LP
3642 if (!p)
3643 return -ENOMEM;
3644
6c47cd7d 3645 /* We execute this synchronously, since we need to be sure this is gone when we start the service
e66cf1a3 3646 * next. */
c6878637 3647 (void) rm_rf(p, REMOVE_ROOT);
e66cf1a3
LP
3648 }
3649
3650 return 0;
5cb5a6ff
LP
3651}
3652
34cf6c43 3653static void exec_command_done(ExecCommand *c) {
43d0fcbd
LP
3654 assert(c);
3655
a1e58e8e 3656 c->path = mfree(c->path);
43d0fcbd 3657
6796073e 3658 c->argv = strv_free(c->argv);
43d0fcbd
LP
3659}
3660
3661void exec_command_done_array(ExecCommand *c, unsigned n) {
3662 unsigned i;
3663
3664 for (i = 0; i < n; i++)
3665 exec_command_done(c+i);
3666}
3667
f1acf85a 3668ExecCommand* exec_command_free_list(ExecCommand *c) {
5cb5a6ff
LP
3669 ExecCommand *i;
3670
3671 while ((i = c)) {
71fda00f 3672 LIST_REMOVE(command, c, i);
43d0fcbd 3673 exec_command_done(i);
5cb5a6ff
LP
3674 free(i);
3675 }
f1acf85a
ZJS
3676
3677 return NULL;
5cb5a6ff
LP
3678}
3679
034c6ed7
LP
3680void exec_command_free_array(ExecCommand **c, unsigned n) {
3681 unsigned i;
3682
f1acf85a
ZJS
3683 for (i = 0; i < n; i++)
3684 c[i] = exec_command_free_list(c[i]);
034c6ed7
LP
3685}
3686
/* Context handed as userdata to invalid_env(): the unit to log on behalf of, and the path of the environment
 * file the rejected assignment was read from. */
039f0e70 3687typedef struct InvalidEnvInfo {
34cf6c43 3688 const Unit *unit;
039f0e70
LP
3689 const char *path;
3690} InvalidEnvInfo;
3691
3692static void invalid_env(const char *p, void *userdata) {
3693 InvalidEnvInfo *info = userdata;
3694
f2341e0a 3695 log_unit_error(info->unit, "Ignoring invalid environment assignment '%s': %s", p, info->path);
039f0e70
LP
3696}
3697
52c239d7
LB
3698const char* exec_context_fdname(const ExecContext *c, int fd_index) {
3699 assert(c);
3700
3701 switch (fd_index) {
5073ff6b 3702
52c239d7
LB
3703 case STDIN_FILENO:
3704 if (c->std_input != EXEC_INPUT_NAMED_FD)
3705 return NULL;
5073ff6b 3706
52c239d7 3707 return c->stdio_fdname[STDIN_FILENO] ?: "stdin";
5073ff6b 3708
52c239d7
LB
3709 case STDOUT_FILENO:
3710 if (c->std_output != EXEC_OUTPUT_NAMED_FD)
3711 return NULL;
5073ff6b 3712
52c239d7 3713 return c->stdio_fdname[STDOUT_FILENO] ?: "stdout";
5073ff6b 3714
52c239d7
LB
3715 case STDERR_FILENO:
3716 if (c->std_error != EXEC_OUTPUT_NAMED_FD)
3717 return NULL;
5073ff6b 3718
52c239d7 3719 return c->stdio_fdname[STDERR_FILENO] ?: "stderr";
5073ff6b 3720
52c239d7
LB
3721 default:
3722 return NULL;
3723 }
3724}
3725
34cf6c43 3726static int exec_context_named_iofds(const ExecContext *c, const ExecParameters *p, int named_iofds[3]) {
52c239d7 3727 unsigned i, targets;
56fbd561 3728 const char* stdio_fdname[3];
4c47affc 3729 unsigned n_fds;
52c239d7
LB
3730
3731 assert(c);
3732 assert(p);
3733
3734 targets = (c->std_input == EXEC_INPUT_NAMED_FD) +
3735 (c->std_output == EXEC_OUTPUT_NAMED_FD) +
3736 (c->std_error == EXEC_OUTPUT_NAMED_FD);
3737
3738 for (i = 0; i < 3; i++)
3739 stdio_fdname[i] = exec_context_fdname(c, i);
3740
4c47affc
FB
3741 n_fds = p->n_storage_fds + p->n_socket_fds;
3742
3743 for (i = 0; i < n_fds && targets > 0; i++)
56fbd561
ZJS
3744 if (named_iofds[STDIN_FILENO] < 0 &&
3745 c->std_input == EXEC_INPUT_NAMED_FD &&
3746 stdio_fdname[STDIN_FILENO] &&
3747 streq(p->fd_names[i], stdio_fdname[STDIN_FILENO])) {
3748
52c239d7
LB
3749 named_iofds[STDIN_FILENO] = p->fds[i];
3750 targets--;
56fbd561
ZJS
3751
3752 } else if (named_iofds[STDOUT_FILENO] < 0 &&
3753 c->std_output == EXEC_OUTPUT_NAMED_FD &&
3754 stdio_fdname[STDOUT_FILENO] &&
3755 streq(p->fd_names[i], stdio_fdname[STDOUT_FILENO])) {
3756
52c239d7
LB
3757 named_iofds[STDOUT_FILENO] = p->fds[i];
3758 targets--;
56fbd561
ZJS
3759
3760 } else if (named_iofds[STDERR_FILENO] < 0 &&
3761 c->std_error == EXEC_OUTPUT_NAMED_FD &&
3762 stdio_fdname[STDERR_FILENO] &&
3763 streq(p->fd_names[i], stdio_fdname[STDERR_FILENO])) {
3764
52c239d7
LB
3765 named_iofds[STDERR_FILENO] = p->fds[i];
3766 targets--;
3767 }
3768
56fbd561 3769 return targets == 0 ? 0 : -ENOENT;
52c239d7
LB
3770}
3771
34cf6c43 3772static int exec_context_load_environment(const Unit *unit, const ExecContext *c, char ***l) {
8c7be95e
LP
3773 char **i, **r = NULL;
3774
3775 assert(c);
3776 assert(l);
3777
3778 STRV_FOREACH(i, c->environment_files) {
3779 char *fn;
52511fae
ZJS
3780 int k;
3781 unsigned n;
8c7be95e
LP
3782 bool ignore = false;
3783 char **p;
7fd1b19b 3784 _cleanup_globfree_ glob_t pglob = {};
8c7be95e
LP
3785
3786 fn = *i;
3787
3788 if (fn[0] == '-') {
3789 ignore = true;
313cefa1 3790 fn++;
8c7be95e
LP
3791 }
3792
3793 if (!path_is_absolute(fn)) {
8c7be95e
LP
3794 if (ignore)
3795 continue;
3796
3797 strv_free(r);
3798 return -EINVAL;
3799 }
3800
2bef10ab 3801 /* Filename supports globbing, take all matching files */
d8c92e8b
ZJS
3802 k = safe_glob(fn, 0, &pglob);
3803 if (k < 0) {
2bef10ab
PL
3804 if (ignore)
3805 continue;
8c7be95e 3806
2bef10ab 3807 strv_free(r);
d8c92e8b 3808 return k;
2bef10ab 3809 }
8c7be95e 3810
d8c92e8b
ZJS
3811 /* When we don't match anything, -ENOENT should be returned */
3812 assert(pglob.gl_pathc > 0);
3813
3814 for (n = 0; n < pglob.gl_pathc; n++) {
717603e3 3815 k = load_env_file(NULL, pglob.gl_pathv[n], NULL, &p);
2bef10ab
PL
3816 if (k < 0) {
3817 if (ignore)
3818 continue;
8c7be95e 3819
2bef10ab 3820 strv_free(r);
2bef10ab 3821 return k;
e9c1ea9d 3822 }
ebc05a09 3823 /* Log invalid environment variables with filename */
039f0e70
LP
3824 if (p) {
3825 InvalidEnvInfo info = {
f2341e0a 3826 .unit = unit,
039f0e70
LP
3827 .path = pglob.gl_pathv[n]
3828 };
3829
3830 p = strv_env_clean_with_callback(p, invalid_env, &info);
3831 }
8c7be95e 3832
234519ae 3833 if (!r)
2bef10ab
PL
3834 r = p;
3835 else {
3836 char **m;
8c7be95e 3837
2bef10ab
PL
3838 m = strv_env_merge(2, r, p);
3839 strv_free(r);
3840 strv_free(p);
c84a9488 3841 if (!m)
2bef10ab 3842 return -ENOMEM;
2bef10ab
PL
3843
3844 r = m;
3845 }
8c7be95e
LP
3846 }
3847 }
3848
3849 *l = r;
3850
3851 return 0;
3852}
3853
6ac8fdc9 3854static bool tty_may_match_dev_console(const char *tty) {
7b912648 3855 _cleanup_free_ char *resolved = NULL;
6ac8fdc9 3856
1e22b5cd
LP
3857 if (!tty)
3858 return true;
3859
a119ec7c 3860 tty = skip_dev_prefix(tty);
6ac8fdc9
MS
3861
3862 /* trivial identity? */
3863 if (streq(tty, "console"))
3864 return true;
3865
7b912648
LP
3866 if (resolve_dev_console(&resolved) < 0)
3867 return true; /* if we could not resolve, assume it may */
6ac8fdc9
MS
3868
3869 /* "tty0" means the active VC, so it may be the same sometimes */
7b912648 3870 return streq(resolved, tty) || (streq(resolved, "tty0") && tty_is_vc(tty));
6ac8fdc9
MS
3871}
3872
34cf6c43 3873bool exec_context_may_touch_console(const ExecContext *ec) {
1e22b5cd
LP
3874
3875 return (ec->tty_reset ||
3876 ec->tty_vhangup ||
3877 ec->tty_vt_disallocate ||
6ac8fdc9
MS
3878 is_terminal_input(ec->std_input) ||
3879 is_terminal_output(ec->std_output) ||
3880 is_terminal_output(ec->std_error)) &&
1e22b5cd 3881 tty_may_match_dev_console(exec_context_tty_path(ec));
6ac8fdc9
MS
3882}
3883
15ae422b
LP
3884static void strv_fprintf(FILE *f, char **l) {
3885 char **g;
3886
3887 assert(f);
3888
3889 STRV_FOREACH(g, l)
3890 fprintf(f, " %s", *g);
3891}
3892
34cf6c43 3893void exec_context_dump(const ExecContext *c, FILE* f, const char *prefix) {
d3070fbd 3894 ExecDirectoryType dt;
c2bbd90b 3895 char **e, **d;
94f04347 3896 unsigned i;
add00535 3897 int r;
9eba9da4 3898
5cb5a6ff
LP
3899 assert(c);
3900 assert(f);
3901
4ad49000 3902 prefix = strempty(prefix);
5cb5a6ff
LP
3903
3904 fprintf(f,
94f04347
LP
3905 "%sUMask: %04o\n"
3906 "%sWorkingDirectory: %s\n"
451a074f 3907 "%sRootDirectory: %s\n"
15ae422b 3908 "%sNonBlocking: %s\n"
64747e2d 3909 "%sPrivateTmp: %s\n"
7f112f50 3910 "%sPrivateDevices: %s\n"
59eeb84b 3911 "%sProtectKernelTunables: %s\n"
e66a2f65 3912 "%sProtectKernelModules: %s\n"
59eeb84b 3913 "%sProtectControlGroups: %s\n"
d251207d
LP
3914 "%sPrivateNetwork: %s\n"
3915 "%sPrivateUsers: %s\n"
1b8689f9
LP
3916 "%sProtectHome: %s\n"
3917 "%sProtectSystem: %s\n"
5d997827 3918 "%sMountAPIVFS: %s\n"
f3e43635 3919 "%sIgnoreSIGPIPE: %s\n"
f4170c67 3920 "%sMemoryDenyWriteExecute: %s\n"
b1edf445
LP
3921 "%sRestrictRealtime: %s\n"
3922 "%sKeyringMode: %s\n",
5cb5a6ff 3923 prefix, c->umask,
9eba9da4 3924 prefix, c->working_directory ? c->working_directory : "/",
451a074f 3925 prefix, c->root_directory ? c->root_directory : "/",
15ae422b 3926 prefix, yes_no(c->non_blocking),
64747e2d 3927 prefix, yes_no(c->private_tmp),
7f112f50 3928 prefix, yes_no(c->private_devices),
59eeb84b 3929 prefix, yes_no(c->protect_kernel_tunables),
e66a2f65 3930 prefix, yes_no(c->protect_kernel_modules),
59eeb84b 3931 prefix, yes_no(c->protect_control_groups),
d251207d
LP
3932 prefix, yes_no(c->private_network),
3933 prefix, yes_no(c->private_users),
1b8689f9
LP
3934 prefix, protect_home_to_string(c->protect_home),
3935 prefix, protect_system_to_string(c->protect_system),
5d997827 3936 prefix, yes_no(c->mount_apivfs),
f3e43635 3937 prefix, yes_no(c->ignore_sigpipe),
f4170c67 3938 prefix, yes_no(c->memory_deny_write_execute),
b1edf445
LP
3939 prefix, yes_no(c->restrict_realtime),
3940 prefix, exec_keyring_mode_to_string(c->keyring_mode));
fb33a393 3941
915e6d16
LP
3942 if (c->root_image)
3943 fprintf(f, "%sRootImage: %s\n", prefix, c->root_image);
3944
8c7be95e
LP
3945 STRV_FOREACH(e, c->environment)
3946 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
3947
3948 STRV_FOREACH(e, c->environment_files)
3949 fprintf(f, "%sEnvironmentFile: %s\n", prefix, *e);
94f04347 3950
b4c14404
FB
3951 STRV_FOREACH(e, c->pass_environment)
3952 fprintf(f, "%sPassEnvironment: %s\n", prefix, *e);
3953
00819cc1
LP
3954 STRV_FOREACH(e, c->unset_environment)
3955 fprintf(f, "%sUnsetEnvironment: %s\n", prefix, *e);
3956
53f47dfc
YW
3957 fprintf(f, "%sRuntimeDirectoryPreserve: %s\n", prefix, exec_preserve_mode_to_string(c->runtime_directory_preserve_mode));
3958
72fd1768 3959 for (dt = 0; dt < _EXEC_DIRECTORY_TYPE_MAX; dt++) {
3536f49e
YW
3960 fprintf(f, "%s%sMode: %04o\n", prefix, exec_directory_type_to_string(dt), c->directories[dt].mode);
3961
3962 STRV_FOREACH(d, c->directories[dt].paths)
3963 fprintf(f, "%s%s: %s\n", prefix, exec_directory_type_to_string(dt), *d);
3964 }
c2bbd90b 3965
fb33a393
LP
3966 if (c->nice_set)
3967 fprintf(f,
3968 "%sNice: %i\n",
3969 prefix, c->nice);
3970
dd6c17b1 3971 if (c->oom_score_adjust_set)
fb33a393 3972 fprintf(f,
dd6c17b1
LP
3973 "%sOOMScoreAdjust: %i\n",
3974 prefix, c->oom_score_adjust);
9eba9da4 3975
94f04347 3976 for (i = 0; i < RLIM_NLIMITS; i++)
3c11da9d
EV
3977 if (c->rlimit[i]) {
3978 fprintf(f, "%s%s: " RLIM_FMT "\n",
3979 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_max);
3980 fprintf(f, "%s%sSoft: " RLIM_FMT "\n",
3981 prefix, rlimit_to_string(i), c->rlimit[i]->rlim_cur);
3982 }
94f04347 3983
f8b69d1d 3984 if (c->ioprio_set) {
1756a011 3985 _cleanup_free_ char *class_str = NULL;
f8b69d1d 3986
837df140
YW
3987 r = ioprio_class_to_string_alloc(IOPRIO_PRIO_CLASS(c->ioprio), &class_str);
3988 if (r >= 0)
3989 fprintf(f, "%sIOSchedulingClass: %s\n", prefix, class_str);
3990
3991 fprintf(f, "%sIOPriority: %lu\n", prefix, IOPRIO_PRIO_DATA(c->ioprio));
f8b69d1d 3992 }
94f04347 3993
f8b69d1d 3994 if (c->cpu_sched_set) {
1756a011 3995 _cleanup_free_ char *policy_str = NULL;
f8b69d1d 3996
837df140
YW
3997 r = sched_policy_to_string_alloc(c->cpu_sched_policy, &policy_str);
3998 if (r >= 0)
3999 fprintf(f, "%sCPUSchedulingPolicy: %s\n", prefix, policy_str);
4000
94f04347 4001 fprintf(f,
38b48754
LP
4002 "%sCPUSchedulingPriority: %i\n"
4003 "%sCPUSchedulingResetOnFork: %s\n",
38b48754
LP
4004 prefix, c->cpu_sched_priority,
4005 prefix, yes_no(c->cpu_sched_reset_on_fork));
b929bf04 4006 }
94f04347 4007
82c121a4 4008 if (c->cpuset) {
94f04347 4009 fprintf(f, "%sCPUAffinity:", prefix);
82c121a4
LP
4010 for (i = 0; i < c->cpuset_ncpus; i++)
4011 if (CPU_ISSET_S(i, CPU_ALLOC_SIZE(c->cpuset_ncpus), c->cpuset))
43a99a7a 4012 fprintf(f, " %u", i);
94f04347
LP
4013 fputs("\n", f);
4014 }
4015
3a43da28 4016 if (c->timer_slack_nsec != NSEC_INFINITY)
ccd06097 4017 fprintf(f, "%sTimerSlackNSec: "NSEC_FMT "\n", prefix, c->timer_slack_nsec);
94f04347
LP
4018
4019 fprintf(f,
80876c20
LP
4020 "%sStandardInput: %s\n"
4021 "%sStandardOutput: %s\n"
4022 "%sStandardError: %s\n",
4023 prefix, exec_input_to_string(c->std_input),
4024 prefix, exec_output_to_string(c->std_output),
4025 prefix, exec_output_to_string(c->std_error));
4026
befc4a80
LP
4027 if (c->std_input == EXEC_INPUT_NAMED_FD)
4028 fprintf(f, "%sStandardInputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDIN_FILENO]);
4029 if (c->std_output == EXEC_OUTPUT_NAMED_FD)
4030 fprintf(f, "%sStandardOutputFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDOUT_FILENO]);
4031 if (c->std_error == EXEC_OUTPUT_NAMED_FD)
4032 fprintf(f, "%sStandardErrorFileDescriptorName: %s\n", prefix, c->stdio_fdname[STDERR_FILENO]);
4033
4034 if (c->std_input == EXEC_INPUT_FILE)
4035 fprintf(f, "%sStandardInputFile: %s\n", prefix, c->stdio_file[STDIN_FILENO]);
4036 if (c->std_output == EXEC_OUTPUT_FILE)
4037 fprintf(f, "%sStandardOutputFile: %s\n", prefix, c->stdio_file[STDOUT_FILENO]);
4038 if (c->std_error == EXEC_OUTPUT_FILE)
4039 fprintf(f, "%sStandardErrorFile: %s\n", prefix, c->stdio_file[STDERR_FILENO]);
4040
80876c20
LP
4041 if (c->tty_path)
4042 fprintf(f,
6ea832a2
LP
4043 "%sTTYPath: %s\n"
4044 "%sTTYReset: %s\n"
4045 "%sTTYVHangup: %s\n"
4046 "%sTTYVTDisallocate: %s\n",
4047 prefix, c->tty_path,
4048 prefix, yes_no(c->tty_reset),
4049 prefix, yes_no(c->tty_vhangup),
4050 prefix, yes_no(c->tty_vt_disallocate));
94f04347 4051
9f6444eb
LP
4052 if (IN_SET(c->std_output,
4053 EXEC_OUTPUT_SYSLOG,
4054 EXEC_OUTPUT_KMSG,
4055 EXEC_OUTPUT_JOURNAL,
4056 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4057 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4058 EXEC_OUTPUT_JOURNAL_AND_CONSOLE) ||
4059 IN_SET(c->std_error,
4060 EXEC_OUTPUT_SYSLOG,
4061 EXEC_OUTPUT_KMSG,
4062 EXEC_OUTPUT_JOURNAL,
4063 EXEC_OUTPUT_SYSLOG_AND_CONSOLE,
4064 EXEC_OUTPUT_KMSG_AND_CONSOLE,
4065 EXEC_OUTPUT_JOURNAL_AND_CONSOLE)) {
f8b69d1d 4066
5ce70e5b 4067 _cleanup_free_ char *fac_str = NULL, *lvl_str = NULL;
f8b69d1d 4068
837df140
YW
4069 r = log_facility_unshifted_to_string_alloc(c->syslog_priority >> 3, &fac_str);
4070 if (r >= 0)
4071 fprintf(f, "%sSyslogFacility: %s\n", prefix, fac_str);
f8b69d1d 4072
837df140
YW
4073 r = log_level_to_string_alloc(LOG_PRI(c->syslog_priority), &lvl_str);
4074 if (r >= 0)
4075 fprintf(f, "%sSyslogLevel: %s\n", prefix, lvl_str);
f8b69d1d 4076 }
94f04347 4077
d3070fbd
LP
4078 if (c->log_level_max >= 0) {
4079 _cleanup_free_ char *t = NULL;
4080
4081 (void) log_level_to_string_alloc(c->log_level_max, &t);
4082
4083 fprintf(f, "%sLogLevelMax: %s\n", prefix, strna(t));
4084 }
4085
4086 if (c->n_log_extra_fields > 0) {
4087 size_t j;
4088
4089 for (j = 0; j < c->n_log_extra_fields; j++) {
4090 fprintf(f, "%sLogExtraFields: ", prefix);
4091 fwrite(c->log_extra_fields[j].iov_base,
4092 1, c->log_extra_fields[j].iov_len,
4093 f);
4094 fputc('\n', f);
4095 }
4096 }
4097
07d46372
YW
4098 if (c->secure_bits) {
4099 _cleanup_free_ char *str = NULL;
4100
4101 r = secure_bits_to_string_alloc(c->secure_bits, &str);
4102 if (r >= 0)
4103 fprintf(f, "%sSecure Bits: %s\n", prefix, str);
4104 }
94f04347 4105
a103496c 4106 if (c->capability_bounding_set != CAP_ALL) {
dd1f5bd0 4107 _cleanup_free_ char *str = NULL;
94f04347 4108
dd1f5bd0
YW
4109 r = capability_set_to_string_alloc(c->capability_bounding_set, &str);
4110 if (r >= 0)
4111 fprintf(f, "%sCapabilityBoundingSet: %s\n", prefix, str);
755d4b67
IP
4112 }
4113
4114 if (c->capability_ambient_set != 0) {
dd1f5bd0 4115 _cleanup_free_ char *str = NULL;
755d4b67 4116
dd1f5bd0
YW
4117 r = capability_set_to_string_alloc(c->capability_ambient_set, &str);
4118 if (r >= 0)
4119 fprintf(f, "%sAmbientCapabilities: %s\n", prefix, str);
94f04347
LP
4120 }
4121
4122 if (c->user)
f2d3769a 4123 fprintf(f, "%sUser: %s\n", prefix, c->user);
94f04347 4124 if (c->group)
f2d3769a 4125 fprintf(f, "%sGroup: %s\n", prefix, c->group);
94f04347 4126
29206d46
LP
4127 fprintf(f, "%sDynamicUser: %s\n", prefix, yes_no(c->dynamic_user));
4128
ac6e8be6 4129 if (!strv_isempty(c->supplementary_groups)) {
94f04347 4130 fprintf(f, "%sSupplementaryGroups:", prefix);
15ae422b
LP
4131 strv_fprintf(f, c->supplementary_groups);
4132 fputs("\n", f);
4133 }
94f04347 4134
5b6319dc 4135 if (c->pam_name)
f2d3769a 4136 fprintf(f, "%sPAMName: %s\n", prefix, c->pam_name);
5b6319dc 4137
58629001 4138 if (!strv_isempty(c->read_write_paths)) {
2a624c36
AP
4139 fprintf(f, "%sReadWritePaths:", prefix);
4140 strv_fprintf(f, c->read_write_paths);
15ae422b
LP
4141 fputs("\n", f);
4142 }
4143
58629001 4144 if (!strv_isempty(c->read_only_paths)) {
2a624c36
AP
4145 fprintf(f, "%sReadOnlyPaths:", prefix);
4146 strv_fprintf(f, c->read_only_paths);
15ae422b
LP
4147 fputs("\n", f);
4148 }
94f04347 4149
58629001 4150 if (!strv_isempty(c->inaccessible_paths)) {
2a624c36
AP
4151 fprintf(f, "%sInaccessiblePaths:", prefix);
4152 strv_fprintf(f, c->inaccessible_paths);
94f04347
LP
4153 fputs("\n", f);
4154 }
2e22afe9 4155
d2d6c096 4156 if (c->n_bind_mounts > 0)
4ca763a9
YW
4157 for (i = 0; i < c->n_bind_mounts; i++)
4158 fprintf(f, "%s%s: %s%s:%s:%s\n", prefix,
d2d6c096 4159 c->bind_mounts[i].read_only ? "BindReadOnlyPaths" : "BindPaths",
4ca763a9 4160 c->bind_mounts[i].ignore_enoent ? "-": "",
d2d6c096
LP
4161 c->bind_mounts[i].source,
4162 c->bind_mounts[i].destination,
4163 c->bind_mounts[i].recursive ? "rbind" : "norbind");
d2d6c096 4164
2abd4e38
YW
4165 if (c->n_temporary_filesystems > 0)
4166 for (i = 0; i < c->n_temporary_filesystems; i++) {
4167 TemporaryFileSystem *t = c->temporary_filesystems + i;
4168
4169 fprintf(f, "%sTemporaryFileSystem: %s%s%s\n", prefix,
4170 t->path,
4171 isempty(t->options) ? "" : ":",
4172 strempty(t->options));
4173 }
4174
169c1bda
LP
4175 if (c->utmp_id)
4176 fprintf(f,
4177 "%sUtmpIdentifier: %s\n",
4178 prefix, c->utmp_id);
7b52a628
MS
4179
4180 if (c->selinux_context)
4181 fprintf(f,
5f8640fb
LP
4182 "%sSELinuxContext: %s%s\n",
4183 prefix, c->selinux_context_ignore ? "-" : "", c->selinux_context);
17df7223 4184
80c21aea
WC
4185 if (c->apparmor_profile)
4186 fprintf(f,
4187 "%sAppArmorProfile: %s%s\n",
4188 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
4189
4190 if (c->smack_process_label)
4191 fprintf(f,
4192 "%sSmackProcessLabel: %s%s\n",
4193 prefix, c->smack_process_label_ignore ? "-" : "", c->smack_process_label);
4194
050f7277 4195 if (c->personality != PERSONALITY_INVALID)
ac45f971
LP
4196 fprintf(f,
4197 "%sPersonality: %s\n",
4198 prefix, strna(personality_to_string(c->personality)));
4199
78e864e5
TM
4200 fprintf(f,
4201 "%sLockPersonality: %s\n",
4202 prefix, yes_no(c->lock_personality));
4203
17df7223 4204 if (c->syscall_filter) {
349cc4a5 4205#if HAVE_SECCOMP
17df7223 4206 Iterator j;
8cfa775f 4207 void *id, *val;
17df7223 4208 bool first = true;
351a19b1 4209#endif
17df7223
LP
4210
4211 fprintf(f,
57183d11 4212 "%sSystemCallFilter: ",
17df7223
LP
4213 prefix);
4214
4215 if (!c->syscall_whitelist)
4216 fputc('~', f);
4217
349cc4a5 4218#if HAVE_SECCOMP
8cfa775f 4219 HASHMAP_FOREACH_KEY(val, id, c->syscall_filter, j) {
17df7223 4220 _cleanup_free_ char *name = NULL;
8cfa775f
YW
4221 const char *errno_name = NULL;
4222 int num = PTR_TO_INT(val);
17df7223
LP
4223
4224 if (first)
4225 first = false;
4226 else
4227 fputc(' ', f);
4228
57183d11 4229 name = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
17df7223 4230 fputs(strna(name), f);
8cfa775f
YW
4231
4232 if (num >= 0) {
4233 errno_name = errno_to_name(num);
4234 if (errno_name)
4235 fprintf(f, ":%s", errno_name);
4236 else
4237 fprintf(f, ":%d", num);
4238 }
17df7223 4239 }
351a19b1 4240#endif
17df7223
LP
4241
4242 fputc('\n', f);
4243 }
4244
57183d11 4245 if (c->syscall_archs) {
349cc4a5 4246#if HAVE_SECCOMP
57183d11
LP
4247 Iterator j;
4248 void *id;
4249#endif
4250
4251 fprintf(f,
4252 "%sSystemCallArchitectures:",
4253 prefix);
4254
349cc4a5 4255#if HAVE_SECCOMP
57183d11
LP
4256 SET_FOREACH(id, c->syscall_archs, j)
4257 fprintf(f, " %s", strna(seccomp_arch_to_string(PTR_TO_UINT32(id) - 1)));
4258#endif
4259 fputc('\n', f);
4260 }
4261
add00535
LP
4262 if (exec_context_restrict_namespaces_set(c)) {
4263 _cleanup_free_ char *s = NULL;
4264
4265 r = namespace_flag_to_string_many(c->restrict_namespaces, &s);
4266 if (r >= 0)
4267 fprintf(f, "%sRestrictNamespaces: %s\n",
4268 prefix, s);
4269 }
4270
3df90f24
YW
4271 if (c->syscall_errno > 0) {
4272 const char *errno_name;
4273
4274 fprintf(f, "%sSystemCallErrorNumber: ", prefix);
4275
4276 errno_name = errno_to_name(c->syscall_errno);
4277 if (errno_name)
4278 fprintf(f, "%s\n", errno_name);
4279 else
4280 fprintf(f, "%d\n", c->syscall_errno);
4281 }
eef65bf3
MS
4282
4283 if (c->apparmor_profile)
4284 fprintf(f,
4285 "%sAppArmorProfile: %s%s\n",
4286 prefix, c->apparmor_profile_ignore ? "-" : "", c->apparmor_profile);
5cb5a6ff
LP
4287}
4288
34cf6c43 4289bool exec_context_maintains_privileges(const ExecContext *c) {
a931ad47
LP
4290 assert(c);
4291
61233823 4292 /* Returns true if the process forked off would run under
a931ad47
LP
4293 * an unchanged UID or as root. */
4294
4295 if (!c->user)
4296 return true;
4297
4298 if (streq(c->user, "root") || streq(c->user, "0"))
4299 return true;
4300
4301 return false;
4302}
4303
34cf6c43 4304int exec_context_get_effective_ioprio(const ExecContext *c) {
7f452159
LP
4305 int p;
4306
4307 assert(c);
4308
4309 if (c->ioprio_set)
4310 return c->ioprio;
4311
4312 p = ioprio_get(IOPRIO_WHO_PROCESS, 0);
4313 if (p < 0)
4314 return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
4315
4316 return p;
4317}
4318
d3070fbd
LP
4319void exec_context_free_log_extra_fields(ExecContext *c) {
4320 size_t l;
4321
4322 assert(c);
4323
4324 for (l = 0; l < c->n_log_extra_fields; l++)
4325 free(c->log_extra_fields[l].iov_base);
4326 c->log_extra_fields = mfree(c->log_extra_fields);
4327 c->n_log_extra_fields = 0;
4328}
4329
b58b4116 4330void exec_status_start(ExecStatus *s, pid_t pid) {
034c6ed7 4331 assert(s);
5cb5a6ff 4332
b58b4116
LP
4333 zero(*s);
4334 s->pid = pid;
4335 dual_timestamp_get(&s->start_timestamp);
4336}
4337
34cf6c43 4338void exec_status_exit(ExecStatus *s, const ExecContext *context, pid_t pid, int code, int status) {
b58b4116
LP
4339 assert(s);
4340
0b1f4ae6 4341 if (s->pid && s->pid != pid)
b58b4116
LP
4342 zero(*s);
4343
034c6ed7 4344 s->pid = pid;
63983207 4345 dual_timestamp_get(&s->exit_timestamp);
9fb86720 4346
034c6ed7
LP
4347 s->code = code;
4348 s->status = status;
169c1bda 4349
6ea832a2
LP
4350 if (context) {
4351 if (context->utmp_id)
4352 utmp_put_dead_process(context->utmp_id, pid, code, status);
4353
1e22b5cd 4354 exec_context_tty_reset(context, NULL);
6ea832a2 4355 }
9fb86720
LP
4356}
4357
34cf6c43 4358void exec_status_dump(const ExecStatus *s, FILE *f, const char *prefix) {
9fb86720
LP
4359 char buf[FORMAT_TIMESTAMP_MAX];
4360
4361 assert(s);
4362 assert(f);
4363
9fb86720
LP
4364 if (s->pid <= 0)
4365 return;
4366
4c940960
LP
4367 prefix = strempty(prefix);
4368
9fb86720 4369 fprintf(f,
ccd06097
ZJS
4370 "%sPID: "PID_FMT"\n",
4371 prefix, s->pid);
9fb86720 4372
af9d16e1 4373 if (dual_timestamp_is_set(&s->start_timestamp))
9fb86720
LP
4374 fprintf(f,
4375 "%sStart Timestamp: %s\n",
63983207 4376 prefix, format_timestamp(buf, sizeof(buf), s->start_timestamp.realtime));
9fb86720 4377
af9d16e1 4378 if (dual_timestamp_is_set(&s->exit_timestamp))
9fb86720
LP
4379 fprintf(f,
4380 "%sExit Timestamp: %s\n"
4381 "%sExit Code: %s\n"
4382 "%sExit Status: %i\n",
63983207 4383 prefix, format_timestamp(buf, sizeof(buf), s->exit_timestamp.realtime),
9fb86720
LP
4384 prefix, sigchld_code_to_string(s->code),
4385 prefix, s->status);
5cb5a6ff 4386}
44d8db9e 4387
34cf6c43 4388static char *exec_command_line(char **argv) {
44d8db9e
LP
4389 size_t k;
4390 char *n, *p, **a;
4391 bool first = true;
4392
9e2f7c11 4393 assert(argv);
44d8db9e 4394
9164977d 4395 k = 1;
9e2f7c11 4396 STRV_FOREACH(a, argv)
44d8db9e
LP
4397 k += strlen(*a)+3;
4398
5cd9cd35
LP
4399 n = new(char, k);
4400 if (!n)
44d8db9e
LP
4401 return NULL;
4402
4403 p = n;
9e2f7c11 4404 STRV_FOREACH(a, argv) {
44d8db9e
LP
4405
4406 if (!first)
4407 *(p++) = ' ';
4408 else
4409 first = false;
4410
4411 if (strpbrk(*a, WHITESPACE)) {
4412 *(p++) = '\'';
4413 p = stpcpy(p, *a);
4414 *(p++) = '\'';
4415 } else
4416 p = stpcpy(p, *a);
4417
4418 }
4419
9164977d
LP
4420 *p = 0;
4421
44d8db9e
LP
4422 /* FIXME: this doesn't really handle arguments that have
4423 * spaces and ticks in them */
4424
4425 return n;
4426}
4427
34cf6c43 4428static void exec_command_dump(ExecCommand *c, FILE *f, const char *prefix) {
e1d75803 4429 _cleanup_free_ char *cmd = NULL;
4c940960 4430 const char *prefix2;
44d8db9e
LP
4431
4432 assert(c);
4433 assert(f);
4434
4c940960 4435 prefix = strempty(prefix);
63c372cb 4436 prefix2 = strjoina(prefix, "\t");
44d8db9e 4437
9e2f7c11 4438 cmd = exec_command_line(c->argv);
44d8db9e
LP
4439 fprintf(f,
4440 "%sCommand Line: %s\n",
4441 prefix, cmd ? cmd : strerror(ENOMEM));
4442
9fb86720 4443 exec_status_dump(&c->exec_status, f, prefix2);
44d8db9e
LP
4444}
4445
4446void exec_command_dump_list(ExecCommand *c, FILE *f, const char *prefix) {
4447 assert(f);
4448
4c940960 4449 prefix = strempty(prefix);
44d8db9e
LP
4450
4451 LIST_FOREACH(command, c, c)
4452 exec_command_dump(c, f, prefix);
4453}
94f04347 4454
a6a80b4f
LP
4455void exec_command_append_list(ExecCommand **l, ExecCommand *e) {
4456 ExecCommand *end;
4457
4458 assert(l);
4459 assert(e);
4460
4461 if (*l) {
35b8ca3a 4462 /* It's kind of important, that we keep the order here */
71fda00f
LP
4463 LIST_FIND_TAIL(command, *l, end);
4464 LIST_INSERT_AFTER(command, *l, end, e);
a6a80b4f
LP
4465 } else
4466 *l = e;
4467}
4468
26fd040d
LP
4469int exec_command_set(ExecCommand *c, const char *path, ...) {
4470 va_list ap;
4471 char **l, *p;
4472
4473 assert(c);
4474 assert(path);
4475
4476 va_start(ap, path);
4477 l = strv_new_ap(path, ap);
4478 va_end(ap);
4479
4480 if (!l)
4481 return -ENOMEM;
4482
250a918d
LP
4483 p = strdup(path);
4484 if (!p) {
26fd040d
LP
4485 strv_free(l);
4486 return -ENOMEM;
4487 }
4488
4489 free(c->path);
4490 c->path = p;
4491
4492 strv_free(c->argv);
4493 c->argv = l;
4494
4495 return 0;
4496}
4497
86b23b07 4498int exec_command_append(ExecCommand *c, const char *path, ...) {
e63ff941 4499 _cleanup_strv_free_ char **l = NULL;
86b23b07 4500 va_list ap;
86b23b07
JS
4501 int r;
4502
4503 assert(c);
4504 assert(path);
4505
4506 va_start(ap, path);
4507 l = strv_new_ap(path, ap);
4508 va_end(ap);
4509
4510 if (!l)
4511 return -ENOMEM;
4512
e287086b 4513 r = strv_extend_strv(&c->argv, l, false);
e63ff941 4514 if (r < 0)
86b23b07 4515 return r;
86b23b07
JS
4516
4517 return 0;
4518}
4519
e8a565cb
YW
4520static void *remove_tmpdir_thread(void *p) {
4521 _cleanup_free_ char *path = p;
86b23b07 4522
e8a565cb
YW
4523 (void) rm_rf(path, REMOVE_ROOT|REMOVE_PHYSICAL);
4524 return NULL;
4525}
4526
4527static ExecRuntime* exec_runtime_free(ExecRuntime *rt, bool destroy) {
4528 int r;
4529
4530 if (!rt)
4531 return NULL;
4532
4533 if (rt->manager)
4534 (void) hashmap_remove(rt->manager->exec_runtime_by_id, rt->id);
4535
4536 /* When destroy is true, then rm_rf tmp_dir and var_tmp_dir. */
4537 if (destroy && rt->tmp_dir) {
4538 log_debug("Spawning thread to nuke %s", rt->tmp_dir);
4539
4540 r = asynchronous_job(remove_tmpdir_thread, rt->tmp_dir);
4541 if (r < 0) {
4542 log_warning_errno(r, "Failed to nuke %s: %m", rt->tmp_dir);
4543 free(rt->tmp_dir);
4544 }
4545
4546 rt->tmp_dir = NULL;
4547 }
613b411c 4548
e8a565cb
YW
4549 if (destroy && rt->var_tmp_dir) {
4550 log_debug("Spawning thread to nuke %s", rt->var_tmp_dir);
4551
4552 r = asynchronous_job(remove_tmpdir_thread, rt->var_tmp_dir);
4553 if (r < 0) {
4554 log_warning_errno(r, "Failed to nuke %s: %m", rt->var_tmp_dir);
4555 free(rt->var_tmp_dir);
4556 }
4557
4558 rt->var_tmp_dir = NULL;
4559 }
4560
4561 rt->id = mfree(rt->id);
4562 rt->tmp_dir = mfree(rt->tmp_dir);
4563 rt->var_tmp_dir = mfree(rt->var_tmp_dir);
4564 safe_close_pair(rt->netns_storage_socket);
4565 return mfree(rt);
4566}
4567
4568static void exec_runtime_freep(ExecRuntime **rt) {
613b411c 4569 if (*rt)
e8a565cb
YW
4570 (void) exec_runtime_free(*rt, false);
4571}
4572
4573static int exec_runtime_allocate(ExecRuntime **rt) {
4574 assert(rt);
613b411c
LP
4575
4576 *rt = new0(ExecRuntime, 1);
f146f5e1 4577 if (!*rt)
613b411c
LP
4578 return -ENOMEM;
4579
613b411c 4580 (*rt)->netns_storage_socket[0] = (*rt)->netns_storage_socket[1] = -1;
613b411c
LP
4581 return 0;
4582}
4583
e8a565cb
YW
4584static int exec_runtime_add(
4585 Manager *m,
4586 const char *id,
4587 const char *tmp_dir,
4588 const char *var_tmp_dir,
4589 const int netns_storage_socket[2],
4590 ExecRuntime **ret) {
4591
4592 _cleanup_(exec_runtime_freep) ExecRuntime *rt = NULL;
613b411c
LP
4593 int r;
4594
e8a565cb 4595 assert(m);
613b411c
LP
4596 assert(id);
4597
e8a565cb
YW
4598 r = hashmap_ensure_allocated(&m->exec_runtime_by_id, &string_hash_ops);
4599 if (r < 0)
4600 return r;
613b411c 4601
e8a565cb 4602 r = exec_runtime_allocate(&rt);
613b411c
LP
4603 if (r < 0)
4604 return r;
4605
e8a565cb
YW
4606 rt->id = strdup(id);
4607 if (!rt->id)
4608 return -ENOMEM;
4609
4610 if (tmp_dir) {
4611 rt->tmp_dir = strdup(tmp_dir);
4612 if (!rt->tmp_dir)
4613 return -ENOMEM;
4614
4615 /* When tmp_dir is set, then we require var_tmp_dir is also set. */
4616 assert(var_tmp_dir);
4617 rt->var_tmp_dir = strdup(var_tmp_dir);
4618 if (!rt->var_tmp_dir)
4619 return -ENOMEM;
4620 }
4621
4622 if (netns_storage_socket) {
4623 rt->netns_storage_socket[0] = netns_storage_socket[0];
4624 rt->netns_storage_socket[1] = netns_storage_socket[1];
613b411c
LP
4625 }
4626
e8a565cb
YW
4627 r = hashmap_put(m->exec_runtime_by_id, rt->id, rt);
4628 if (r < 0)
4629 return r;
4630
4631 rt->manager = m;
4632
4633 if (ret)
4634 *ret = rt;
4635
4636 /* do not remove created ExecRuntime object when the operation succeeds. */
4637 rt = NULL;
4638 return 0;
4639}
4640
4641static int exec_runtime_make(Manager *m, const ExecContext *c, const char *id, ExecRuntime **ret) {
4642 _cleanup_free_ char *tmp_dir = NULL, *var_tmp_dir = NULL;
4643 _cleanup_close_pair_ int netns_storage_socket[2] = {-1, -1};
4644 int r;
4645
4646 assert(m);
4647 assert(c);
4648 assert(id);
4649
4650 /* It is not necessary to create ExecRuntime object. */
4651 if (!c->private_network && !c->private_tmp)
4652 return 0;
4653
4654 if (c->private_tmp) {
4655 r = setup_tmp_dirs(id, &tmp_dir, &var_tmp_dir);
613b411c
LP
4656 if (r < 0)
4657 return r;
4658 }
4659
e8a565cb
YW
4660 if (c->private_network) {
4661 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, netns_storage_socket) < 0)
4662 return -errno;
4663 }
4664
4665 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, netns_storage_socket, ret);
4666 if (r < 0)
4667 return r;
4668
4669 /* Avoid cleanup */
4670 netns_storage_socket[0] = -1;
4671 netns_storage_socket[1] = -1;
613b411c
LP
4672 return 1;
4673}
4674
e8a565cb
YW
4675int exec_runtime_acquire(Manager *m, const ExecContext *c, const char *id, bool create, ExecRuntime **ret) {
4676 ExecRuntime *rt;
4677 int r;
613b411c 4678
e8a565cb
YW
4679 assert(m);
4680 assert(id);
4681 assert(ret);
4682
4683 rt = hashmap_get(m->exec_runtime_by_id, id);
4684 if (rt)
4685 /* We already have a ExecRuntime object, let's increase the ref count and reuse it */
4686 goto ref;
4687
4688 if (!create)
4689 return 0;
4690
4691 /* If not found, then create a new object. */
4692 r = exec_runtime_make(m, c, id, &rt);
4693 if (r <= 0)
4694 /* When r == 0, it is not necessary to create ExecRuntime object. */
4695 return r;
613b411c 4696
e8a565cb
YW
4697ref:
4698 /* increment reference counter. */
4699 rt->n_ref++;
4700 *ret = rt;
4701 return 1;
4702}
613b411c 4703
e8a565cb
YW
4704ExecRuntime *exec_runtime_unref(ExecRuntime *rt, bool destroy) {
4705 if (!rt)
613b411c
LP
4706 return NULL;
4707
e8a565cb 4708 assert(rt->n_ref > 0);
613b411c 4709
e8a565cb
YW
4710 rt->n_ref--;
4711 if (rt->n_ref > 0)
f2341e0a
LP
4712 return NULL;
4713
e8a565cb 4714 return exec_runtime_free(rt, destroy);
613b411c
LP
4715}
4716
e8a565cb
YW
4717int exec_runtime_serialize(const Manager *m, FILE *f, FDSet *fds) {
4718 ExecRuntime *rt;
4719 Iterator i;
4720
4721 assert(m);
613b411c
LP
4722 assert(f);
4723 assert(fds);
4724
e8a565cb
YW
4725 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4726 fprintf(f, "exec-runtime=%s", rt->id);
613b411c 4727
e8a565cb
YW
4728 if (rt->tmp_dir)
4729 fprintf(f, " tmp-dir=%s", rt->tmp_dir);
613b411c 4730
e8a565cb
YW
4731 if (rt->var_tmp_dir)
4732 fprintf(f, " var-tmp-dir=%s", rt->var_tmp_dir);
613b411c 4733
e8a565cb
YW
4734 if (rt->netns_storage_socket[0] >= 0) {
4735 int copy;
613b411c 4736
e8a565cb
YW
4737 copy = fdset_put_dup(fds, rt->netns_storage_socket[0]);
4738 if (copy < 0)
4739 return copy;
613b411c 4740
e8a565cb
YW
4741 fprintf(f, " netns-socket-0=%i", copy);
4742 }
613b411c 4743
e8a565cb
YW
4744 if (rt->netns_storage_socket[1] >= 0) {
4745 int copy;
613b411c 4746
e8a565cb
YW
4747 copy = fdset_put_dup(fds, rt->netns_storage_socket[1]);
4748 if (copy < 0)
4749 return copy;
613b411c 4750
e8a565cb
YW
4751 fprintf(f, " netns-socket-1=%i", copy);
4752 }
4753
4754 fputc('\n', f);
613b411c
LP
4755 }
4756
4757 return 0;
4758}
4759
e8a565cb
YW
4760int exec_runtime_deserialize_compat(Unit *u, const char *key, const char *value, FDSet *fds) {
4761 _cleanup_(exec_runtime_freep) ExecRuntime *rt_create = NULL;
4762 ExecRuntime *rt;
613b411c
LP
4763 int r;
4764
e8a565cb
YW
4765 /* This is for the migration from old (v237 or earlier) deserialization text.
4766 * Due to the bug #7790, this may not work with the units that use JoinsNamespaceOf=.
4767 * Even if the ExecRuntime object originally created by the other unit, we cannot judge
4768 * so or not from the serialized text, then we always creates a new object owned by this. */
4769
4770 assert(u);
613b411c
LP
4771 assert(key);
4772 assert(value);
4773
e8a565cb
YW
4774 /* Manager manages ExecRuntime objects by the unit id.
4775 * So, we omit the serialized text when the unit does not have id (yet?)... */
4776 if (isempty(u->id)) {
4777 log_unit_debug(u, "Invocation ID not found. Dropping runtime parameter.");
4778 return 0;
4779 }
613b411c 4780
e8a565cb
YW
4781 r = hashmap_ensure_allocated(&u->manager->exec_runtime_by_id, &string_hash_ops);
4782 if (r < 0) {
4783 log_unit_debug_errno(u, r, "Failed to allocate storage for runtime parameter: %m");
4784 return 0;
4785 }
4786
4787 rt = hashmap_get(u->manager->exec_runtime_by_id, u->id);
4788 if (!rt) {
4789 r = exec_runtime_allocate(&rt_create);
613b411c 4790 if (r < 0)
f2341e0a 4791 return log_oom();
613b411c 4792
e8a565cb
YW
4793 rt_create->id = strdup(u->id);
4794 if (!rt_create->id)
4795 return log_oom();
4796
4797 rt = rt_create;
4798 }
4799
4800 if (streq(key, "tmp-dir")) {
4801 char *copy;
4802
613b411c
LP
4803 copy = strdup(value);
4804 if (!copy)
4805 return log_oom();
4806
e8a565cb 4807 free_and_replace(rt->tmp_dir, copy);
613b411c
LP
4808
4809 } else if (streq(key, "var-tmp-dir")) {
4810 char *copy;
4811
613b411c
LP
4812 copy = strdup(value);
4813 if (!copy)
4814 return log_oom();
4815
e8a565cb 4816 free_and_replace(rt->var_tmp_dir, copy);
613b411c
LP
4817
4818 } else if (streq(key, "netns-socket-0")) {
4819 int fd;
4820
e8a565cb 4821 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 4822 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 4823 return 0;
613b411c 4824 }
e8a565cb
YW
4825
4826 safe_close(rt->netns_storage_socket[0]);
4827 rt->netns_storage_socket[0] = fdset_remove(fds, fd);
4828
613b411c
LP
4829 } else if (streq(key, "netns-socket-1")) {
4830 int fd;
4831
e8a565cb 4832 if (safe_atoi(value, &fd) < 0 || !fdset_contains(fds, fd)) {
f2341e0a 4833 log_unit_debug(u, "Failed to parse netns socket value: %s", value);
e8a565cb 4834 return 0;
613b411c 4835 }
e8a565cb
YW
4836
4837 safe_close(rt->netns_storage_socket[1]);
4838 rt->netns_storage_socket[1] = fdset_remove(fds, fd);
613b411c
LP
4839 } else
4840 return 0;
4841
613b411c 4842
e8a565cb
YW
4843 /* If the object is newly created, then put it to the hashmap which manages ExecRuntime objects. */
4844 if (rt_create) {
4845 r = hashmap_put(u->manager->exec_runtime_by_id, rt_create->id, rt_create);
4846 if (r < 0) {
4847 log_unit_debug_errno(u, r, "Failed to put runtime paramter to manager's storage: %m");
4848 return 0;
4849 }
613b411c 4850
e8a565cb 4851 rt_create->manager = u->manager;
613b411c 4852
e8a565cb
YW
4853 /* Avoid cleanup */
4854 rt_create = NULL;
4855 }
98b47d54 4856
e8a565cb
YW
4857 return 1;
4858}
613b411c 4859
e8a565cb
YW
4860void exec_runtime_deserialize_one(Manager *m, const char *value, FDSet *fds) {
4861 char *id = NULL, *tmp_dir = NULL, *var_tmp_dir = NULL;
4862 int r, fd0 = -1, fd1 = -1;
4863 const char *p, *v = value;
4864 size_t n;
613b411c 4865
e8a565cb
YW
4866 assert(m);
4867 assert(value);
4868 assert(fds);
98b47d54 4869
e8a565cb
YW
4870 n = strcspn(v, " ");
4871 id = strndupa(v, n);
4872 if (v[n] != ' ')
4873 goto finalize;
4874 p = v + n + 1;
4875
4876 v = startswith(p, "tmp-dir=");
4877 if (v) {
4878 n = strcspn(v, " ");
4879 tmp_dir = strndupa(v, n);
4880 if (v[n] != ' ')
4881 goto finalize;
4882 p = v + n + 1;
4883 }
4884
4885 v = startswith(p, "var-tmp-dir=");
4886 if (v) {
4887 n = strcspn(v, " ");
4888 var_tmp_dir = strndupa(v, n);
4889 if (v[n] != ' ')
4890 goto finalize;
4891 p = v + n + 1;
4892 }
4893
4894 v = startswith(p, "netns-socket-0=");
4895 if (v) {
4896 char *buf;
4897
4898 n = strcspn(v, " ");
4899 buf = strndupa(v, n);
4900 if (safe_atoi(buf, &fd0) < 0 || !fdset_contains(fds, fd0)) {
4901 log_debug("Unable to process exec-runtime netns fd specification.");
4902 return;
98b47d54 4903 }
e8a565cb
YW
4904 fd0 = fdset_remove(fds, fd0);
4905 if (v[n] != ' ')
4906 goto finalize;
4907 p = v + n + 1;
613b411c
LP
4908 }
4909
e8a565cb
YW
4910 v = startswith(p, "netns-socket-1=");
4911 if (v) {
4912 char *buf;
98b47d54 4913
e8a565cb
YW
4914 n = strcspn(v, " ");
4915 buf = strndupa(v, n);
4916 if (safe_atoi(buf, &fd1) < 0 || !fdset_contains(fds, fd1)) {
4917 log_debug("Unable to process exec-runtime netns fd specification.");
4918 return;
98b47d54 4919 }
e8a565cb
YW
4920 fd1 = fdset_remove(fds, fd1);
4921 }
98b47d54 4922
e8a565cb
YW
4923finalize:
4924
4925 r = exec_runtime_add(m, id, tmp_dir, var_tmp_dir, (int[]) { fd0, fd1 }, NULL);
4926 if (r < 0) {
4927 log_debug_errno(r, "Failed to add exec-runtime: %m");
4928 return;
613b411c 4929 }
e8a565cb 4930}
613b411c 4931
e8a565cb
YW
4932void exec_runtime_vacuum(Manager *m) {
4933 ExecRuntime *rt;
4934 Iterator i;
4935
4936 assert(m);
4937
4938 /* Free unreferenced ExecRuntime objects. This is used after manager deserialization process. */
4939
4940 HASHMAP_FOREACH(rt, m->exec_runtime_by_id, i) {
4941 if (rt->n_ref > 0)
4942 continue;
4943
4944 (void) exec_runtime_free(rt, false);
4945 }
613b411c
LP
4946}
4947
80876c20
LP
4948static const char* const exec_input_table[_EXEC_INPUT_MAX] = {
4949 [EXEC_INPUT_NULL] = "null",
4950 [EXEC_INPUT_TTY] = "tty",
4951 [EXEC_INPUT_TTY_FORCE] = "tty-force",
4f2d528d 4952 [EXEC_INPUT_TTY_FAIL] = "tty-fail",
52c239d7
LB
4953 [EXEC_INPUT_SOCKET] = "socket",
4954 [EXEC_INPUT_NAMED_FD] = "fd",
08f3be7a 4955 [EXEC_INPUT_DATA] = "data",
2038c3f5 4956 [EXEC_INPUT_FILE] = "file",
80876c20
LP
4957};
4958
8a0867d6
LP
4959DEFINE_STRING_TABLE_LOOKUP(exec_input, ExecInput);
4960
94f04347 4961static const char* const exec_output_table[_EXEC_OUTPUT_MAX] = {
80876c20 4962 [EXEC_OUTPUT_INHERIT] = "inherit",
94f04347 4963 [EXEC_OUTPUT_NULL] = "null",
80876c20 4964 [EXEC_OUTPUT_TTY] = "tty",
94f04347 4965 [EXEC_OUTPUT_SYSLOG] = "syslog",
28dbc1e8 4966 [EXEC_OUTPUT_SYSLOG_AND_CONSOLE] = "syslog+console",
9a6bca7a 4967 [EXEC_OUTPUT_KMSG] = "kmsg",
28dbc1e8 4968 [EXEC_OUTPUT_KMSG_AND_CONSOLE] = "kmsg+console",
706343f4
LP
4969 [EXEC_OUTPUT_JOURNAL] = "journal",
4970 [EXEC_OUTPUT_JOURNAL_AND_CONSOLE] = "journal+console",
52c239d7
LB
4971 [EXEC_OUTPUT_SOCKET] = "socket",
4972 [EXEC_OUTPUT_NAMED_FD] = "fd",
2038c3f5 4973 [EXEC_OUTPUT_FILE] = "file",
94f04347
LP
4974};
4975
4976DEFINE_STRING_TABLE_LOOKUP(exec_output, ExecOutput);
023a4f67
LP
4977
4978static const char* const exec_utmp_mode_table[_EXEC_UTMP_MODE_MAX] = {
4979 [EXEC_UTMP_INIT] = "init",
4980 [EXEC_UTMP_LOGIN] = "login",
4981 [EXEC_UTMP_USER] = "user",
4982};
4983
4984DEFINE_STRING_TABLE_LOOKUP(exec_utmp_mode, ExecUtmpMode);
53f47dfc
YW
4985
4986static const char* const exec_preserve_mode_table[_EXEC_PRESERVE_MODE_MAX] = {
4987 [EXEC_PRESERVE_NO] = "no",
4988 [EXEC_PRESERVE_YES] = "yes",
4989 [EXEC_PRESERVE_RESTART] = "restart",
4990};
4991
4992DEFINE_STRING_TABLE_LOOKUP_WITH_BOOLEAN(exec_preserve_mode, ExecPreserveMode, EXEC_PRESERVE_YES);
3536f49e 4993
72fd1768 4994static const char* const exec_directory_type_table[_EXEC_DIRECTORY_TYPE_MAX] = {
3536f49e
YW
4995 [EXEC_DIRECTORY_RUNTIME] = "RuntimeDirectory",
4996 [EXEC_DIRECTORY_STATE] = "StateDirectory",
4997 [EXEC_DIRECTORY_CACHE] = "CacheDirectory",
4998 [EXEC_DIRECTORY_LOGS] = "LogsDirectory",
4999 [EXEC_DIRECTORY_CONFIGURATION] = "ConfigurationDirectory",
5000};
5001
5002DEFINE_STRING_TABLE_LOOKUP(exec_directory_type, ExecDirectoryType);
b1edf445
LP
5003
5004static const char* const exec_keyring_mode_table[_EXEC_KEYRING_MODE_MAX] = {
5005 [EXEC_KEYRING_INHERIT] = "inherit",
5006 [EXEC_KEYRING_PRIVATE] = "private",
5007 [EXEC_KEYRING_SHARED] = "shared",
5008};
5009
5010DEFINE_STRING_TABLE_LOOKUP(exec_keyring_mode, ExecKeyringMode);