]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-stub-pid1.c
tree-wide: drop license boilerplate
[thirdparty/systemd.git] / src / nspawn / nspawn-stub-pid1.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2016 Lennart Poettering
6 ***/
7
8 #include <sys/reboot.h>
9 #include <sys/wait.h>
10 #include <sys/prctl.h>
11 #include <unistd.h>
12
13 #include "fd-util.h"
14 #include "log.h"
15 #include "missing.h"
16 #include "nspawn-stub-pid1.h"
17 #include "process-util.h"
18 #include "signal-util.h"
19 #include "time-util.h"
20 #include "def.h"
21
/* Redirects the kernel's record of where this process' environment block lives to the caller-supplied buffer
 * [new_environment, new_environment + length), using prctl(PR_SET_MM).
 *
 * The buffer is not copied here; only its address range is handed to the kernel, so it has to remain valid for
 * as long as the process keeps running.
 *
 * Returns 0 on success, or a negative errno-style value if either prctl() call fails. */
static int reset_environ(const char *new_environment, size_t length) {
        const unsigned long env_start = (unsigned long) new_environment;
        const unsigned long env_end = env_start + length;

        /* Update the start address first and the end address second, strictly in this order. */
        if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_start, 0, 0) < 0)
                return -errno;

        return prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_end, 0, 0) < 0 ? -errno : 0;
}
36
37 int stub_pid1(sd_id128_t uuid) {
38 enum {
39 STATE_RUNNING,
40 STATE_REBOOT,
41 STATE_POWEROFF,
42 } state = STATE_RUNNING;
43
44 sigset_t fullmask, oldmask, waitmask;
45 usec_t quit_usec = USEC_INFINITY;
46 pid_t pid;
47 int r;
48
49 /* The new environment we set up, on the stack. */
50 char new_environment[] =
51 "container=systemd-nspawn\0"
52 "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
53
54 /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
55 * for allowing arbitrary processes run in a container, and still have all zombies reaped. */
56
57 assert_se(sigfillset(&fullmask) >= 0);
58 assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);
59
60 pid = fork();
61 if (pid < 0)
62 return log_error_errno(errno, "Failed to fork child pid: %m");
63
64 if (pid == 0) {
65 /* Return in the child */
66 assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);
67 setsid();
68 return 0;
69 }
70
71 reset_all_signal_handlers();
72
73 log_close();
74 close_all_fds(NULL, 0);
75 log_open();
76
77 /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also,
78 * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ
79 * find them set. */
80 sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX);
81 reset_environ(new_environment, sizeof(new_environment));
82
83 (void) rename_process("(sd-stubinit)");
84
85 assert_se(sigemptyset(&waitmask) >= 0);
86 assert_se(sigset_add_many(&waitmask,
87 SIGCHLD, /* posix: process died */
88 SIGINT, /* sysv: ctrl-alt-del */
89 SIGRTMIN+3, /* systemd: halt */
90 SIGRTMIN+4, /* systemd: poweroff */
91 SIGRTMIN+5, /* systemd: reboot */
92 SIGRTMIN+6, /* systemd: kexec */
93 SIGRTMIN+13, /* systemd: halt */
94 SIGRTMIN+14, /* systemd: poweroff */
95 SIGRTMIN+15, /* systemd: reboot */
96 SIGRTMIN+16, /* systemd: kexec */
97 -1) >= 0);
98
99 /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
100 * support reexec/reloading in this stub process. */
101
102 for (;;) {
103 siginfo_t si;
104 usec_t current_usec;
105
106 si.si_pid = 0;
107 r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
108 if (r < 0) {
109 r = log_error_errno(errno, "Failed to reap children: %m");
110 goto finish;
111 }
112
113 current_usec = now(CLOCK_MONOTONIC);
114
115 if (si.si_pid == pid || current_usec >= quit_usec) {
116
117 /* The child we started ourselves died or we reached a timeout. */
118
119 if (state == STATE_REBOOT) { /* dispatch a queued reboot */
120 (void) reboot(RB_AUTOBOOT);
121 r = log_error_errno(errno, "Failed to reboot: %m");
122 goto finish;
123
124 } else if (state == STATE_POWEROFF)
125 (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */
126
127 if (si.si_pid == pid && si.si_code == CLD_EXITED)
128 r = si.si_status; /* pass on exit code */
129 else
130 r = 255; /* signal, coredump, timeout, … */
131
132 goto finish;
133 }
134 if (si.si_pid != 0)
135 /* We reaped something. Retry until there's nothing more to reap. */
136 continue;
137
138 if (quit_usec == USEC_INFINITY)
139 r = sigwaitinfo(&waitmask, &si);
140 else {
141 struct timespec ts;
142 r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec));
143 }
144 if (r < 0) {
145 if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
146 continue;
147 if (errno == EAGAIN) /* timeout reached */
148 continue;
149
150 r = log_error_errno(errno, "Failed to wait for signal: %m");
151 goto finish;
152 }
153
154 if (si.si_signo == SIGCHLD)
155 continue; /* Let's reap this */
156
157 if (state != STATE_RUNNING)
158 continue;
159
160 /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
161 * constant… */
162
163 if (si.si_signo == SIGRTMIN+3 ||
164 si.si_signo == SIGRTMIN+4 ||
165 si.si_signo == SIGRTMIN+13 ||
166 si.si_signo == SIGRTMIN+14)
167
168 state = STATE_POWEROFF;
169
170 else if (si.si_signo == SIGINT ||
171 si.si_signo == SIGRTMIN+5 ||
172 si.si_signo == SIGRTMIN+6 ||
173 si.si_signo == SIGRTMIN+15 ||
174 si.si_signo == SIGRTMIN+16)
175
176 state = STATE_REBOOT;
177 else
178 assert_not_reached("Got unexpected signal");
179
180 r = kill_and_sigcont(pid, SIGTERM);
181
182 /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We
183 * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those
184 * processes which handle both. That's because services tend to bind configuration reload or something
185 * else to SIGHUP. */
186
187 if (r != -ESRCH)
188 (void) kill(pid, SIGHUP);
189
190 quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
191 }
192
193 finish:
194 _exit(r < 0 ? EXIT_FAILURE : r);
195 }