]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/nspawn/nspawn-stub-pid1.c
Merge pull request #7388 from keszybz/doc-tweak
[thirdparty/systemd.git] / src / nspawn / nspawn-stub-pid1.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2016 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <sys/reboot.h>
21 #include <sys/wait.h>
22 #include <sys/prctl.h>
23 #include <unistd.h>
24
25 #include "fd-util.h"
26 #include "log.h"
27 #include "nspawn-stub-pid1.h"
28 #include "process-util.h"
29 #include "signal-util.h"
30 #include "time-util.h"
31 #include "def.h"
32
/* Tells the kernel, via prctl(PR_SET_MM, …), that this process' environment block now lives in the buffer
 * passed in: PR_SET_MM_ENV_START is set to the address of the first byte of the buffer and PR_SET_MM_ENV_END to
 * the address one past its last byte (i.e. start + length).
 *
 * The buffer is only referenced by address, it is not copied here.
 *
 * Returns 0 if both prctl() calls succeed, or the negated errno of the first call that fails. If the first call
 * fails the second one is not attempted. */
static int reset_environ(const char *new_environment, size_t length) {
        const unsigned long env_first = (unsigned long) new_environment;
        const unsigned long env_limit = env_first + length;

        /* Move the start marker first, then the end marker, bailing out on the first failure. */
        if (prctl(PR_SET_MM, PR_SET_MM_ENV_START, env_first, 0, 0) < 0)
                return -errno;

        if (prctl(PR_SET_MM, PR_SET_MM_ENV_END, env_limit, 0, 0) < 0)
                return -errno;

        return 0;
}
47
48 int stub_pid1(sd_id128_t uuid) {
49 enum {
50 STATE_RUNNING,
51 STATE_REBOOT,
52 STATE_POWEROFF,
53 } state = STATE_RUNNING;
54
55 sigset_t fullmask, oldmask, waitmask;
56 usec_t quit_usec = USEC_INFINITY;
57 pid_t pid;
58 int r;
59
60 /* The new environment we set up, on the stack. */
61 char new_environment[] =
62 "container=systemd-nspawn\0"
63 "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX";
64
65 /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful
66 * for allowing arbitrary processes run in a container, and still have all zombies reaped. */
67
68 assert_se(sigfillset(&fullmask) >= 0);
69 assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0);
70
71 pid = fork();
72 if (pid < 0)
73 return log_error_errno(errno, "Failed to fork child pid: %m");
74
75 if (pid == 0) {
76 /* Return in the child */
77 assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0);
78 setsid();
79 return 0;
80 }
81
82 reset_all_signal_handlers();
83
84 log_close();
85 close_all_fds(NULL, 0);
86 log_open();
87
88 /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also,
89 * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ
90 * find them set. */
91 sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX);
92 reset_environ(new_environment, sizeof(new_environment));
93
94 rename_process("STUBINIT");
95
96 assert_se(sigemptyset(&waitmask) >= 0);
97 assert_se(sigset_add_many(&waitmask,
98 SIGCHLD, /* posix: process died */
99 SIGINT, /* sysv: ctrl-alt-del */
100 SIGRTMIN+3, /* systemd: halt */
101 SIGRTMIN+4, /* systemd: poweroff */
102 SIGRTMIN+5, /* systemd: reboot */
103 SIGRTMIN+6, /* systemd: kexec */
104 SIGRTMIN+13, /* systemd: halt */
105 SIGRTMIN+14, /* systemd: poweroff */
106 SIGRTMIN+15, /* systemd: reboot */
107 SIGRTMIN+16, /* systemd: kexec */
108 -1) >= 0);
109
110 /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't
111 * support reexec/reloading in this stub process. */
112
113 for (;;) {
114 siginfo_t si;
115 usec_t current_usec;
116
117 si.si_pid = 0;
118 r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG);
119 if (r < 0) {
120 r = log_error_errno(errno, "Failed to reap children: %m");
121 goto finish;
122 }
123
124 current_usec = now(CLOCK_MONOTONIC);
125
126 if (si.si_pid == pid || current_usec >= quit_usec) {
127
128 /* The child we started ourselves died or we reached a timeout. */
129
130 if (state == STATE_REBOOT) { /* dispatch a queued reboot */
131 (void) reboot(RB_AUTOBOOT);
132 r = log_error_errno(errno, "Failed to reboot: %m");
133 goto finish;
134
135 } else if (state == STATE_POWEROFF)
136 (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */
137
138 if (si.si_pid == pid && si.si_code == CLD_EXITED)
139 r = si.si_status; /* pass on exit code */
140 else
141 r = 255; /* signal, coredump, timeout, … */
142
143 goto finish;
144 }
145 if (si.si_pid != 0)
146 /* We reaped something. Retry until there's nothing more to reap. */
147 continue;
148
149 if (quit_usec == USEC_INFINITY)
150 r = sigwaitinfo(&waitmask, &si);
151 else {
152 struct timespec ts;
153 r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec));
154 }
155 if (r < 0) {
156 if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */
157 continue;
158 if (errno == EAGAIN) /* timeout reached */
159 continue;
160
161 r = log_error_errno(errno, "Failed to wait for signal: %m");
162 goto finish;
163 }
164
165 if (si.si_signo == SIGCHLD)
166 continue; /* Let's reap this */
167
168 if (state != STATE_RUNNING)
169 continue;
170
171 /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a
172 * constant… */
173
174 if (si.si_signo == SIGRTMIN+3 ||
175 si.si_signo == SIGRTMIN+4 ||
176 si.si_signo == SIGRTMIN+13 ||
177 si.si_signo == SIGRTMIN+14)
178
179 state = STATE_POWEROFF;
180
181 else if (si.si_signo == SIGINT ||
182 si.si_signo == SIGRTMIN+5 ||
183 si.si_signo == SIGRTMIN+6 ||
184 si.si_signo == SIGRTMIN+15 ||
185 si.si_signo == SIGRTMIN+16)
186
187 state = STATE_REBOOT;
188 else
189 assert_not_reached("Got unexpected signal");
190
191 r = kill_and_sigcont(pid, SIGTERM);
192
193 /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We
194 * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those
195 * processes which handle both. That's because services tend to bind configuration reload or something
196 * else to SIGHUP. */
197
198 if (r != -ESRCH)
199 (void) kill(pid, SIGHUP);
200
201 quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC;
202 }
203
204 finish:
205 _exit(r < 0 ? EXIT_FAILURE : r);
206 }