]>
Commit | Line | Data |
---|---|---|
53e1b683 | 1 | /* SPDX-License-Identifier: LGPL-2.1+ */ |
7732f92b | 2 | |
f5947a5e | 3 | #include <sys/ioctl.h> |
7732f92b | 4 | #include <sys/reboot.h> |
7732f92b | 5 | #include <sys/wait.h> |
75bf701f | 6 | #include <sys/prctl.h> |
fe993888 | 7 | #include <unistd.h> |
7732f92b | 8 | |
f2fb2ec9 LP |
9 | #include "def.h" |
10 | #include "exit-status.h" | |
7732f92b LP |
11 | #include "fd-util.h" |
12 | #include "log.h" | |
13 | #include "nspawn-stub-pid1.h" | |
14 | #include "process-util.h" | |
15 | #include "signal-util.h" | |
16 | #include "time-util.h" | |
7732f92b | 17 | |
75bf701f LP |
/* Re-points the kernel's notion of this process' environment area at the given buffer, by setting first
 * the start and then the end address via prctl(PR_SET_MM). Returns 0 if both calls succeed, or the
 * negated errno of the first call that fails (in which case the remaining call is not attempted). */
static int reset_environ(const char *new_environment, size_t length) {
        const unsigned long base = (unsigned long) new_environment;
        const struct {
                int option;
                unsigned long address;
        } updates[] = {
                { PR_SET_MM_ENV_START, base          },
                { PR_SET_MM_ENV_END,   base + length },
        };

        /* Order matters: the start address is installed before the end address. */
        for (size_t i = 0; i < sizeof(updates) / sizeof(updates[0]); i++)
                if (prctl(PR_SET_MM, updates[i].option, updates[i].address, 0, 0) < 0)
                        return -errno;

        return 0;
}
32 | ||
33 | int stub_pid1(sd_id128_t uuid) { | |
7732f92b LP |
34 | enum { |
35 | STATE_RUNNING, | |
36 | STATE_REBOOT, | |
37 | STATE_POWEROFF, | |
38 | } state = STATE_RUNNING; | |
39 | ||
40 | sigset_t fullmask, oldmask, waitmask; | |
41 | usec_t quit_usec = USEC_INFINITY; | |
42 | pid_t pid; | |
43 | int r; | |
44 | ||
75bf701f LP |
45 | /* The new environment we set up, on the stack. */ |
46 | char new_environment[] = | |
47 | "container=systemd-nspawn\0" | |
48 | "container_uuid=XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"; | |
49 | ||
7732f92b LP |
50 | /* Implements a stub PID 1, that reaps all processes and processes a couple of standard signals. This is useful |
51 | * for allowing arbitrary processes run in a container, and still have all zombies reaped. */ | |
52 | ||
53 | assert_se(sigfillset(&fullmask) >= 0); | |
54 | assert_se(sigprocmask(SIG_BLOCK, &fullmask, &oldmask) >= 0); | |
55 | ||
de1b29f3 ND |
56 | /* Surrender the terminal this stub may control so that child processes can have a controlling terminal |
57 | * without resorting to setsid hacks. */ | |
58 | r = ioctl(STDIN_FILENO, TIOCNOTTY); | |
59 | if (r < 0 && errno != ENOTTY) | |
60 | return log_error_errno(errno, "Failed to surrender controlling terminal: %m"); | |
61 | ||
7732f92b LP |
62 | pid = fork(); |
63 | if (pid < 0) | |
64 | return log_error_errno(errno, "Failed to fork child pid: %m"); | |
65 | ||
66 | if (pid == 0) { | |
67 | /* Return in the child */ | |
68 | assert_se(sigprocmask(SIG_SETMASK, &oldmask, NULL) >= 0); | |
69 | setsid(); | |
70 | return 0; | |
71 | } | |
72 | ||
73 | reset_all_signal_handlers(); | |
74 | ||
75 | log_close(); | |
7acf581a | 76 | (void) close_all_fds(NULL, 0); |
7732f92b LP |
77 | log_open(); |
78 | ||
75bf701f LP |
79 | /* Flush out /proc/self/environ, so that we don't leak the environment from the host into the container. Also, |
80 | * set $container= and $container_uuid= so that clients in the container that query it from /proc/1/environ | |
1a012455 | 81 | * find them set. */ |
75bf701f LP |
82 | sd_id128_to_string(uuid, new_environment + sizeof(new_environment) - SD_ID128_STRING_MAX); |
83 | reset_environ(new_environment, sizeof(new_environment)); | |
84 | ||
4c253ed1 | 85 | (void) rename_process("(sd-stubinit)"); |
7732f92b LP |
86 | |
87 | assert_se(sigemptyset(&waitmask) >= 0); | |
88 | assert_se(sigset_add_many(&waitmask, | |
89 | SIGCHLD, /* posix: process died */ | |
90 | SIGINT, /* sysv: ctrl-alt-del */ | |
91 | SIGRTMIN+3, /* systemd: halt */ | |
92 | SIGRTMIN+4, /* systemd: poweroff */ | |
93 | SIGRTMIN+5, /* systemd: reboot */ | |
94 | SIGRTMIN+6, /* systemd: kexec */ | |
95 | SIGRTMIN+13, /* systemd: halt */ | |
96 | SIGRTMIN+14, /* systemd: poweroff */ | |
97 | SIGRTMIN+15, /* systemd: reboot */ | |
98 | SIGRTMIN+16, /* systemd: kexec */ | |
99 | -1) >= 0); | |
100 | ||
101 | /* Note that we ignore SIGTERM (sysv's reexec), SIGHUP (reload), and all other signals here, since we don't | |
102 | * support reexec/reloading in this stub process. */ | |
103 | ||
104 | for (;;) { | |
105 | siginfo_t si; | |
106 | usec_t current_usec; | |
107 | ||
108 | si.si_pid = 0; | |
109 | r = waitid(P_ALL, 0, &si, WEXITED|WNOHANG); | |
110 | if (r < 0) { | |
111 | r = log_error_errno(errno, "Failed to reap children: %m"); | |
112 | goto finish; | |
113 | } | |
114 | ||
115 | current_usec = now(CLOCK_MONOTONIC); | |
116 | ||
117 | if (si.si_pid == pid || current_usec >= quit_usec) { | |
118 | ||
119 | /* The child we started ourselves died or we reached a timeout. */ | |
120 | ||
121 | if (state == STATE_REBOOT) { /* dispatch a queued reboot */ | |
122 | (void) reboot(RB_AUTOBOOT); | |
123 | r = log_error_errno(errno, "Failed to reboot: %m"); | |
124 | goto finish; | |
125 | ||
126 | } else if (state == STATE_POWEROFF) | |
127 | (void) reboot(RB_POWER_OFF); /* if this fails, fall back to normal exit. */ | |
128 | ||
129 | if (si.si_pid == pid && si.si_code == CLD_EXITED) | |
130 | r = si.si_status; /* pass on exit code */ | |
131 | else | |
f2fb2ec9 | 132 | r = EXIT_EXCEPTION; /* signal, coredump, timeout, … */ |
7732f92b LP |
133 | |
134 | goto finish; | |
135 | } | |
136 | if (si.si_pid != 0) | |
137 | /* We reaped something. Retry until there's nothing more to reap. */ | |
138 | continue; | |
139 | ||
140 | if (quit_usec == USEC_INFINITY) | |
141 | r = sigwaitinfo(&waitmask, &si); | |
142 | else { | |
143 | struct timespec ts; | |
144 | r = sigtimedwait(&waitmask, &si, timespec_store(&ts, quit_usec - current_usec)); | |
145 | } | |
146 | if (r < 0) { | |
147 | if (errno == EINTR) /* strace -p attach can result in EINTR, let's handle this nicely. */ | |
148 | continue; | |
149 | if (errno == EAGAIN) /* timeout reached */ | |
150 | continue; | |
151 | ||
152 | r = log_error_errno(errno, "Failed to wait for signal: %m"); | |
153 | goto finish; | |
154 | } | |
155 | ||
156 | if (si.si_signo == SIGCHLD) | |
157 | continue; /* Let's reap this */ | |
158 | ||
159 | if (state != STATE_RUNNING) | |
160 | continue; | |
161 | ||
162 | /* Would love to use a switch() statement here, but SIGRTMIN is actually a function call, not a | |
163 | * constant… */ | |
164 | ||
165 | if (si.si_signo == SIGRTMIN+3 || | |
166 | si.si_signo == SIGRTMIN+4 || | |
167 | si.si_signo == SIGRTMIN+13 || | |
168 | si.si_signo == SIGRTMIN+14) | |
169 | ||
170 | state = STATE_POWEROFF; | |
171 | ||
172 | else if (si.si_signo == SIGINT || | |
173 | si.si_signo == SIGRTMIN+5 || | |
174 | si.si_signo == SIGRTMIN+6 || | |
175 | si.si_signo == SIGRTMIN+15 || | |
176 | si.si_signo == SIGRTMIN+16) | |
177 | ||
178 | state = STATE_REBOOT; | |
179 | else | |
180 | assert_not_reached("Got unexpected signal"); | |
181 | ||
a4624785 LP |
182 | r = kill_and_sigcont(pid, SIGTERM); |
183 | ||
184 | /* Let's send a SIGHUP after the SIGTERM, as shells tend to ignore SIGTERM but do react to SIGHUP. We | |
185 | * do it strictly in this order, so that the SIGTERM is dispatched first, and SIGHUP second for those | |
186 | * processes which handle both. That's because services tend to bind configuration reload or something | |
187 | * else to SIGHUP. */ | |
188 | ||
189 | if (r != -ESRCH) | |
190 | (void) kill(pid, SIGHUP); | |
191 | ||
7732f92b LP |
192 | quit_usec = now(CLOCK_MONOTONIC) + DEFAULT_TIMEOUT_USEC; |
193 | } | |
194 | ||
195 | finish: | |
196 | _exit(r < 0 ? EXIT_FAILURE : r); | |
197 | } |