]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/shutdown.c
tree-wide: invoke rlimit_nofile_safe() before various exec{v,ve,l}() invocations
[thirdparty/systemd.git] / src / core / shutdown.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 Copyright © 2010 ProFUSION embedded systems
4 ***/
5
6 #include <errno.h>
7 #include <getopt.h>
8 #include <linux/reboot.h>
9 #include <signal.h>
10 #include <stdbool.h>
11 #include <stdlib.h>
12 #include <sys/mman.h>
13 #include <sys/mount.h>
14 #include <sys/reboot.h>
15 #include <sys/stat.h>
16 #include <unistd.h>
17
18 #include "alloc-util.h"
19 #include "async.h"
20 #include "cgroup-util.h"
21 #include "def.h"
22 #include "exec-util.h"
23 #include "fd-util.h"
24 #include "fileio.h"
25 #include "killall.h"
26 #include "log.h"
27 #include "missing.h"
28 #include "parse-util.h"
29 #include "process-util.h"
30 #include "reboot-util.h"
31 #include "rlimit-util.h"
32 #include "signal-util.h"
33 #include "string-util.h"
34 #include "switch-root.h"
35 #include "terminal-util.h"
36 #include "umount.h"
37 #include "util.h"
38 #include "virt.h"
39 #include "watchdog.h"
40
41 #define SYNC_PROGRESS_ATTEMPTS 3
42 #define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)
43
44 static char* arg_verb;
45 static uint8_t arg_exit_code;
46 static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC;
47
48 static int parse_argv(int argc, char *argv[]) {
49 enum {
50 ARG_LOG_LEVEL = 0x100,
51 ARG_LOG_TARGET,
52 ARG_LOG_COLOR,
53 ARG_LOG_LOCATION,
54 ARG_EXIT_CODE,
55 ARG_TIMEOUT,
56 };
57
58 static const struct option options[] = {
59 { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
60 { "log-target", required_argument, NULL, ARG_LOG_TARGET },
61 { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
62 { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
63 { "exit-code", required_argument, NULL, ARG_EXIT_CODE },
64 { "timeout", required_argument, NULL, ARG_TIMEOUT },
65 {}
66 };
67
68 int c, r;
69
70 assert(argc >= 1);
71 assert(argv);
72
73 /* "-" prevents getopt from permuting argv[] and moving the verb away
74 * from argv[1]. Our interface to initrd promises it'll be there. */
75 while ((c = getopt_long(argc, argv, "-", options, NULL)) >= 0)
76 switch (c) {
77
78 case ARG_LOG_LEVEL:
79 r = log_set_max_level_from_string(optarg);
80 if (r < 0)
81 log_error_errno(r, "Failed to parse log level %s, ignoring: %m", optarg);
82
83 break;
84
85 case ARG_LOG_TARGET:
86 r = log_set_target_from_string(optarg);
87 if (r < 0)
88 log_error_errno(r, "Failed to parse log target %s, ignoring: %m", optarg);
89
90 break;
91
92 case ARG_LOG_COLOR:
93
94 if (optarg) {
95 r = log_show_color_from_string(optarg);
96 if (r < 0)
97 log_error_errno(r, "Failed to parse log color setting %s, ignoring: %m", optarg);
98 } else
99 log_show_color(true);
100
101 break;
102
103 case ARG_LOG_LOCATION:
104 if (optarg) {
105 r = log_show_location_from_string(optarg);
106 if (r < 0)
107 log_error_errno(r, "Failed to parse log location setting %s, ignoring: %m", optarg);
108 } else
109 log_show_location(true);
110
111 break;
112
113 case ARG_EXIT_CODE:
114 r = safe_atou8(optarg, &arg_exit_code);
115 if (r < 0)
116 log_error_errno(r, "Failed to parse exit code %s, ignoring: %m", optarg);
117
118 break;
119
120 case ARG_TIMEOUT:
121 r = parse_sec(optarg, &arg_timeout);
122 if (r < 0)
123 log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring: %m", optarg);
124
125 break;
126
127 case '\001':
128 if (!arg_verb)
129 arg_verb = optarg;
130 else
131 log_error("Excess arguments, ignoring");
132 break;
133
134 case '?':
135 return -EINVAL;
136
137 default:
138 assert_not_reached("Unhandled option code.");
139 }
140
141 if (!arg_verb)
142 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
143 "Verb argument missing.");
144
145 return 0;
146 }
147
/* Makes /run/initramfs a private mount point of its own and switches the root directory to it, keeping the
 * previous root reachable below /oldroot.
 *
 * Returns a negative errno-style value if one of the mount() calls fails, otherwise whatever switch_root()
 * returns. */
static int switch_root_initramfs(void) {
        static const char initramfs[] = "/run/initramfs";

        /* Bind mount the directory onto itself ... */
        if (mount(initramfs, initramfs, NULL, MS_BIND, NULL) < 0)
                return log_error_errno(errno, "Failed to mount bind /run/initramfs on /run/initramfs: %m");

        /* ... and mark the resulting mount private. */
        if (mount(NULL, initramfs, NULL, MS_PRIVATE, NULL) < 0)
                return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m");

        /* Use MS_BIND for the switch: processes holding open file descriptors may still be lurking around,
         * and /run/initramfs/shutdown will deal with them. For the same reason the old root is not
         * detached, as /run/initramfs/shutdown needs to access it. */
        return switch_root(initramfs, "/oldroot", false, MS_BIND);
}
161
162 /* Read the following fields from /proc/meminfo:
163 *
164 * NFS_Unstable
165 * Writeback
166 * Dirty
167 *
168 * Return true if the sum of these fields is greater than the previous
169 * value input. For all other issues, report the failure and indicate that
170 * the sync is not making progress.
171 */
172 static bool sync_making_progress(unsigned long long *prev_dirty) {
173 _cleanup_fclose_ FILE *f = NULL;
174 unsigned long long val = 0;
175 bool r = false;
176
177 f = fopen("/proc/meminfo", "re");
178 if (!f)
179 return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
180
181 for (;;) {
182 _cleanup_free_ char *line = NULL;
183 unsigned long long ull = 0;
184 int q;
185
186 q = read_line(f, LONG_LINE_MAX, &line);
187 if (q < 0)
188 return log_warning_errno(q, "Failed to parse /proc/meminfo: %m");
189 if (q == 0)
190 break;
191
192 if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
193 continue;
194
195 errno = 0;
196 if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
197 if (errno != 0)
198 log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
199 else
200 log_warning("Failed to parse /proc/meminfo");
201
202 return false;
203 }
204
205 val += ull;
206 }
207
208 r = *prev_dirty > val;
209
210 *prev_dirty = val;
211
212 return r;
213 }
214
215 static void sync_with_progress(void) {
216 unsigned long long dirty = ULONG_LONG_MAX;
217 unsigned checks;
218 pid_t pid;
219 int r;
220
221 BLOCK_SIGNALS(SIGCHLD);
222
223 /* Due to the possiblity of the sync operation hanging, we fork a child process and monitor the progress. If
224 * the timeout lapses, the assumption is that that particular sync stalled. */
225
226 r = asynchronous_sync(&pid);
227 if (r < 0) {
228 log_error_errno(r, "Failed to fork sync(): %m");
229 return;
230 }
231
232 log_info("Syncing filesystems and block devices.");
233
234 /* Start monitoring the sync operation. If more than
235 * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
236 * we assume that the sync is stalled */
237 for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
238 r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
239 if (r == 0)
240 /* Sync finished without error.
241 * (The sync itself does not return an error code) */
242 return;
243 else if (r == -ETIMEDOUT) {
244 /* Reset the check counter if the "Dirty" value is
245 * decreasing */
246 if (sync_making_progress(&dirty))
247 checks = 0;
248 } else {
249 log_error_errno(r, "Failed to sync filesystems and block devices: %m");
250 return;
251 }
252 }
253
254 /* Only reached in the event of a timeout. We should issue a kill
255 * to the stray process. */
256 log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
257 (void) kill(pid, SIGKILL);
258 }
259
260 int main(int argc, char *argv[]) {
261 bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
262 bool in_container, use_watchdog = false, can_initrd;
263 _cleanup_free_ char *cgroup = NULL;
264 char *arguments[3];
265 int cmd, r, umount_log_level = LOG_INFO;
266 static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL};
267 char *watchdog_device;
268
269 /* The log target defaults to console, but the original systemd process will pass its log target in through a
270 * command line argument, which will override this default. Also, ensure we'll never log to the journal or
271 * syslog, as these logging daemons are either already dead or will die very soon. */
272
273 log_set_target(LOG_TARGET_CONSOLE);
274 log_set_prohibit_ipc(true);
275 log_parse_environment();
276
277 r = parse_argv(argc, argv);
278 if (r < 0)
279 goto error;
280
281 log_open();
282
283 umask(0022);
284
285 if (getpid_cached() != 1) {
286 log_error("Not executed by init (PID 1).");
287 r = -EPERM;
288 goto error;
289 }
290
291 if (streq(arg_verb, "reboot"))
292 cmd = RB_AUTOBOOT;
293 else if (streq(arg_verb, "poweroff"))
294 cmd = RB_POWER_OFF;
295 else if (streq(arg_verb, "halt"))
296 cmd = RB_HALT_SYSTEM;
297 else if (streq(arg_verb, "kexec"))
298 cmd = LINUX_REBOOT_CMD_KEXEC;
299 else if (streq(arg_verb, "exit"))
300 cmd = 0; /* ignored, just checking that arg_verb is valid */
301 else {
302 log_error("Unknown action '%s'.", arg_verb);
303 r = -EINVAL;
304 goto error;
305 }
306
307 (void) cg_get_root_path(&cgroup);
308 in_container = detect_container() > 0;
309
310 use_watchdog = getenv("WATCHDOG_USEC");
311 watchdog_device = getenv("WATCHDOG_DEVICE");
312 if (watchdog_device) {
313 r = watchdog_set_device(watchdog_device);
314 if (r < 0)
315 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m",
316 watchdog_device);
317 }
318
319 /* Lock us into memory */
320 (void) mlockall(MCL_CURRENT|MCL_FUTURE);
321
322 /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
323 * slow IO is processed here already and the final process killing spree is not impacted by processes
324 * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
325 * result. */
326 if (!in_container)
327 sync_with_progress();
328
329 disable_coredumps();
330
331 log_info("Sending SIGTERM to remaining processes...");
332 broadcast_signal(SIGTERM, true, true, arg_timeout);
333
334 log_info("Sending SIGKILL to remaining processes...");
335 broadcast_signal(SIGKILL, true, false, arg_timeout);
336
337 need_umount = !in_container;
338 need_swapoff = !in_container;
339 need_loop_detach = !in_container;
340 need_dm_detach = !in_container;
341 can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0;
342
343 /* Unmount all mountpoints, swaps, and loopback devices */
344 for (;;) {
345 bool changed = false;
346
347 if (use_watchdog)
348 watchdog_ping();
349
350 /* Let's trim the cgroup tree on each iteration so
351 that we leave an empty cgroup tree around, so that
352 container managers get a nice notify event when we
353 are down */
354 if (cgroup)
355 cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false);
356
357 if (need_umount) {
358 log_info("Unmounting file systems.");
359 r = umount_all(&changed, umount_log_level);
360 if (r == 0) {
361 need_umount = false;
362 log_info("All filesystems unmounted.");
363 } else if (r > 0)
364 log_info("Not all file systems unmounted, %d left.", r);
365 else
366 log_error_errno(r, "Failed to unmount file systems: %m");
367 }
368
369 if (need_swapoff) {
370 log_info("Deactivating swaps.");
371 r = swapoff_all(&changed);
372 if (r == 0) {
373 need_swapoff = false;
374 log_info("All swaps deactivated.");
375 } else if (r > 0)
376 log_info("Not all swaps deactivated, %d left.", r);
377 else
378 log_error_errno(r, "Failed to deactivate swaps: %m");
379 }
380
381 if (need_loop_detach) {
382 log_info("Detaching loop devices.");
383 r = loopback_detach_all(&changed, umount_log_level);
384 if (r == 0) {
385 need_loop_detach = false;
386 log_info("All loop devices detached.");
387 } else if (r > 0)
388 log_info("Not all loop devices detached, %d left.", r);
389 else
390 log_error_errno(r, "Failed to detach loop devices: %m");
391 }
392
393 if (need_dm_detach) {
394 log_info("Detaching DM devices.");
395 r = dm_detach_all(&changed, umount_log_level);
396 if (r == 0) {
397 need_dm_detach = false;
398 log_info("All DM devices detached.");
399 } else if (r > 0)
400 log_info("Not all DM devices detached, %d left.", r);
401 else
402 log_error_errno(r, "Failed to detach DM devices: %m");
403 }
404
405 if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach) {
406 log_info("All filesystems, swaps, loop devices and DM devices detached.");
407 /* Yay, done */
408 break;
409 }
410
411 if (!changed && umount_log_level == LOG_INFO && !can_initrd) {
412 /* There are things we cannot get rid of. Loop one more time
413 * with LOG_ERR to inform the user. Note that we don't need
414 * to do this if there is a initrd to switch to, because that
415 * one is likely to get rid of the remounting mounts. If not,
416 * it will log about them. */
417 umount_log_level = LOG_ERR;
418 continue;
419 }
420
421 /* If in this iteration we didn't manage to
422 * unmount/deactivate anything, we simply give up */
423 if (!changed) {
424 log_info("Cannot finalize remaining%s%s%s%s continuing.",
425 need_umount ? " file systems," : "",
426 need_swapoff ? " swap devices," : "",
427 need_loop_detach ? " loop devices," : "",
428 need_dm_detach ? " DM devices," : "");
429 break;
430 }
431
432 log_debug("Couldn't finalize remaining %s%s%s%s trying again.",
433 need_umount ? " file systems," : "",
434 need_swapoff ? " swap devices," : "",
435 need_loop_detach ? " loop devices," : "",
436 need_dm_detach ? " DM devices," : "");
437 }
438
439 /* We're done with the watchdog. */
440 watchdog_free_device();
441
442 arguments[0] = NULL;
443 arguments[1] = arg_verb;
444 arguments[2] = NULL;
445 execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments, NULL);
446
447 (void) rlimit_nofile_safe();
448
449 if (can_initrd) {
450 r = switch_root_initramfs();
451 if (r >= 0) {
452 argv[0] = (char*) "/shutdown";
453
454 setsid();
455 make_console_stdio();
456
457 log_info("Successfully changed into root pivot.\n"
458 "Returning to initrd...");
459
460 execv("/shutdown", argv);
461 log_error_errno(errno, "Failed to execute shutdown binary: %m");
462 } else
463 log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m");
464
465 }
466
467 if (need_umount || need_swapoff || need_loop_detach || need_dm_detach)
468 log_error("Failed to finalize %s%s%s%s ignoring",
469 need_umount ? " file systems," : "",
470 need_swapoff ? " swap devices," : "",
471 need_loop_detach ? " loop devices," : "",
472 need_dm_detach ? " DM devices," : "");
473
474 /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
475 * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
476 * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
477 * let's do it once more. Do not remove this sync, data corruption will result. */
478 if (!in_container)
479 sync_with_progress();
480
481 if (streq(arg_verb, "exit")) {
482 if (in_container)
483 return arg_exit_code;
484
485 cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */
486 }
487
488 switch (cmd) {
489
490 case LINUX_REBOOT_CMD_KEXEC:
491
492 if (!in_container) {
493 /* We cheat and exec kexec to avoid doing all its work */
494 log_info("Rebooting with kexec.");
495
496 r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL);
497 if (r == 0) {
498 const char * const args[] = {
499 KEXEC, "-e", NULL
500 };
501
502 /* Child */
503
504 execv(args[0], (char * const *) args);
505 _exit(EXIT_FAILURE);
506 }
507
508 /* If we are still running, then the kexec can't have worked, let's fall through */
509 }
510
511 cmd = RB_AUTOBOOT;
512 _fallthrough_;
513
514 case RB_AUTOBOOT:
515 (void) reboot_with_parameter(REBOOT_LOG);
516 log_info("Rebooting.");
517 break;
518
519 case RB_POWER_OFF:
520 log_info("Powering off.");
521 break;
522
523 case RB_HALT_SYSTEM:
524 log_info("Halting system.");
525 break;
526
527 default:
528 assert_not_reached("Unknown magic");
529 }
530
531 (void) reboot(cmd);
532 if (errno == EPERM && in_container) {
533 /* If we are in a container, and we lacked
534 * CAP_SYS_BOOT just exit, this will kill our
535 * container for good. */
536 log_info("Exiting container.");
537 return EXIT_SUCCESS;
538 }
539
540 r = log_error_errno(errno, "Failed to invoke reboot(): %m");
541
542 error:
543 log_emergency_errno(r, "Critical error while doing system shutdown: %m");
544 freeze();
545 }