]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/shutdown.c
tree-wide: drop license boilerplate
[thirdparty/systemd.git] / src / core / shutdown.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 This file is part of systemd.
4
5 Copyright 2010 ProFUSION embedded systems
6 ***/
7
8 #include <errno.h>
9 #include <getopt.h>
10 #include <linux/reboot.h>
11 #include <signal.h>
12 #include <stdbool.h>
13 #include <stdlib.h>
14 #include <sys/mman.h>
15 #include <sys/mount.h>
16 #include <sys/reboot.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19
20 #include "alloc-util.h"
21 #include "async.h"
22 #include "cgroup-util.h"
23 #include "def.h"
24 #include "exec-util.h"
25 #include "fd-util.h"
26 #include "fileio.h"
27 #include "killall.h"
28 #include "log.h"
29 #include "missing.h"
30 #include "parse-util.h"
31 #include "process-util.h"
32 #include "reboot-util.h"
33 #include "signal-util.h"
34 #include "string-util.h"
35 #include "switch-root.h"
36 #include "terminal-util.h"
37 #include "umount.h"
38 #include "util.h"
39 #include "virt.h"
40 #include "watchdog.h"
41
/* Upper bound for the check counter in sync_with_progress(); that counter is reset whenever
 * sync_making_progress() reports that writeback is still advancing. */
#define SYNC_PROGRESS_ATTEMPTS 3
/* Timeout handed to wait_for_terminate_with_timeout() for each single wait on the sync child. */
#define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC)

/* First non-option command line argument, stored by parse_argv(). main() accepts "reboot", "poweroff",
 * "halt", "kexec" and "exit". */
static char* arg_verb;
/* Value of --exit-code=; main() returns it for the "exit" verb when running in a container. */
static uint8_t arg_exit_code;
/* Value of --timeout=; main() passes it to broadcast_signal() for the SIGTERM and SIGKILL rounds. */
static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC;
48
49 static int parse_argv(int argc, char *argv[]) {
50 enum {
51 ARG_LOG_LEVEL = 0x100,
52 ARG_LOG_TARGET,
53 ARG_LOG_COLOR,
54 ARG_LOG_LOCATION,
55 ARG_EXIT_CODE,
56 ARG_TIMEOUT,
57 };
58
59 static const struct option options[] = {
60 { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
61 { "log-target", required_argument, NULL, ARG_LOG_TARGET },
62 { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
63 { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
64 { "exit-code", required_argument, NULL, ARG_EXIT_CODE },
65 { "timeout", required_argument, NULL, ARG_TIMEOUT },
66 {}
67 };
68
69 int c, r;
70
71 assert(argc >= 1);
72 assert(argv);
73
74 /* "-" prevents getopt from permuting argv[] and moving the verb away
75 * from argv[1]. Our interface to initrd promises it'll be there. */
76 while ((c = getopt_long(argc, argv, "-", options, NULL)) >= 0)
77 switch (c) {
78
79 case ARG_LOG_LEVEL:
80 r = log_set_max_level_from_string(optarg);
81 if (r < 0)
82 log_error_errno(r, "Failed to parse log level %s, ignoring.", optarg);
83
84 break;
85
86 case ARG_LOG_TARGET:
87 r = log_set_target_from_string(optarg);
88 if (r < 0)
89 log_error_errno(r, "Failed to parse log target %s, ignoring", optarg);
90
91 break;
92
93 case ARG_LOG_COLOR:
94
95 if (optarg) {
96 r = log_show_color_from_string(optarg);
97 if (r < 0)
98 log_error_errno(r, "Failed to parse log color setting %s, ignoring", optarg);
99 } else
100 log_show_color(true);
101
102 break;
103
104 case ARG_LOG_LOCATION:
105 if (optarg) {
106 r = log_show_location_from_string(optarg);
107 if (r < 0)
108 log_error_errno(r, "Failed to parse log location setting %s, ignoring", optarg);
109 } else
110 log_show_location(true);
111
112 break;
113
114 case ARG_EXIT_CODE:
115 r = safe_atou8(optarg, &arg_exit_code);
116 if (r < 0)
117 log_error_errno(r, "Failed to parse exit code %s, ignoring", optarg);
118
119 break;
120
121 case ARG_TIMEOUT:
122 r = parse_sec(optarg, &arg_timeout);
123 if (r < 0)
124 log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring", optarg);
125
126 break;
127
128 case '\001':
129 if (!arg_verb)
130 arg_verb = optarg;
131 else
132 log_error("Excess arguments, ignoring");
133 break;
134
135 case '?':
136 return -EINVAL;
137
138 default:
139 assert_not_reached("Unhandled option code.");
140 }
141
142 if (!arg_verb) {
143 log_error("Verb argument missing.");
144 return -EINVAL;
145 }
146
147 return 0;
148 }
149
/* Bind-mounts /run/initramfs onto itself, marks that mount private and then hands over to switch_root()
 * with "/oldroot" as the place for the previous root.
 *
 * Returns the result of log_error_errno() if either mount() call fails, otherwise whatever switch_root()
 * returns. */
static int switch_root_initramfs(void) {
        int r;

        r = mount("/run/initramfs", "/run/initramfs", NULL, MS_BIND, NULL);
        if (r < 0)
                return log_error_errno(errno, "Failed to mount bind /run/initramfs on /run/initramfs: %m");

        r = mount(NULL, "/run/initramfs", NULL, MS_PRIVATE, NULL);
        if (r < 0)
                return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m");

        /* MS_BIND is used for the switch because processes holding open file descriptors may still be
         * around; /run/initramfs/shutdown is expected to deal with them. The old root is deliberately not
         * detached (third argument is false) since /run/initramfs/shutdown needs to reach it. */
        r = switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
        return r;
}
163
164 /* Read the following fields from /proc/meminfo:
165 *
166 * NFS_Unstable
167 * Writeback
168 * Dirty
169 *
170 * Return true if the sum of these fields is greater than the previous
171 * value input. For all other issues, report the failure and indicate that
172 * the sync is not making progress.
173 */
174 static bool sync_making_progress(unsigned long long *prev_dirty) {
175 _cleanup_fclose_ FILE *f = NULL;
176 char line[LINE_MAX];
177 bool r = false;
178 unsigned long long val = 0;
179
180 f = fopen("/proc/meminfo", "re");
181 if (!f)
182 return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
183
184 FOREACH_LINE(line, f, log_warning_errno(errno, "Failed to parse /proc/meminfo: %m")) {
185 unsigned long long ull = 0;
186
187 if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
188 continue;
189
190 errno = 0;
191 if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
192 if (errno != 0)
193 log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
194 else
195 log_warning("Failed to parse /proc/meminfo");
196
197 return false;
198 }
199
200 val += ull;
201 }
202
203 r = *prev_dirty > val;
204
205 *prev_dirty = val;
206
207 return r;
208 }
209
210 static void sync_with_progress(void) {
211 unsigned long long dirty = ULONG_LONG_MAX;
212 unsigned checks;
213 pid_t pid;
214 int r;
215
216 BLOCK_SIGNALS(SIGCHLD);
217
218 /* Due to the possiblity of the sync operation hanging, we fork a child process and monitor the progress. If
219 * the timeout lapses, the assumption is that that particular sync stalled. */
220
221 r = asynchronous_sync(&pid);
222 if (r < 0) {
223 log_error_errno(r, "Failed to fork sync(): %m");
224 return;
225 }
226
227 log_info("Syncing filesystems and block devices.");
228
229 /* Start monitoring the sync operation. If more than
230 * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
231 * we assume that the sync is stalled */
232 for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
233 r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
234 if (r == 0)
235 /* Sync finished without error.
236 * (The sync itself does not return an error code) */
237 return;
238 else if (r == -ETIMEDOUT) {
239 /* Reset the check counter if the "Dirty" value is
240 * decreasing */
241 if (sync_making_progress(&dirty))
242 checks = 0;
243 } else {
244 log_error_errno(r, "Failed to sync filesystems and block devices: %m");
245 return;
246 }
247 }
248
249 /* Only reached in the event of a timeout. We should issue a kill
250 * to the stray process. */
251 log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
252 (void) kill(pid, SIGKILL);
253 }
254
255 int main(int argc, char *argv[]) {
256 bool need_umount, need_swapoff, need_loop_detach, need_dm_detach;
257 bool in_container, use_watchdog = false, can_initrd;
258 _cleanup_free_ char *cgroup = NULL;
259 char *arguments[3];
260 int cmd, r, umount_log_level = LOG_INFO;
261 static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL};
262 char *watchdog_device;
263
264 /* The log target defaults to console, but the original systemd process will pass its log target in through a
265 * command line argument, which will override this default. Also, ensure we'll never log to the journal or
266 * syslog, as these logging daemons are either already dead or will die very soon. */
267
268 log_set_target(LOG_TARGET_CONSOLE);
269 log_set_prohibit_ipc(true);
270 log_parse_environment();
271
272 r = parse_argv(argc, argv);
273 if (r < 0)
274 goto error;
275
276 log_open();
277
278 umask(0022);
279
280 if (getpid_cached() != 1) {
281 log_error("Not executed by init (PID 1).");
282 r = -EPERM;
283 goto error;
284 }
285
286 if (streq(arg_verb, "reboot"))
287 cmd = RB_AUTOBOOT;
288 else if (streq(arg_verb, "poweroff"))
289 cmd = RB_POWER_OFF;
290 else if (streq(arg_verb, "halt"))
291 cmd = RB_HALT_SYSTEM;
292 else if (streq(arg_verb, "kexec"))
293 cmd = LINUX_REBOOT_CMD_KEXEC;
294 else if (streq(arg_verb, "exit"))
295 cmd = 0; /* ignored, just checking that arg_verb is valid */
296 else {
297 log_error("Unknown action '%s'.", arg_verb);
298 r = -EINVAL;
299 goto error;
300 }
301
302 (void) cg_get_root_path(&cgroup);
303 in_container = detect_container() > 0;
304
305 use_watchdog = !!getenv("WATCHDOG_USEC");
306 watchdog_device = getenv("WATCHDOG_DEVICE");
307 if (watchdog_device) {
308 r = watchdog_set_device(watchdog_device);
309 if (r < 0)
310 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m",
311 watchdog_device);
312 }
313
314 /* Lock us into memory */
315 (void) mlockall(MCL_CURRENT|MCL_FUTURE);
316
317 /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
318 * slow IO is processed here already and the final process killing spree is not impacted by processes
319 * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
320 * result. */
321 if (!in_container)
322 sync_with_progress();
323
324 disable_coredumps();
325
326 log_info("Sending SIGTERM to remaining processes...");
327 broadcast_signal(SIGTERM, true, true, arg_timeout);
328
329 log_info("Sending SIGKILL to remaining processes...");
330 broadcast_signal(SIGKILL, true, false, arg_timeout);
331
332 need_umount = !in_container;
333 need_swapoff = !in_container;
334 need_loop_detach = !in_container;
335 need_dm_detach = !in_container;
336 can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0;
337
338 /* Unmount all mountpoints, swaps, and loopback devices */
339 for (;;) {
340 bool changed = false;
341
342 if (use_watchdog)
343 watchdog_ping();
344
345 /* Let's trim the cgroup tree on each iteration so
346 that we leave an empty cgroup tree around, so that
347 container managers get a nice notify event when we
348 are down */
349 if (cgroup)
350 cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false);
351
352 if (need_umount) {
353 log_info("Unmounting file systems.");
354 r = umount_all(&changed, umount_log_level);
355 if (r == 0) {
356 need_umount = false;
357 log_info("All filesystems unmounted.");
358 } else if (r > 0)
359 log_info("Not all file systems unmounted, %d left.", r);
360 else
361 log_error_errno(r, "Failed to unmount file systems: %m");
362 }
363
364 if (need_swapoff) {
365 log_info("Deactivating swaps.");
366 r = swapoff_all(&changed);
367 if (r == 0) {
368 need_swapoff = false;
369 log_info("All swaps deactivated.");
370 } else if (r > 0)
371 log_info("Not all swaps deactivated, %d left.", r);
372 else
373 log_error_errno(r, "Failed to deactivate swaps: %m");
374 }
375
376 if (need_loop_detach) {
377 log_info("Detaching loop devices.");
378 r = loopback_detach_all(&changed, umount_log_level);
379 if (r == 0) {
380 need_loop_detach = false;
381 log_info("All loop devices detached.");
382 } else if (r > 0)
383 log_info("Not all loop devices detached, %d left.", r);
384 else
385 log_error_errno(r, "Failed to detach loop devices: %m");
386 }
387
388 if (need_dm_detach) {
389 log_info("Detaching DM devices.");
390 r = dm_detach_all(&changed, umount_log_level);
391 if (r == 0) {
392 need_dm_detach = false;
393 log_info("All DM devices detached.");
394 } else if (r > 0)
395 log_info("Not all DM devices detached, %d left.", r);
396 else
397 log_error_errno(r, "Failed to detach DM devices: %m");
398 }
399
400 if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach) {
401 log_info("All filesystems, swaps, loop devices and DM devices detached.");
402 /* Yay, done */
403 break;
404 }
405
406 if (!changed && umount_log_level == LOG_INFO && !can_initrd) {
407 /* There are things we cannot get rid of. Loop one more time
408 * with LOG_ERR to inform the user. Note that we don't need
409 * to do this if there is a initrd to switch to, because that
410 * one is likely to get rid of the remounting mounts. If not,
411 * it will log about them. */
412 umount_log_level = LOG_ERR;
413 continue;
414 }
415
416 /* If in this iteration we didn't manage to
417 * unmount/deactivate anything, we simply give up */
418 if (!changed) {
419 log_info("Cannot finalize remaining%s%s%s%s continuing.",
420 need_umount ? " file systems," : "",
421 need_swapoff ? " swap devices," : "",
422 need_loop_detach ? " loop devices," : "",
423 need_dm_detach ? " DM devices," : "");
424 break;
425 }
426
427 log_debug("Couldn't finalize remaining %s%s%s%s trying again.",
428 need_umount ? " file systems," : "",
429 need_swapoff ? " swap devices," : "",
430 need_loop_detach ? " loop devices," : "",
431 need_dm_detach ? " DM devices," : "");
432 }
433
434 /* We're done with the watchdog. */
435 watchdog_free_device();
436
437 arguments[0] = NULL;
438 arguments[1] = arg_verb;
439 arguments[2] = NULL;
440 execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments);
441
442 if (can_initrd) {
443 r = switch_root_initramfs();
444 if (r >= 0) {
445 argv[0] = (char*) "/shutdown";
446
447 setsid();
448 make_console_stdio();
449
450 log_info("Successfully changed into root pivot.\n"
451 "Returning to initrd...");
452
453 execv("/shutdown", argv);
454 log_error_errno(errno, "Failed to execute shutdown binary: %m");
455 } else
456 log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m");
457
458 }
459
460 if (need_umount || need_swapoff || need_loop_detach || need_dm_detach)
461 log_error("Failed to finalize %s%s%s%s ignoring",
462 need_umount ? " file systems," : "",
463 need_swapoff ? " swap devices," : "",
464 need_loop_detach ? " loop devices," : "",
465 need_dm_detach ? " DM devices," : "");
466
467 /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
468 * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
469 * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
470 * let's do it once more. Do not remove this sync, data corruption will result. */
471 if (!in_container)
472 sync_with_progress();
473
474 if (streq(arg_verb, "exit")) {
475 if (in_container)
476 return arg_exit_code;
477
478 cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */
479 }
480
481 switch (cmd) {
482
483 case LINUX_REBOOT_CMD_KEXEC:
484
485 if (!in_container) {
486 /* We cheat and exec kexec to avoid doing all its work */
487 log_info("Rebooting with kexec.");
488
489 r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL);
490 if (r == 0) {
491 const char * const args[] = {
492 KEXEC, "-e", NULL
493 };
494
495 /* Child */
496
497 execv(args[0], (char * const *) args);
498 _exit(EXIT_FAILURE);
499 }
500
501 /* If we are still running, then the kexec can't have worked, let's fall through */
502 }
503
504 cmd = RB_AUTOBOOT;
505 _fallthrough_;
506
507 case RB_AUTOBOOT:
508 (void) reboot_with_parameter(REBOOT_LOG);
509 log_info("Rebooting.");
510 break;
511
512 case RB_POWER_OFF:
513 log_info("Powering off.");
514 break;
515
516 case RB_HALT_SYSTEM:
517 log_info("Halting system.");
518 break;
519
520 default:
521 assert_not_reached("Unknown magic");
522 }
523
524 (void) reboot(cmd);
525 if (errno == EPERM && in_container) {
526 /* If we are in a container, and we lacked
527 * CAP_SYS_BOOT just exit, this will kill our
528 * container for good. */
529 log_info("Exiting container.");
530 return EXIT_SUCCESS;
531 }
532
533 r = log_error_errno(errno, "Failed to invoke reboot(): %m");
534
535 error:
536 log_emergency_errno(r, "Critical error while doing system shutdown: %m");
537 freeze();
538 }