src/shutdown/shutdown.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2 /***
   3   Copyright © 2010 ProFUSION embedded systems
   4 ***/
   5
   6 #include <errno.h>
   7 #include <getopt.h>
   8 #include <linux/reboot.h>
   9 #include <stdbool.h>
  10 #include <stdlib.h>
  11 #include <sys/mman.h>
  12 #include <sys/mount.h>
  13 #include <sys/reboot.h>
  14 #include <sys/stat.h>
  15 #include <unistd.h>
  16
  17 #include "alloc-util.h"
  18 #include "async.h"
  19 #include "binfmt-util.h"
  20 #include "cgroup-setup.h"
  21 #include "cgroup-util.h"
  22 #include "def.h"
  23 #include "exec-util.h"
  24 #include "fd-util.h"
  25 #include "fileio.h"
  26 #include "killall.h"
  27 #include "log.h"
  28 #include "parse-util.h"
  29 #include "process-util.h"
  30 #include "reboot-util.h"
  31 #include "rlimit-util.h"
  32 #include "signal-util.h"
  33 #include "string-util.h"
  34 #include "switch-root.h"
  35 #include "sysctl-util.h"
  36 #include "terminal-util.h"
  37 #include "umount.h"
  38 #include "util.h"
  39 #include "virt.h"
  40 #include "watchdog.h"
  41
  42 #define SYNC_PROGRESS_ATTEMPTS 3 /* timed-out waits without progress tolerated by sync_with_progress() */
  43 #define SYNC_TIMEOUT_USEC (10*USEC_PER_SEC) /* timeout passed to each wait on the sync child */
  44
  45 static char* arg_verb; /* first positional argument seen by parse_argv(); compared against the known verbs in main() */
  46 static uint8_t arg_exit_code; /* set from --exit-code; returned by main() for the "exit" verb inside a container */
  47 static usec_t arg_timeout = DEFAULT_TIMEOUT_USEC; /* set from --timeout; passed to broadcast_signal() in main() */
  48
  49 static int parse_argv(int argc, char *argv[]) {
  50         enum {
  51                 ARG_LOG_LEVEL = 0x100,
  52                 ARG_LOG_TARGET,
  53                 ARG_LOG_COLOR,
  54                 ARG_LOG_LOCATION,
  55                 ARG_LOG_TIME,
  56                 ARG_EXIT_CODE,
  57                 ARG_TIMEOUT,
  58         };
  59
  60         static const struct option options[] = {
  61                 { "log-level",     required_argument, NULL, ARG_LOG_LEVEL    },
  62                 { "log-target",    required_argument, NULL, ARG_LOG_TARGET   },
  63                 { "log-color",     optional_argument, NULL, ARG_LOG_COLOR    },
  64                 { "log-location",  optional_argument, NULL, ARG_LOG_LOCATION },
  65                 { "log-time",      optional_argument, NULL, ARG_LOG_TIME     },
  66                 { "exit-code",     required_argument, NULL, ARG_EXIT_CODE    },
  67                 { "timeout",       required_argument, NULL, ARG_TIMEOUT      },
  68                 {}
  69         };
  70
  71         int c, r;
  72
  73         assert(argc >= 1);
  74         assert(argv);
  75
  76         /* "-" prevents getopt from permuting argv[] and moving the verb away
  77          * from argv[1]. Our interface to initrd promises it'll be there. */
  78         while ((c = getopt_long(argc, argv, "-", options, NULL)) >= 0)
  79                 switch (c) {
  80
  81                 case ARG_LOG_LEVEL:
  82                         r = log_set_max_level_from_string(optarg);
  83                         if (r < 0)
  84                                 log_error_errno(r, "Failed to parse log level %s, ignoring: %m", optarg);
  85
  86                         break;
  87
  88                 case ARG_LOG_TARGET:
  89                         r = log_set_target_from_string(optarg);
  90                         if (r < 0)
  91                                 log_error_errno(r, "Failed to parse log target %s, ignoring: %m", optarg);
  92
  93                         break;
  94
  95                 case ARG_LOG_COLOR:
  96
  97                         if (optarg) {
  98                                 r = log_show_color_from_string(optarg);
  99                                 if (r < 0)
 100                                         log_error_errno(r, "Failed to parse log color setting %s, ignoring: %m", optarg);
 101                         } else
 102                                 log_show_color(true);
 103
 104                         break;
 105
 106                 case ARG_LOG_LOCATION:
 107                         if (optarg) {
 108                                 r = log_show_location_from_string(optarg);
 109                                 if (r < 0)
 110                                         log_error_errno(r, "Failed to parse log location setting %s, ignoring: %m", optarg);
 111                         } else
 112                                 log_show_location(true);
 113
 114                         break;
 115
 116                 case ARG_LOG_TIME:
 117
 118                         if (optarg) {
 119                                 r = log_show_time_from_string(optarg);
 120                                 if (r < 0)
 121                                         log_error_errno(r, "Failed to parse log time setting %s, ignoring: %m", optarg);
 122                         } else
 123                                 log_show_time(true);
 124
 125                         break;
 126
 127                 case ARG_EXIT_CODE:
 128                         r = safe_atou8(optarg, &arg_exit_code);
 129                         if (r < 0)
 130                                 log_error_errno(r, "Failed to parse exit code %s, ignoring: %m", optarg);
 131
 132                         break;
 133
 134                 case ARG_TIMEOUT:
 135                         r = parse_sec(optarg, &arg_timeout);
 136                         if (r < 0)
 137                                 log_error_errno(r, "Failed to parse shutdown timeout %s, ignoring: %m", optarg);
 138
 139                         break;
 140
 141                 case '\001':
 142                         if (!arg_verb)
 143                                 arg_verb = optarg;
 144                         else
 145                                 log_error("Excess arguments, ignoring");
 146                         break;
 147
 148                 case '?':
 149                         return -EINVAL;
 150
 151                 default:
 152                         assert_not_reached("Unhandled option code.");
 153                 }
 154
 155         if (!arg_verb)
 156                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
 157                                        "Verb argument missing.");
 158
 159         return 0;
 160 }
 161
 162 static int switch_root_initramfs(void) {
 163         if (mount("/run/initramfs", "/run/initramfs", NULL, MS_BIND, NULL) < 0)
 164                 return log_error_errno(errno, "Failed to mount bind /run/initramfs on /run/initramfs: %m");
 165
 166         if (mount(NULL, "/run/initramfs", NULL, MS_PRIVATE, NULL) < 0)
 167                 return log_error_errno(errno, "Failed to make /run/initramfs private mount: %m");
 168
 169         /* switch_root with MS_BIND, because there might still be processes lurking around, which have open file descriptors.
 170          * /run/initramfs/shutdown will take care of these.
 171          * Also do not detach the old root, because /run/initramfs/shutdown needs to access it.
 172          */
 173         return switch_root("/run/initramfs", "/oldroot", false, MS_BIND);
 174 }
 175
 176 /* Read the following fields from /proc/meminfo:
 177  *
 178  *  NFS_Unstable
 179  *  Writeback
 180  *  Dirty
 181  *
 182  * Return true if the sum of these fields is greater than the previous
 183  * value input. For all other issues, report the failure and indicate that
 184  * the sync is not making progress.
 185  */
 186 static int sync_making_progress(unsigned long long *prev_dirty) {
 187         _cleanup_fclose_ FILE *f = NULL;
 188         unsigned long long val = 0;
 189         int ret;
 190
 191         f = fopen("/proc/meminfo", "re");
 192         if (!f)
 193                 return log_warning_errno(errno, "Failed to open /proc/meminfo: %m");
 194
 195         for (;;) {
 196                 _cleanup_free_ char *line = NULL;
 197                 unsigned long long ull = 0;
 198                 int q;
 199
 200                 q = read_line(f, LONG_LINE_MAX, &line);
 201                 if (q < 0)
 202                         return log_warning_errno(q, "Failed to parse /proc/meminfo: %m");
 203                 if (q == 0)
 204                         break;
 205
 206                 if (!first_word(line, "NFS_Unstable:") && !first_word(line, "Writeback:") && !first_word(line, "Dirty:"))
 207                         continue;
 208
 209                 errno = 0;
 210                 if (sscanf(line, "%*s %llu %*s", &ull) != 1) {
 211                         if (errno != 0)
 212                                 log_warning_errno(errno, "Failed to parse /proc/meminfo: %m");
 213                         else
 214                                 log_warning("Failed to parse /proc/meminfo");
 215
 216                         return false;
 217                 }
 218
 219                 val += ull;
 220         }
 221
 222         ret = *prev_dirty > val;
 223         *prev_dirty = val;
 224         return ret;
 225 }
 226
 227 static void sync_with_progress(void) {
 228         unsigned long long dirty = ULLONG_MAX;
 229         unsigned checks;
 230         pid_t pid;
 231         int r;
 232
 233         BLOCK_SIGNALS(SIGCHLD);
 234
 235         /* Due to the possibility of the sync operation hanging, we fork a child process and monitor
 236          * the progress. If the timeout lapses, the assumption is that the particular sync stalled. */
 237
 238         r = asynchronous_sync(&pid);
 239         if (r < 0) {
 240                 log_error_errno(r, "Failed to fork sync(): %m");
 241                 return;
 242         }
 243
 244         log_info("Syncing filesystems and block devices.");
 245
 246         /* Start monitoring the sync operation. If more than
 247          * SYNC_PROGRESS_ATTEMPTS lapse without progress being made,
 248          * we assume that the sync is stalled */
 249         for (checks = 0; checks < SYNC_PROGRESS_ATTEMPTS; checks++) {
 250                 r = wait_for_terminate_with_timeout(pid, SYNC_TIMEOUT_USEC);
 251                 if (r == 0)
 252                         /* Sync finished without error.
 253                          * (The sync itself does not return an error code) */
 254                         return;
 255                 else if (r == -ETIMEDOUT) {
 256                         /* Reset the check counter if the "Dirty" value is
 257                          * decreasing */
 258                         if (sync_making_progress(&dirty) > 0)
 259                                 checks = 0;
 260                 } else {
 261                         log_error_errno(r, "Failed to sync filesystems and block devices: %m");
 262                         return;
 263                 }
 264         }
 265
 266         /* Only reached in the event of a timeout. We should issue a kill
 267          * to the stray process. */
 268         log_error("Syncing filesystems and block devices - timed out, issuing SIGKILL to PID "PID_FMT".", pid);
 269         (void) kill(pid, SIGKILL);
 270 }
 271
 272 static int read_current_sysctl_printk_log_level(void) {
 273         _cleanup_free_ char *sysctl_printk_vals = NULL, *sysctl_printk_curr = NULL;
 274         int current_lvl;
 275         const char *p;
 276         int r;
 277
 278         r = sysctl_read("kernel/printk", &sysctl_printk_vals);
 279         if (r < 0)
 280                 return log_debug_errno(r, "Cannot read sysctl kernel.printk: %m");
 281
 282         p = sysctl_printk_vals;
 283         r = extract_first_word(&p, &sysctl_printk_curr, NULL, 0);
 284         if (r < 0)
 285                 return log_debug_errno(r, "Failed to split out kernel printk priority: %m");
 286         if (r == 0)
 287                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Short read while reading kernel.printk sysctl");
 288
 289         r = safe_atoi(sysctl_printk_curr, &current_lvl);
 290         if (r < 0)
 291                 return log_debug_errno(r, "Failed to parse kernel.printk sysctl: %s", sysctl_printk_vals);
 292
 293         return current_lvl;
 294 }
 295
 296 static void bump_sysctl_printk_log_level(int min_level) {
 297         int current_lvl, r;
 298
 299         /* Set the logging level to be able to see messages with log level smaller or equal to min_level */
 300
 301         current_lvl = read_current_sysctl_printk_log_level();
 302         if (current_lvl < 0 || current_lvl >= min_level + 1)
 303                 return;
 304
 305         r = sysctl_writef("kernel/printk", "%i", min_level + 1);
 306         if (r < 0)
 307                 log_debug_errno(r, "Failed to bump kernel.printk to %i: %m", min_level + 1);
 308 }
 309
 310 int main(int argc, char *argv[]) {
             /* Final stage of system shutdown. Must run as PID 1. In order, this function: parses the
              * command line, maps the verb to a reboot() command, syncs, sends SIGTERM and SIGKILL to the
              * remaining processes, repeatedly unmounts file systems / deactivates swaps / detaches loop,
              * MD and DM devices until done or no further progress is made, runs the executables in
              * SYSTEM_SHUTDOWN_PATH, optionally switches root to /run/initramfs and executes /shutdown
              * there, syncs again and finally calls reboot().
              *
              * It returns only in a container: with arg_exit_code for the "exit" verb, or with
              * EXIT_SUCCESS if reboot() failed with EPERM. Every other failure ends up at the "error"
              * label, which logs at emergency level and calls freeze(). */
 311         bool need_umount, need_swapoff, need_loop_detach, need_dm_detach, need_md_detach, in_container, use_watchdog = false, can_initrd;
 312         _cleanup_free_ char *cgroup = NULL;
 313         char *arguments[3], *watchdog_device;
 314         int cmd, r, umount_log_level = LOG_INFO;
 315         static const char* const dirs[] = {SYSTEM_SHUTDOWN_PATH, NULL};
 316
 317         /* The log target defaults to console, but the original systemd process will pass its log target in through a
 318          * command line argument, which will override this default. Also, ensure we'll never log to the journal or
 319          * syslog, as these logging daemons are either already dead or will die very soon. */
 320
 321         log_set_target(LOG_TARGET_CONSOLE);
 322         log_set_prohibit_ipc(true);
 323         log_parse_environment();
 324
 325         r = parse_argv(argc, argv);
 326         if (r < 0)
 327                 goto error;
 328
 329         log_open();
 330
 331         umask(0022);
 332
 333         if (getpid_cached() != 1) {
 334                 log_error("Not executed by init (PID 1).");
 335                 r = -EPERM;
 336                 goto error;
 337         }
 338
             /* Translate the verb into the command later passed to reboot(). */
 339         if (streq(arg_verb, "reboot"))
 340                 cmd = RB_AUTOBOOT;
 341         else if (streq(arg_verb, "poweroff"))
 342                 cmd = RB_POWER_OFF;
 343         else if (streq(arg_verb, "halt"))
 344                 cmd = RB_HALT_SYSTEM;
 345         else if (streq(arg_verb, "kexec"))
 346                 cmd = LINUX_REBOOT_CMD_KEXEC;
 347         else if (streq(arg_verb, "exit"))
 348                 cmd = 0; /* ignored, just checking that arg_verb is valid */
 349         else {
 350                 log_error("Unknown action '%s'.", arg_verb);
 351                 r = -EINVAL;
 352                 goto error;
 353         }
 354
             /* Failure to determine the cgroup root is ignored: cgroup then stays NULL and the
              * cg_trim() call in the loop below is skipped. */
 355         (void) cg_get_root_path(&cgroup);
 356         in_container = detect_container() > 0;
 357
 358         /* If the logging messages are going to KMSG, and if we are not running from a container, then try to
 359          * update the sysctl kernel.printk current value in order to see "info" messages; This current log
 360          * level is not updated if already big enough.
 361          */
 362         if (!in_container &&
 363             IN_SET(log_get_target(),
 364                    LOG_TARGET_AUTO,
 365                    LOG_TARGET_JOURNAL_OR_KMSG,
 366                    LOG_TARGET_SYSLOG_OR_KMSG,
 367                    LOG_TARGET_KMSG))
 368                 bump_sysctl_printk_log_level(LOG_WARNING);
 369
             /* Only the presence of $WATCHDOG_USEC is evaluated here; its value is not parsed. */
 370         use_watchdog = getenv("WATCHDOG_USEC");
 371         watchdog_device = getenv("WATCHDOG_DEVICE");
 372         if (watchdog_device) {
 373                 r = watchdog_set_device(watchdog_device);
 374                 if (r < 0)
 375                         log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m",
 376                                           watchdog_device);
 377         }
 378
 379         /* Lock us into memory */
 380         (void) mlockall(MCL_CURRENT|MCL_FUTURE);
 381
 382         /* Synchronize everything that is not written to disk yet at this point already. This is a good idea so that
 383          * slow IO is processed here already and the final process killing spree is not impacted by processes
 384          * desperately trying to sync IO to disk within their timeout. Do not remove this sync, data corruption will
 385          * result. */
 386         if (!in_container)
 387                 sync_with_progress();
 388
 389         disable_coredumps();
 390         disable_binfmt();
 391
 392         log_info("Sending SIGTERM to remaining processes...");
 393         broadcast_signal(SIGTERM, true, true, arg_timeout);
 394
 395         log_info("Sending SIGKILL to remaining processes...");
 396         broadcast_signal(SIGKILL, true, false, arg_timeout);
 397
             /* In a container none of the unmount/detach work is attempted, and we never hand over to
              * an initrd. */
 398         need_umount = !in_container;
 399         need_swapoff = !in_container;
 400         need_loop_detach = !in_container;
 401         need_dm_detach = !in_container;
 402         need_md_detach = !in_container;
 403         can_initrd = !in_container && !in_initrd() && access("/run/initramfs/shutdown", X_OK) == 0;
 404
 405         /* Unmount all mountpoints, deactivate swaps, and detach loopback, MD and DM devices */
 406         for (;;) {
                     /* Handed to each helper below and checked afterwards to tell whether this
                      * iteration achieved anything. */
 407                 bool changed = false;
 408
 409                 if (use_watchdog)
 410                         (void) watchdog_ping();
 411
 412                 /* Let's trim the cgroup tree on each iteration so
 413                    that we leave an empty cgroup tree around, so that
 414                    container managers get a nice notify event when we
 415                    are down */
 416                 if (cgroup)
 417                         (void) cg_trim(SYSTEMD_CGROUP_CONTROLLER, cgroup, false);
 418
                     /* Each step below follows the same pattern: 0 means nothing is left and the
                      * step is not repeated, > 0 is the number of items left, < 0 is an error. */
 419                 if (need_umount) {
 420                         log_info("Unmounting file systems.");
 421                         r = umount_all(&changed, umount_log_level);
 422                         if (r == 0) {
 423                                 need_umount = false;
 424                                 log_info("All filesystems unmounted.");
 425                         } else if (r > 0)
 426                                 log_info("Not all file systems unmounted, %d left.", r);
 427                         else
 428                                 log_error_errno(r, "Failed to unmount file systems: %m");
 429                 }
 430
 431                 if (need_swapoff) {
 432                         log_info("Deactivating swaps.");
 433                         r = swapoff_all(&changed);
 434                         if (r == 0) {
 435                                 need_swapoff = false;
 436                                 log_info("All swaps deactivated.");
 437                         } else if (r > 0)
 438                                 log_info("Not all swaps deactivated, %d left.", r);
 439                         else
 440                                 log_error_errno(r, "Failed to deactivate swaps: %m");
 441                 }
 442
 443                 if (need_loop_detach) {
 444                         log_info("Detaching loop devices.");
 445                         r = loopback_detach_all(&changed, umount_log_level);
 446                         if (r == 0) {
 447                                 need_loop_detach = false;
 448                                 log_info("All loop devices detached.");
 449                         } else if (r > 0)
 450                                 log_info("Not all loop devices detached, %d left.", r);
 451                         else
 452                                 log_error_errno(r, "Failed to detach loop devices: %m");
 453                 }
 454
 455                 if (need_md_detach) {
 456                         log_info("Stopping MD devices.");
 457                         r = md_detach_all(&changed, umount_log_level);
 458                         if (r == 0) {
 459                                 need_md_detach = false;
 460                                 log_info("All MD devices stopped.");
 461                         } else if (r > 0)
 462                                 log_info("Not all MD devices stopped, %d left.", r);
 463                         else
 464                                 log_error_errno(r, "Failed to stop MD devices: %m");
 465                 }
 466
 467                 if (need_dm_detach) {
 468                         log_info("Detaching DM devices.");
 469                         r = dm_detach_all(&changed, umount_log_level);
 470                         if (r == 0) {
 471                                 need_dm_detach = false;
 472                                 log_info("All DM devices detached.");
 473                         } else if (r > 0)
 474                                 log_info("Not all DM devices detached, %d left.", r);
 475                         else
 476                                 log_error_errno(r, "Failed to detach DM devices: %m");
 477                 }
 478
 479                 if (!need_umount && !need_swapoff && !need_loop_detach && !need_dm_detach
 480                             && !need_md_detach) {
 481                         log_info("All filesystems, swaps, loop devices, MD devices and DM devices detached.");
 482                         /* Yay, done */
 483                         break;
 484                 }
 485
 486                 if (!changed && umount_log_level == LOG_INFO && !can_initrd) {
 487                         /* There are things we cannot get rid of. Loop one more time
 488                          * with LOG_ERR to inform the user. Note that we don't need
 489                          * to do this if there is an initrd to switch to, because that
 490                          * one is likely to get rid of the remaining mounts. If not,
 491                          * it will log about them. */
 492                         umount_log_level = LOG_ERR;
 493                         continue;
 494                 }
 495
 496                 /* If in this iteration we didn't manage to
 497                  * unmount/deactivate anything, we simply give up */
 498                 if (!changed) {
 499                         log_info("Cannot finalize remaining%s%s%s%s%s continuing.",
 500                                  need_umount ? " file systems," : "",
 501                                  need_swapoff ? " swap devices," : "",
 502                                  need_loop_detach ? " loop devices," : "",
 503                                  need_dm_detach ? " DM devices," : "",
 504                                  need_md_detach ? " MD devices," : "");
 505                         break;
 506                 }
 507
                     /* NOTE(review): unlike the two sibling messages, this format string has a space
                      * after "remaining" while every inserted item also starts with one, so the
                      * output contains a double space. */
 508                 log_debug("Couldn't finalize remaining %s%s%s%s%s trying again.",
 509                           need_umount ? " file systems," : "",
 510                           need_swapoff ? " swap devices," : "",
 511                           need_loop_detach ? " loop devices," : "",
 512                           need_dm_detach ? " DM devices," : "",
 513                           need_md_detach ? " MD devices," : "");
 514         }
 515
 516         /* We're done with the watchdog. */
 517         watchdog_free_device();
 518
             /* Run the executables found in SYSTEM_SHUTDOWN_PATH, passing the verb as argument.
              * arguments[0] is left NULL here; presumably execute_directories() fills it in with the
              * path of each executable -- TODO confirm against exec-util. */
 519         arguments[0] = NULL;
 520         arguments[1] = arg_verb;
 521         arguments[2] = NULL;
 522         (void) execute_directories(dirs, DEFAULT_TIMEOUT_USEC, NULL, NULL, arguments, NULL, EXEC_DIR_PARALLEL | EXEC_DIR_IGNORE_ERRORS);
 523
 524         (void) rlimit_nofile_safe();
 525
 526         if (can_initrd) {
 527                 r = switch_root_initramfs();
 528                 if (r >= 0) {
                             /* Re-use our own argv[] (verb and options included) for the initrd's
                              * shutdown binary, only replacing argv[0]. */
 529                         argv[0] = (char*) "/shutdown";
 530
 531                         (void) setsid();
 532                         (void) make_console_stdio();
 533
 534                         log_info("Successfully changed into root pivot.\n"
 535                                  "Returning to initrd...");
 536
 537                         execv("/shutdown", argv);
                             /* Only reached if execv() failed; we then carry on with our own
                              * reboot() logic below. */
 538                         log_error_errno(errno, "Failed to execute shutdown binary: %m");
 539                 } else
 540                         log_error_errno(r, "Failed to switch root to \"/run/initramfs\": %m");
 541         }
 542
 543         if (need_umount || need_swapoff || need_loop_detach || need_dm_detach || need_md_detach)
 544                 log_error("Failed to finalize%s%s%s%s%s ignoring.",
 545                           need_umount ? " file systems," : "",
 546                           need_swapoff ? " swap devices," : "",
 547                           need_loop_detach ? " loop devices," : "",
 548                           need_dm_detach ? " DM devices," : "",
 549                           need_md_detach ? " MD devices," : "");
 550
 551         /* The kernel will automatically flush ATA disks and suchlike on reboot(), but the file systems need to be
 552          * sync'ed explicitly in advance. So let's do this here, but not needlessly slow down containers. Note that we
 553          * sync'ed things already once above, but we did some more work since then which might have caused IO, hence
 554          * let's do it once more. Do not remove this sync, data corruption will result. */
 555         if (!in_container)
 556                 sync_with_progress();
 557
 558         if (streq(arg_verb, "exit")) {
 559                 if (in_container)
 560                         return arg_exit_code;
 561
 562                 cmd = RB_POWER_OFF; /* We cannot exit() on the host, fallback on another method. */
 563         }
 564
 565         switch (cmd) {
 566
 567         case LINUX_REBOOT_CMD_KEXEC:
 568
 569                 if (!in_container) {
 570                         /* We cheat and exec kexec to avoid doing all its work */
 571                         log_info("Rebooting with kexec.");
 572
 573                         r = safe_fork("(sd-kexec)", FORK_RESET_SIGNALS|FORK_CLOSE_ALL_FDS|FORK_LOG|FORK_WAIT, NULL);
 574                         if (r == 0) {
 575                                 const char * const args[] = {
 576                                         KEXEC, "-e", NULL
 577                                 };
 578
 579                                 /* Child */
 580
 581                                 execv(args[0], (char * const *) args);
 582
 583                                 /* execv failed (kexec binary missing?), so try simply reboot(RB_KEXEC) */
 584                                 (void) reboot(cmd);
 585                                 _exit(EXIT_FAILURE);
 586                         }
 587
 588                         /* If we are still running, then the kexec can't have worked, let's fall through */
 589                 }
 590
                     /* Either we are in a container or kexec did not work: do a regular reboot instead. */
 591                 cmd = RB_AUTOBOOT;
 592                 _fallthrough_;
 593
 594         case RB_AUTOBOOT:
 595                 (void) reboot_with_parameter(REBOOT_LOG);
 596                 log_info("Rebooting.");
 597                 break;
 598
 599         case RB_POWER_OFF:
 600                 log_info("Powering off.");
 601                 break;
 602
 603         case RB_HALT_SYSTEM:
 604                 log_info("Halting system.");
 605                 break;
 606
 607         default:
 608                 assert_not_reached("Unknown magic");
 609         }
 610
             /* NOTE(review): errno is inspected below without looking at reboot()'s return value, i.e.
              * this relies on reboot() having set errno whenever it returns here. */
 611         (void) reboot(cmd);
 612         if (errno == EPERM && in_container) {
 613                 /* If we are in a container, and we lacked
 614                  * CAP_SYS_BOOT just exit, this will kill our
 615                  * container for good. */
 616                 log_info("Exiting container.");
 617                 return EXIT_SUCCESS;
 618         }
 619
 620         r = log_error_errno(errno, "Failed to invoke reboot(): %m");
 621
             /* Common failure exit: reached by goto with r set to a negative errno, or by falling
              * through from a failed reboot() above. Logs at emergency level and calls freeze();
              * there is no return statement after it. */
 622   error:
 623         log_emergency_errno(r, "Critical error while doing system shutdown: %m");
 624         freeze();
 625 }