src/core/main.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <fcntl.h>
   4 #include <getopt.h>
   5 #include <linux/oom.h>
   6 #include <stdlib.h>
   7 #include <sys/mount.h>
   8 #include <sys/prctl.h>
   9 #include <sys/utsname.h>
  10 #include <unistd.h>
  11
  12 #if HAVE_VALGRIND_VALGRIND_H
  13 #  include <valgrind/valgrind.h>
  14 #endif
  15
  16 #include "sd-bus.h"
  17 #include "sd-daemon.h"
  18 #include "sd-messages.h"
  19
  20 #include "alloc-util.h"
  21 #include "apparmor-setup.h"
  22 #include "architecture.h"
  23 #include "argv-util.h"
  24 #include "build.h"
  25 #include "bus-error.h"
  26 #include "capability-util.h"
  27 #include "cgroup-setup.h"
  28 #include "chase.h"
  29 #include "clock-util.h"
  30 #include "clock-warp.h"
  31 #include "conf-parser.h"
  32 #include "confidential-virt.h"
  33 #include "constants.h"
  34 #include "copy.h"
  35 #include "coredump-util.h"
  36 #include "cpu-set-util.h"
  37 #include "crash-handler.h"
  38 #include "dbus.h"
  39 #include "dbus-manager.h"
  40 #include "dev-setup.h"
  41 #include "efi-random.h"
  42 #include "emergency-action.h"
  43 #include "env-util.h"
  44 #include "escape.h"
  45 #include "fd-util.h"
  46 #include "fdset.h"
  47 #include "fileio.h"
  48 #include "format-util.h"
  49 #include "getopt-defs.h"
  50 #include "hexdecoct.h"
  51 #include "hostname-setup.h"
  52 #include "id128-util.h"
  53 #include "ima-setup.h"
  54 #include "import-creds.h"
  55 #include "initrd-util.h"
  56 #include "io-util.h"
  57 #include "ipe-setup.h"
  58 #include "killall.h"
  59 #include "kmod-setup.h"
  60 #include "label-util.h"
  61 #include "limits-util.h"
  62 #include "load-fragment.h"
  63 #include "log.h"
  64 #include "loopback-setup.h"
  65 #include "machine-id-setup.h"
  66 #include "main.h"
  67 #include "manager.h"
  68 #include "manager-dump.h"
  69 #include "manager-serialize.h"
  70 #include "mkdir-label.h"
  71 #include "mount-setup.h"
  72 #include "mount-util.h"
  73 #include "os-util.h"
  74 #include "osc-context.h"
  75 #include "pager.h"
  76 #include "parse-argument.h"
  77 #include "parse-util.h"
  78 #include "path-util.h"
  79 #include "pretty-print.h"
  80 #include "proc-cmdline.h"
  81 #include "process-util.h"
  82 #include "random-util.h"
  83 #include "rlimit-util.h"
  84 #include "rm-rf.h"
  85 #include "seccomp-util.h"
  86 #include "selinux-setup.h"
  87 #include "selinux-util.h"
  88 #include "serialize.h"
  89 #include "set.h"
  90 #include "signal-util.h"
  91 #include "smack-setup.h"
  92 #include "special.h"
  93 #include "stat-util.h"
  94 #include "stdio-util.h"
  95 #include "strv.h"
  96 #include "switch-root.h"
  97 #include "sysctl-util.h"
  98 #include "terminal-util.h"
  99 #include "time-util.h"
 100 #include "umask-util.h"
 101 #include "unit-name.h"
 102 #include "user-util.h"
 103 #include "version.h"
 104 #include "virt.h"
 105 #include "watchdog.h"
 106
 107 #if HAS_FEATURE_ADDRESS_SANITIZER
 108 #include <sanitizer/lsan_interface.h>
 109 #endif
 110
/* The operation this invocation performs. Starts out as ACTION_RUN; the code that switches it to one
 * of the other values is not part of this excerpt (presumably the command line parser — TODO
 * confirm). */
static enum {
        ACTION_RUN,
        ACTION_HELP,
        ACTION_VERSION,
        ACTION_TEST,
        ACTION_DUMP_CONFIGURATION_ITEMS,
        ACTION_DUMP_BUS_PROPERTIES,
        ACTION_BUS_INTROSPECT,
} arg_action = ACTION_RUN;

/* NOTE(review): judging by the name this is the argument accompanying ACTION_BUS_INTROSPECT; the
 * code that sets and consumes it is outside this excerpt — confirm there. */
static const char *arg_bus_introspect = NULL;
 122
/* These variables have static storage duration and are hence zero-initialized automatically, so we
 * avoid uninitialized memory access. The real defaults are assigned in reset_arguments() below.
 *
 * Note that arg_dump_core, arg_crash_chvt, arg_crash_shell and arg_crash_action are deliberately not
 * 'static', i.e. they have external linkage (presumably they are declared in main.h for use by other
 * compilation units such as the crash handler — TODO confirm). */
static char *arg_default_unit;
static RuntimeScope arg_runtime_scope;
bool arg_dump_core;
int arg_crash_chvt;
bool arg_crash_shell;
CrashAction arg_crash_action;
static char *arg_confirm_spawn;
static ShowStatus arg_show_status;
static StatusUnitFormat arg_status_unit_format;
static bool arg_switched_root;
static PagerFlags arg_pager_flags;
static bool arg_service_watchdogs;
static UnitDefaults arg_defaults;
static usec_t arg_runtime_watchdog;
static usec_t arg_reboot_watchdog;
static usec_t arg_kexec_watchdog;
static usec_t arg_pretimeout_watchdog;
static char *arg_early_core_pattern;
static char *arg_watchdog_pretimeout_governor;
static char *arg_watchdog_device;
static char **arg_default_environment;
static char **arg_manager_environment;
static uint64_t arg_capability_bounding_set;
static bool arg_no_new_privs;
static int arg_protect_system;
static nsec_t arg_timer_slack_nsec;
static Set* arg_syscall_archs;
static FILE* arg_serialization;
static sd_id128_t arg_machine_id;
/* The only variable in this list with an explicit initializer; 'false' is the same value it would
 * get from zero-initialization anyway. */
static bool arg_machine_id_from_firmware = false;
static EmergencyAction arg_cad_burst_action;
static CPUSet arg_cpu_affinity;
static NUMAPolicy arg_numa_policy;
static usec_t arg_clock_usec;
/* Seed data and its size in bytes; both are set together by the systemd.random_seed= kernel command
 * line handler. */
static void *arg_random_seed;
static size_t arg_random_seed_size;
static usec_t arg_reload_limit_interval_sec;
static unsigned arg_reload_limit_burst;

/* A copy of the original environment block */
static char **saved_env = NULL;
 166
/* Forward declaration; the definition is not part of this excerpt. */
static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
                               const struct rlimit *saved_rlimit_memlock);

/* Instantiates config_parse_crash_action(), the parser used for the CrashAction= entry of the
 * [Manager] table in parse_config_file().
 * NOTE(review): CRASH_FREEZE is presumably the value applied when the setting is assigned the empty
 * string — confirm against the macro definition in conf-parser.h. */
static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_crash_action, crash_action, CrashAction, CRASH_FREEZE);
 171
 172 static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
 173         _cleanup_free_ char *base = NULL;
 174         _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
 175         int r;
 176
 177         r = xdg_user_config_dir("/systemd", &base);
 178         if (r < 0)
 179                 return r;
 180
 181         r = strv_extendf(&files, "%s/user.conf", base);
 182         if (r < 0)
 183                 return r;
 184
 185         r = strv_extend(&files, PKGSYSCONFDIR "/user.conf");
 186         if (r < 0)
 187                 return r;
 188
 189         r = strv_consume(&dirs, TAKE_PTR(base));
 190         if (r < 0)
 191                 return r;
 192
 193         r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false);
 194         if (r < 0)
 195                 return r;
 196
 197         *ret_files = TAKE_PTR(files);
 198         *ret_dirs = TAKE_PTR(dirs);
 199         return 0;
 200 }
 201
 202 static int save_console_winsize_in_environment(int tty_fd) {
 203         int r;
 204
 205         assert(tty_fd >= 0);
 206
 207         struct winsize ws = {};
 208         if (ioctl(tty_fd, TIOCGWINSZ, &ws) < 0) {
 209                 log_debug_errno(errno, "Failed to acquire console window size, ignoring.");
 210                 goto unset;
 211         }
 212
 213         if (ws.ws_col <= 0 && ws.ws_row <= 0) {
 214                 log_debug("No console window size set, ignoring.");
 215                 goto unset;
 216         }
 217
 218         r = setenvf("COLUMNS", /* overwrite= */ true, "%u", ws.ws_col);
 219         if (r < 0) {
 220                 log_debug_errno(r, "Failed to set $COLUMNS, ignoring: %m");
 221                 goto unset;
 222         }
 223
 224         r = setenvf("LINES", /* overwrite= */ true, "%u", ws.ws_row);
 225         if (r < 0) {
 226                 log_debug_errno(r, "Failed to set $LINES, ignoring: %m");
 227                 goto unset;
 228         }
 229
 230         log_debug("Recorded console dimensions in environment: $COLUMNS=%u $LINES=%u.", ws.ws_col, ws.ws_row);
 231         return 1;
 232
 233 unset:
 234         (void) unsetenv("COLUMNS");
 235         (void) unsetenv("LINES");
 236         return 0;
 237 }
 238
 239 static int console_setup(void) {
 240
 241         if (getpid_cached() != 1)
 242                 return 0;
 243
 244         _cleanup_close_ int tty_fd = -EBADF;
 245
 246         tty_fd = open_terminal("/dev/console", O_RDWR|O_NOCTTY|O_CLOEXEC);
 247         if (tty_fd < 0)
 248                 return log_error_errno(tty_fd, "Failed to open %s: %m", "/dev/console");
 249
 250         /* We don't want to force text mode. Plymouth may be showing pictures already from initrd. */
 251         reset_dev_console_fd(tty_fd, /* switch_to_text= */ false);
 252
 253         save_console_winsize_in_environment(tty_fd);
 254
 255         return 0;
 256 }
 257
 258 static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
 259         int r;
 260
 261         assert(key);
 262
 263         if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
 264
 265                 if (proc_cmdline_value_missing(key, value))
 266                         return 0;
 267
 268                 if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
 269                         log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
 270                 else if (in_initrd() == !!startswith(key, "rd."))
 271                         return free_and_strdup_warn(&arg_default_unit, value);
 272
 273         } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
 274
 275                 r = value ? parse_boolean(value) : true;
 276                 if (r < 0)
 277                         log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
 278                 else
 279                         arg_dump_core = r;
 280
 281         } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
 282
 283                 if (proc_cmdline_value_missing(key, value))
 284                         return 0;
 285
 286                 if (path_is_absolute(value))
 287                         (void) parse_path_argument(value, false, &arg_early_core_pattern);
 288                 else
 289                         log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
 290
 291         } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
 292
 293                 if (!value)
 294                         arg_crash_chvt = 0; /* turn on */
 295                 else {
 296                         r = parse_crash_chvt(value, &arg_crash_chvt);
 297                         if (r < 0)
 298                                 log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
 299                 }
 300
 301         } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
 302
 303                 r = value ? parse_boolean(value) : true;
 304                 if (r < 0)
 305                         log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
 306                 else
 307                         arg_crash_shell = r;
 308
 309         } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
 310
 311                 r = value ? parse_boolean(value) : true;
 312                 if (r < 0)
 313                         log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
 314                 else
 315                         arg_crash_action = r ? CRASH_REBOOT : CRASH_FREEZE;
 316
 317         } else if (proc_cmdline_key_streq(key, "systemd.crash_action")) {
 318
 319                 if (proc_cmdline_value_missing(key, value))
 320                         return 0;
 321
 322                 r = crash_action_from_string(value);
 323                 if (r < 0)
 324                         log_warning_errno(r, "Failed to parse crash action switch %s, ignoring: %m", value);
 325                 else
 326                         arg_crash_action = r;
 327
 328         } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
 329                 char *s;
 330
 331                 r = parse_confirm_spawn(value, &s);
 332                 if (r < 0)
 333                         log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
 334                 else
 335                         free_and_replace(arg_confirm_spawn, s);
 336
 337         } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
 338
 339                 r = value ? parse_boolean(value) : true;
 340                 if (r < 0)
 341                         log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
 342                 else
 343                         arg_service_watchdogs = r;
 344
 345         } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
 346
 347                 if (value) {
 348                         r = parse_show_status(value, &arg_show_status);
 349                         if (r < 0)
 350                                 log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
 351                 } else
 352                         arg_show_status = SHOW_STATUS_YES;
 353
 354         } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
 355
 356                 if (proc_cmdline_value_missing(key, value))
 357                         return 0;
 358
 359                 r = status_unit_format_from_string(value);
 360                 if (r < 0)
 361                         log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
 362                 else
 363                         arg_status_unit_format = r;
 364
 365         } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
 366
 367                 if (proc_cmdline_value_missing(key, value))
 368                         return 0;
 369
 370                 r = exec_output_from_string(value);
 371                 if (r < 0)
 372                         log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
 373                 else
 374                         arg_defaults.std_output = r;
 375
 376         } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
 377
 378                 if (proc_cmdline_value_missing(key, value))
 379                         return 0;
 380
 381                 r = exec_output_from_string(value);
 382                 if (r < 0)
 383                         log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
 384                 else
 385                         arg_defaults.std_error = r;
 386
 387         } else if (streq(key, "systemd.setenv")) {
 388
 389                 if (proc_cmdline_value_missing(key, value))
 390                         return 0;
 391
 392                 if (!env_assignment_is_valid(value))
 393                         log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
 394                 else {
 395                         r = strv_env_replace_strdup(&arg_default_environment, value);
 396                         if (r < 0)
 397                                 return log_oom();
 398                 }
 399
 400         } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
 401
 402                 if (proc_cmdline_value_missing(key, value))
 403                         return 0;
 404
 405                 if (streq(value, "firmware"))
 406                         arg_machine_id_from_firmware = true;
 407                 else {
 408                         r = id128_from_string_nonzero(value, &arg_machine_id);
 409                         if (r < 0)
 410                                 log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
 411                         else
 412                                 arg_machine_id_from_firmware = false;
 413                 }
 414         } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
 415
 416                 if (proc_cmdline_value_missing(key, value))
 417                         return 0;
 418
 419                 r = parse_sec(value, &arg_defaults.timeout_start_usec);
 420                 if (r < 0)
 421                         log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
 422
 423                 if (arg_defaults.timeout_start_usec <= 0)
 424                         arg_defaults.timeout_start_usec = USEC_INFINITY;
 425
 426         } else if (proc_cmdline_key_streq(key, "systemd.default_device_timeout_sec")) {
 427
 428                 if (proc_cmdline_value_missing(key, value))
 429                         return 0;
 430
 431                 r = parse_sec(value, &arg_defaults.device_timeout_usec);
 432                 if (r < 0)
 433                         log_warning_errno(r, "Failed to parse default device timeout '%s', ignoring: %m", value);
 434
 435                 if (arg_defaults.device_timeout_usec <= 0)
 436                         arg_defaults.device_timeout_usec = USEC_INFINITY;
 437
 438         } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
 439
 440                 if (proc_cmdline_value_missing(key, value))
 441                         return 0;
 442
 443                 r = parse_cpu_set(value, &arg_cpu_affinity);
 444                 if (r < 0)
 445                         log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
 446
 447         } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
 448
 449                 if (proc_cmdline_value_missing(key, value))
 450                         return 0;
 451
 452                 (void) parse_path_argument(value, false, &arg_watchdog_device);
 453
 454         } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) {
 455
 456                 if (proc_cmdline_value_missing(key, value))
 457                         return 0;
 458
 459                 if (streq(value, "default"))
 460                         arg_runtime_watchdog = USEC_INFINITY;
 461                 else if (streq(value, "off"))
 462                         arg_runtime_watchdog = 0;
 463                 else {
 464                         r = parse_sec(value, &arg_runtime_watchdog);
 465                         if (r < 0) {
 466                                 log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value);
 467                                 return 0;
 468                         }
 469                 }
 470
 471                 arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog;
 472
 473         } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) {
 474
 475                 if (proc_cmdline_value_missing(key, value))
 476                         return 0;
 477
 478                 if (streq(value, "default"))
 479                         arg_pretimeout_watchdog = USEC_INFINITY;
 480                 else if (streq(value, "off"))
 481                         arg_pretimeout_watchdog = 0;
 482                 else {
 483                         r = parse_sec(value, &arg_pretimeout_watchdog);
 484                         if (r < 0) {
 485                                 log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value);
 486                                 return 0;
 487                         }
 488                 }
 489
 490         } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) {
 491
 492                 if (proc_cmdline_value_missing(key, value) || isempty(value)) {
 493                         arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
 494                         return 0;
 495                 }
 496
 497                 if (!string_is_safe(value)) {
 498                         log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value);
 499                         return 0;
 500                 }
 501
 502                 return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value);
 503
 504         } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
 505
 506                 if (proc_cmdline_value_missing(key, value))
 507                         return 0;
 508
 509                 r = safe_atou64(value, &arg_clock_usec);
 510                 if (r < 0)
 511                         log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
 512
 513         } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
 514                 void *p;
 515                 size_t sz;
 516
 517                 if (proc_cmdline_value_missing(key, value))
 518                         return 0;
 519
 520                 r = unbase64mem(value, &p, &sz);
 521                 if (r < 0)
 522                         log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
 523
 524                 free(arg_random_seed);
 525                 arg_random_seed = sz > 0 ? p : mfree(p);
 526                 arg_random_seed_size = sz;
 527
 528         } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_interval_sec")) {
 529
 530                 if (proc_cmdline_value_missing(key, value))
 531                         return 0;
 532
 533                 r = parse_sec(value, &arg_reload_limit_interval_sec);
 534                 if (r < 0) {
 535                         log_warning_errno(r, "Failed to parse systemd.reload_limit_interval_sec= argument '%s', ignoring: %m", value);
 536                         return 0;
 537                 }
 538
 539         } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_burst")) {
 540
 541                 if (proc_cmdline_value_missing(key, value))
 542                         return 0;
 543
 544                 r = safe_atou(value, &arg_reload_limit_burst);
 545                 if (r < 0) {
 546                         log_warning_errno(r, "Failed to parse systemd.reload_limit_burst= argument '%s', ignoring: %m", value);
 547                         return 0;
 548                 }
 549
 550         } else if (streq(key, "quiet") && !value) {
 551
 552                 if (arg_show_status == _SHOW_STATUS_INVALID)
 553                         arg_show_status = SHOW_STATUS_ERROR;
 554
 555         } else if (streq(key, "debug") && !value) {
 556
 557                 /* Note that log_parse_environment() handles 'debug'
 558                  * too, and sets the log level to LOG_DEBUG. */
 559
 560                 if (detect_container() > 0)
 561                         log_set_target(LOG_TARGET_CONSOLE);
 562
 563         } else if (!value) {
 564                 const char *target;
 565
 566                 /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
 567                 target = runlevel_to_target(key);
 568                 if (target)
 569                         return free_and_strdup_warn(&arg_default_unit, target);
 570         }
 571
 572         return 0;
 573 }
 574
 575 #define DEFINE_SETTER(name, func, descr)                              \
 576         static int name(const char *unit,                             \
 577                         const char *filename,                         \
 578                         unsigned line,                                \
 579                         const char *section,                          \
 580                         unsigned section_line,                        \
 581                         const char *lvalue,                           \
 582                         int ltype,                                    \
 583                         const char *rvalue,                           \
 584                         void *data,                                   \
 585                         void *userdata) {                             \
 586                                                                       \
 587                 int r;                                                \
 588                                                                       \
 589                 assert(filename);                                     \
 590                 assert(lvalue);                                       \
 591                 assert(rvalue);                                       \
 592                                                                       \
 593                 r = func(rvalue);                                     \
 594                 if (r < 0)                                            \
 595                         log_syntax(unit, LOG_ERR, filename, line, r,  \
 596                                    "Invalid " descr "'%s': %m",       \
 597                                    rvalue);                           \
 598                                                                       \
 599                 return 0;                                             \
 600         }
 601
 602 DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
 603 DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
 604 DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
 605 DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
 606 DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
 607
 608 static int config_parse_default_timeout_abort(
 609                 const char *unit,
 610                 const char *filename,
 611                 unsigned line,
 612                 const char *section,
 613                 unsigned section_line,
 614                 const char *lvalue,
 615                 int ltype,
 616                 const char *rvalue,
 617                 void *data,
 618                 void *userdata) {
 619         int r;
 620
 621         r = config_parse_timeout_abort(
 622                         unit,
 623                         filename,
 624                         line,
 625                         section,
 626                         section_line,
 627                         lvalue,
 628                         ltype,
 629                         rvalue,
 630                         &arg_defaults.timeout_abort_usec,
 631                         userdata);
 632         if (r >= 0)
 633                 arg_defaults.timeout_abort_set = r;
 634         return 0;
 635 }
 636
 637 static int config_parse_oom_score_adjust(
 638                 const char *unit,
 639                 const char *filename,
 640                 unsigned line,
 641                 const char *section,
 642                 unsigned section_line,
 643                 const char *lvalue,
 644                 int ltype,
 645                 const char *rvalue,
 646                 void *data,
 647                 void *userdata) {
 648
 649         int oa, r;
 650
 651         if (isempty(rvalue)) {
 652                 arg_defaults.oom_score_adjust_set = false;
 653                 return 0;
 654         }
 655
 656         r = parse_oom_score_adjust(rvalue, &oa);
 657         if (r < 0)
 658                 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
 659
 660         arg_defaults.oom_score_adjust = oa;
 661         arg_defaults.oom_score_adjust_set = true;
 662
 663         return 0;
 664 }
 665
 666 static int config_parse_protect_system_pid1(
 667                 const char *unit,
 668                 const char *filename,
 669                 unsigned line,
 670                 const char *section,
 671                 unsigned section_line,
 672                 const char *lvalue,
 673                 int ltype,
 674                 const char *rvalue,
 675                 void *data,
 676                 void *userdata) {
 677
 678         int *v = ASSERT_PTR(data), r;
 679
 680         /* This is modelled after the per-service ProtectSystem= setting, but a bit more restricted on one
 681          * hand, and more automatic in another. i.e. we currently only support yes/no (not "strict" or
 682          * "full"). And we will enable this automatically for the initrd unless configured otherwise.
 683          *
 684          * We might extend this later to match more closely what the per-service ProtectSystem= can do, but
 685          * this is not trivial, due to ordering constraints: besides /usr/ we don't really have much mounted
 686          * at the moment we enable this logic. */
 687
 688         if (isempty(rvalue) || streq(rvalue, "auto")) {
 689                 *v = -1;
 690                 return 0;
 691         }
 692
 693         r = parse_boolean(rvalue);
 694         if (r < 0)
 695                 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
 696
 697         *v = r;
 698         return 0;
 699 }
 700
 701 static int config_parse_crash_reboot(
 702                 const char *unit,
 703                 const char *filename,
 704                 unsigned line,
 705                 const char *section,
 706                 unsigned section_line,
 707                 const char *lvalue,
 708                 int ltype,
 709                 const char *rvalue,
 710                 void *data,
 711                 void *userdata) {
 712
 713         CrashAction *v = ASSERT_PTR(data);
 714         int r;
 715
 716         if (isempty(rvalue)) {
 717                 *v = CRASH_REBOOT;
 718                 return 0;
 719         }
 720
 721         r = parse_boolean(rvalue);
 722         if (r < 0)
 723                 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
 724
 725         *v = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
 726         return 0;
 727 }
 728
 729 static int parse_config_file(void) {
 730         const ConfigTableItem items[] = {
 731                 { "Manager", "LogLevel",                     config_parse_level2,                0,                        NULL                              },
 732                 { "Manager", "LogTarget",                    config_parse_target,                0,                        NULL                              },
 733                 { "Manager", "LogColor",                     config_parse_color,                 0,                        NULL                              },
 734                 { "Manager", "LogLocation",                  config_parse_location,              0,                        NULL                              },
 735                 { "Manager", "LogTime",                      config_parse_time,                  0,                        NULL                              },
 736                 { "Manager", "DumpCore",                     config_parse_bool,                  0,                        &arg_dump_core                    },
 737                 { "Manager", "CrashChVT", /* legacy */       config_parse_crash_chvt,            0,                        &arg_crash_chvt                   },
 738                 { "Manager", "CrashChangeVT",                config_parse_crash_chvt,            0,                        &arg_crash_chvt                   },
 739                 { "Manager", "CrashShell",                   config_parse_bool,                  0,                        &arg_crash_shell                  },
 740                 { "Manager", "CrashReboot",                  config_parse_crash_reboot,          0,                        &arg_crash_action                 },
 741                 { "Manager", "CrashAction",                  config_parse_crash_action,          0,                        &arg_crash_action                 },
 742                 { "Manager", "ShowStatus",                   config_parse_show_status,           0,                        &arg_show_status                  },
 743                 { "Manager", "StatusUnitFormat",             config_parse_status_unit_format,    0,                        &arg_status_unit_format           },
 744                 { "Manager", "CPUAffinity",                  config_parse_cpu_set,               0,                        &arg_cpu_affinity                 },
 745                 { "Manager", "NUMAPolicy",                   config_parse_numa_policy,           0,                        &arg_numa_policy.type             },
 746                 { "Manager", "NUMAMask",                     config_parse_numa_mask,             0,                        &arg_numa_policy.nodes            },
 747                 { "Manager", "JoinControllers",              config_parse_warn_compat,           DISABLED_LEGACY,          NULL                              },
 748                 { "Manager", "RuntimeWatchdogSec",           config_parse_watchdog_sec,          0,                        &arg_runtime_watchdog             },
 749                 { "Manager", "RuntimeWatchdogPreSec",        config_parse_watchdog_sec,          0,                        &arg_pretimeout_watchdog          },
 750                 { "Manager", "RebootWatchdogSec",            config_parse_watchdog_sec,          0,                        &arg_reboot_watchdog              },
 751                 { "Manager", "ShutdownWatchdogSec",          config_parse_watchdog_sec,          0,                        &arg_reboot_watchdog              }, /* obsolete alias */
 752                 { "Manager", "KExecWatchdogSec",             config_parse_watchdog_sec,          0,                        &arg_kexec_watchdog               },
 753                 { "Manager", "WatchdogDevice",               config_parse_path,                  0,                        &arg_watchdog_device              },
 754                 { "Manager", "RuntimeWatchdogPreGovernor",   config_parse_string,                CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
 755                 { "Manager", "CapabilityBoundingSet",        config_parse_capability_set,        0,                        &arg_capability_bounding_set      },
 756                 { "Manager", "NoNewPrivileges",              config_parse_bool,                  0,                        &arg_no_new_privs                 },
 757                 { "Manager", "ProtectSystem",                config_parse_protect_system_pid1,   0,                        &arg_protect_system               },
 758 #if HAVE_SECCOMP
 759                 { "Manager", "SystemCallArchitectures",      config_parse_syscall_archs,         0,                        &arg_syscall_archs                },
 760 #else
 761                 { "Manager", "SystemCallArchitectures",      config_parse_warn_compat,           DISABLED_CONFIGURATION,   NULL                              },
 762
 763 #endif
 764                 { "Manager", "TimerSlackNSec",               config_parse_nsec,                  0,                        &arg_timer_slack_nsec             },
 765                 { "Manager", "DefaultTimerAccuracySec",      config_parse_sec,                   0,                        &arg_defaults.timer_accuracy_usec },
 766                 { "Manager", "DefaultStandardOutput",        config_parse_output_restricted,     0,                        &arg_defaults.std_output          },
 767                 { "Manager", "DefaultStandardError",         config_parse_output_restricted,     0,                        &arg_defaults.std_error           },
 768                 { "Manager", "DefaultTimeoutStartSec",       config_parse_sec,                   0,                        &arg_defaults.timeout_start_usec  },
 769                 { "Manager", "DefaultTimeoutStopSec",        config_parse_sec,                   0,                        &arg_defaults.timeout_stop_usec   },
 770                 { "Manager", "DefaultTimeoutAbortSec",       config_parse_default_timeout_abort, 0,                        NULL                              },
 771                 { "Manager", "DefaultDeviceTimeoutSec",      config_parse_sec,                   0,                        &arg_defaults.device_timeout_usec },
 772                 { "Manager", "DefaultRestartSec",            config_parse_sec,                   0,                        &arg_defaults.restart_usec        },
 773                 { "Manager", "DefaultStartLimitInterval",    config_parse_sec,                   0,                        &arg_defaults.start_limit.interval}, /* obsolete alias */
 774                 { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec,                   0,                        &arg_defaults.start_limit.interval},
 775                 { "Manager", "DefaultStartLimitBurst",       config_parse_unsigned,              0,                        &arg_defaults.start_limit.burst   },
 776                 { "Manager", "DefaultEnvironment",           config_parse_environ,               arg_runtime_scope,        &arg_default_environment          },
 777                 { "Manager", "ManagerEnvironment",           config_parse_environ,               arg_runtime_scope,        &arg_manager_environment          },
 778                 { "Manager", "DefaultLimitCPU",              config_parse_rlimit,                RLIMIT_CPU,               arg_defaults.rlimit               },
 779                 { "Manager", "DefaultLimitFSIZE",            config_parse_rlimit,                RLIMIT_FSIZE,             arg_defaults.rlimit               },
 780                 { "Manager", "DefaultLimitDATA",             config_parse_rlimit,                RLIMIT_DATA,              arg_defaults.rlimit               },
 781                 { "Manager", "DefaultLimitSTACK",            config_parse_rlimit,                RLIMIT_STACK,             arg_defaults.rlimit               },
 782                 { "Manager", "DefaultLimitCORE",             config_parse_rlimit,                RLIMIT_CORE,              arg_defaults.rlimit               },
 783                 { "Manager", "DefaultLimitRSS",              config_parse_rlimit,                RLIMIT_RSS,               arg_defaults.rlimit               },
 784                 { "Manager", "DefaultLimitNOFILE",           config_parse_rlimit,                RLIMIT_NOFILE,            arg_defaults.rlimit               },
 785                 { "Manager", "DefaultLimitAS",               config_parse_rlimit,                RLIMIT_AS,                arg_defaults.rlimit               },
 786                 { "Manager", "DefaultLimitNPROC",            config_parse_rlimit,                RLIMIT_NPROC,             arg_defaults.rlimit               },
 787                 { "Manager", "DefaultLimitMEMLOCK",          config_parse_rlimit,                RLIMIT_MEMLOCK,           arg_defaults.rlimit               },
 788                 { "Manager", "DefaultLimitLOCKS",            config_parse_rlimit,                RLIMIT_LOCKS,             arg_defaults.rlimit               },
 789                 { "Manager", "DefaultLimitSIGPENDING",       config_parse_rlimit,                RLIMIT_SIGPENDING,        arg_defaults.rlimit               },
 790                 { "Manager", "DefaultLimitMSGQUEUE",         config_parse_rlimit,                RLIMIT_MSGQUEUE,          arg_defaults.rlimit               },
 791                 { "Manager", "DefaultLimitNICE",             config_parse_rlimit,                RLIMIT_NICE,              arg_defaults.rlimit               },
 792                 { "Manager", "DefaultLimitRTPRIO",           config_parse_rlimit,                RLIMIT_RTPRIO,            arg_defaults.rlimit               },
 793                 { "Manager", "DefaultLimitRTTIME",           config_parse_rlimit,                RLIMIT_RTTIME,            arg_defaults.rlimit               },
 794                 { "Manager", "DefaultCPUAccounting",         config_parse_warn_compat,           DISABLED_LEGACY,          NULL                              },
 795                 { "Manager", "DefaultIOAccounting",          config_parse_bool,                  0,                        &arg_defaults.io_accounting       },
 796                 { "Manager", "DefaultIPAccounting",          config_parse_bool,                  0,                        &arg_defaults.ip_accounting       },
 797                 { "Manager", "DefaultBlockIOAccounting",     config_parse_warn_compat,           DISABLED_LEGACY,          NULL                              },
 798                 { "Manager", "DefaultMemoryAccounting",      config_parse_bool,                  0,                        &arg_defaults.memory_accounting   },
 799                 { "Manager", "DefaultTasksAccounting",       config_parse_bool,                  0,                        &arg_defaults.tasks_accounting    },
 800                 { "Manager", "DefaultTasksMax",              config_parse_tasks_max,             0,                        &arg_defaults.tasks_max           },
 801                 { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec,              0,                        &arg_defaults.memory_pressure_threshold_usec },
 802                 { "Manager", "DefaultMemoryPressureWatch",   config_parse_memory_pressure_watch, 0,                        &arg_defaults.memory_pressure_watch },
 803                 { "Manager", "CtrlAltDelBurstAction",        config_parse_emergency_action,      arg_runtime_scope,        &arg_cad_burst_action             },
 804                 { "Manager", "DefaultOOMPolicy",             config_parse_oom_policy,            0,                        &arg_defaults.oom_policy          },
 805                 { "Manager", "DefaultOOMScoreAdjust",        config_parse_oom_score_adjust,      0,                        NULL                              },
 806                 { "Manager", "ReloadLimitIntervalSec",       config_parse_sec,                   0,                        &arg_reload_limit_interval_sec    },
 807                 { "Manager", "ReloadLimitBurst",             config_parse_unsigned,              0,                        &arg_reload_limit_burst           },
 808 #if ENABLE_SMACK
 809                 { "Manager", "DefaultSmackProcessLabel",     config_parse_string,                0,                        &arg_defaults.smack_process_label },
 810 #else
 811                 { "Manager", "DefaultSmackProcessLabel",     config_parse_warn_compat,           DISABLED_CONFIGURATION,   NULL                              },
 812 #endif
 813                 {}
 814         };
 815
 816         if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
 817                 (void) config_parse_standard_file_with_dropins(
 818                                 "systemd/system.conf",
 819                                 "Manager\0",
 820                                 config_item_table_lookup, items,
 821                                 CONFIG_PARSE_WARN,
 822                                 /* userdata= */ NULL);
 823         else {
 824                 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
 825                 int r;
 826
 827                 assert(arg_runtime_scope == RUNTIME_SCOPE_USER);
 828
 829                 r = manager_find_user_config_paths(&files, &dirs);
 830                 if (r < 0)
 831                         return log_error_errno(r, "Failed to determine config file paths: %m");
 832
 833                 (void) config_parse_many(
 834                                 (const char* const*) files,
 835                                 (const char* const*) dirs,
 836                                 "user.conf.d",
 837                                 /* root = */ NULL,
 838                                 "Manager\0",
 839                                 config_item_table_lookup, items,
 840                                 CONFIG_PARSE_WARN,
 841                                 NULL, NULL, NULL);
 842         }
 843
 844         /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
 845          * USEC_INFINITY like everywhere else. */
 846         if (arg_defaults.timeout_start_usec <= 0)
 847                 arg_defaults.timeout_start_usec = USEC_INFINITY;
 848         if (arg_defaults.timeout_stop_usec <= 0)
 849                 arg_defaults.timeout_stop_usec = USEC_INFINITY;
 850
 851         return 0;
 852 }
 853
 854 static void set_manager_defaults(Manager *m) {
 855         int r;
 856
 857         assert(m);
 858
 859         /* Propagates the various default unit property settings into the manager object, i.e. properties
 860          * that do not affect the manager itself, but are just what newly allocated units will have set if
 861          * they haven't set anything else. (Also see set_manager_settings() for the settings that affect the
 862          * manager's own behaviour) */
 863
 864         r = manager_set_unit_defaults(m, &arg_defaults);
 865         if (r < 0)
 866                 log_warning_errno(r, "Failed to set manager defaults, ignoring: %m");
 867
 868         r = manager_default_environment(m);
 869         if (r < 0)
 870                 log_warning_errno(r, "Failed to set manager default environment, ignoring: %m");
 871
 872         r = manager_transient_environment_add(m, arg_default_environment);
 873         if (r < 0)
 874                 log_warning_errno(r, "Failed to add to transient environment, ignoring: %m");
 875 }
 876
 877 static void set_manager_settings(Manager *m) {
 878         int r;
 879
 880         assert(m);
 881
 882         /* Propagates the various manager settings into the manager object, i.e. properties that
 883          * affect the manager itself (as opposed to just being inherited into newly allocated
 884          * units, see set_manager_defaults() above). */
 885
 886         m->confirm_spawn = arg_confirm_spawn;
 887         m->service_watchdogs = arg_service_watchdogs;
 888         m->cad_burst_action = arg_cad_burst_action;
 889         /* Note that we don't do structured initialization here, otherwise it will reset the rate limit
 890          * counter on every daemon-reload. */
 891         m->reload_reexec_ratelimit.interval = arg_reload_limit_interval_sec;
 892         m->reload_reexec_ratelimit.burst = arg_reload_limit_burst;
 893
 894         manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
 895         manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
 896         manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
 897         manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog);
 898         r = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor);
 899         if (r < 0)
 900                 log_warning_errno(r, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor);
 901
 902         manager_set_show_status(m, arg_show_status, "command line");
 903         m->status_unit_format = arg_status_unit_format;
 904 }
 905
 906 static int parse_argv(int argc, char *argv[]) {
 907         enum {
 908                 COMMON_GETOPT_ARGS,
 909                 SYSTEMD_GETOPT_ARGS,
 910         };
 911
 912         static const struct option options[] = {
 913                 COMMON_GETOPT_OPTIONS,
 914                 SYSTEMD_GETOPT_OPTIONS,
 915                 {}
 916         };
 917
 918         int c, r;
 919         bool user_arg_seen = false;
 920
 921         assert(argc >= 1);
 922         assert(argv);
 923
 924         if (getpid_cached() == 1)
 925                 opterr = 0;
 926
 927         while ((c = getopt_long(argc, argv, SYSTEMD_GETOPT_SHORT_OPTIONS, options, NULL)) >= 0)
 928
 929                 switch (c) {
 930
 931                 case ARG_LOG_LEVEL:
 932                         r = log_set_max_level_from_string(optarg);
 933                         if (r < 0)
 934                                 return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
 935
 936                         break;
 937
 938                 case ARG_LOG_TARGET:
 939                         r = log_set_target_from_string(optarg);
 940                         if (r < 0)
 941                                 return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
 942
 943                         break;
 944
 945                 case ARG_LOG_COLOR:
 946
 947                         if (optarg) {
 948                                 r = log_show_color_from_string(optarg);
 949                                 if (r < 0)
 950                                         return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
 951                                                                optarg);
 952                         } else
 953                                 log_show_color(true);
 954
 955                         break;
 956
 957                 case ARG_LOG_LOCATION:
 958                         if (optarg) {
 959                                 r = log_show_location_from_string(optarg);
 960                                 if (r < 0)
 961                                         return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
 962                                                                optarg);
 963                         } else
 964                                 log_show_location(true);
 965
 966                         break;
 967
 968                 case ARG_LOG_TIME:
 969
 970                         if (optarg) {
 971                                 r = log_show_time_from_string(optarg);
 972                                 if (r < 0)
 973                                         return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
 974                                                                optarg);
 975                         } else
 976                                 log_show_time(true);
 977
 978                         break;
 979
 980                 case ARG_DEFAULT_STD_OUTPUT:
 981                         r = exec_output_from_string(optarg);
 982                         if (r < 0)
 983                                 return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
 984                                                        optarg);
 985                         arg_defaults.std_output = r;
 986                         break;
 987
 988                 case ARG_DEFAULT_STD_ERROR:
 989                         r = exec_output_from_string(optarg);
 990                         if (r < 0)
 991                                 return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
 992                                                        optarg);
 993                         arg_defaults.std_error = r;
 994                         break;
 995
 996                 case ARG_UNIT:
 997                         r = free_and_strdup(&arg_default_unit, optarg);
 998                         if (r < 0)
 999                                 return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
1000
1001                         break;
1002
1003                 case ARG_SYSTEM:
1004                         arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
1005                         break;
1006
1007                 case ARG_USER:
1008                         arg_runtime_scope = RUNTIME_SCOPE_USER;
1009                         user_arg_seen = true;
1010                         break;
1011
1012                 case ARG_TEST:
1013                         arg_action = ACTION_TEST;
1014                         break;
1015
1016                 case ARG_NO_PAGER:
1017                         arg_pager_flags |= PAGER_DISABLE;
1018                         break;
1019
1020                 case ARG_VERSION:
1021                         arg_action = ACTION_VERSION;
1022                         break;
1023
1024                 case ARG_DUMP_CONFIGURATION_ITEMS:
1025                         arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
1026                         break;
1027
1028                 case ARG_DUMP_BUS_PROPERTIES:
1029                         arg_action = ACTION_DUMP_BUS_PROPERTIES;
1030                         break;
1031
1032                 case ARG_BUS_INTROSPECT:
1033                         arg_bus_introspect = optarg;
1034                         arg_action = ACTION_BUS_INTROSPECT;
1035                         break;
1036
1037                 case ARG_DUMP_CORE:
1038                         r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
1039                         if (r < 0)
1040                                 return r;
1041                         break;
1042
1043                 case ARG_CRASH_CHVT:
1044                         r = parse_crash_chvt(optarg, &arg_crash_chvt);
1045                         if (r < 0)
1046                                 return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
1047                                                        optarg);
1048                         break;
1049
1050                 case ARG_CRASH_SHELL:
1051                         r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
1052                         if (r < 0)
1053                                 return r;
1054                         break;
1055
1056                 case ARG_CRASH_REBOOT:
1057                         r = parse_boolean_argument("--crash-reboot", optarg, NULL);
1058                         if (r < 0)
1059                                 return r;
1060                         arg_crash_action = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
1061                         break;
1062
1063                 case ARG_CRASH_ACTION:
1064                         r = crash_action_from_string(optarg);
1065                         if (r < 0)
1066                                 return log_error_errno(r, "Failed to parse crash action \"%s\": %m", optarg);
1067                         arg_crash_action = r;
1068                         break;
1069
1070                 case ARG_CONFIRM_SPAWN:
1071                         arg_confirm_spawn = mfree(arg_confirm_spawn);
1072
1073                         r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
1074                         if (r < 0)
1075                                 return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
1076                                                        optarg);
1077                         break;
1078
1079                 case ARG_SERVICE_WATCHDOGS:
1080                         r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
1081                         if (r < 0)
1082                                 return r;
1083                         break;
1084
1085                 case ARG_SHOW_STATUS:
1086                         if (optarg) {
1087                                 r = parse_show_status(optarg, &arg_show_status);
1088                                 if (r < 0)
1089                                         return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
1090                                                                optarg);
1091                         } else
1092                                 arg_show_status = SHOW_STATUS_YES;
1093                         break;
1094
1095                 case ARG_DESERIALIZE: {
1096                         int fd;
1097                         FILE *f;
1098
1099                         fd = parse_fd(optarg);
1100                         if (fd < 0)
1101                                 return log_error_errno(fd, "Failed to parse serialization fd \"%s\": %m", optarg);
1102
1103                         (void) fd_cloexec(fd, true);
1104
1105                         f = fdopen(fd, "r");
1106                         if (!f)
1107                                 return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
1108
1109                         safe_fclose(arg_serialization);
1110                         arg_serialization = f;
1111
1112                         break;
1113                 }
1114
1115                 case ARG_SWITCHED_ROOT:
1116                         arg_switched_root = true;
1117                         break;
1118
1119                 case ARG_MACHINE_ID:
1120                         r = id128_from_string_nonzero(optarg, &arg_machine_id);
1121                         if (r < 0)
1122                                 return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
1123                         break;
1124
1125                 case 'h':
1126                         arg_action = ACTION_HELP;
1127                         break;
1128
1129                 case 'D':
1130                         log_set_max_level(LOG_DEBUG);
1131                         break;
1132
1133                 case 'b':
1134                 case 's':
1135                 case 'z':
1136                         /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
1137                          * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
1138                          */
1139                 case '?':
1140                         if (getpid_cached() != 1)
1141                                 return -EINVAL;
1142                         else
1143                                 return 0;
1144
1145                 default:
1146                         assert_not_reached();
1147                 }
1148
1149         if (optind < argc && getpid_cached() != 1)
1150                 /* Hmm, when we aren't run as init system let's complain about excess arguments */
1151                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
1152
1153         if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_USER && !user_arg_seen)
1154                 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1155                                        "Explicit --user argument required to run as user manager.");
1156
1157         return 0;
1158 }
1159
1160 static int help(void) {
1161         _cleanup_free_ char *link = NULL;
1162         int r;
1163
1164         r = terminal_urlify_man("systemd", "1", &link);
1165         if (r < 0)
1166                 return log_oom();
1167
1168         printf("%s [OPTIONS...]\n\n"
1169                "%sStarts and monitors system and user services.%s\n\n"
1170                "This program takes no positional arguments.\n\n"
1171                "%sOptions%s:\n"
1172                "  -h --help                      Show this help\n"
1173                "     --version                   Show version\n"
1174                "     --test                      Determine initial transaction, dump it and exit\n"
1175                "     --system                    Combined with --test: operate in system mode\n"
1176                "     --user                      Combined with --test: operate in user mode\n"
1177                "     --dump-configuration-items  Dump understood unit configuration items\n"
1178                "     --dump-bus-properties       Dump exposed bus properties\n"
1179                "     --bus-introspect=PATH       Write XML introspection data\n"
1180                "     --unit=UNIT                 Set default unit\n"
1181                "     --dump-core[=BOOL]          Dump core on crash\n"
1182                "     --crash-vt=NR               Change to specified VT on crash\n"
1183                "     --crash-action=ACTION       Specify what to do on crash\n"
1184                "     --crash-shell[=BOOL]        Run shell on crash\n"
1185                "     --confirm-spawn[=BOOL]      Ask for confirmation when spawning processes\n"
1186                "     --show-status[=BOOL]        Show status updates on the console during boot\n"
1187                "     --log-target=TARGET         Set log target (console, journal, kmsg,\n"
1188                "                                                 journal-or-kmsg, null)\n"
1189                "     --log-level=LEVEL           Set log level (debug, info, notice, warning,\n"
1190                "                                                err, crit, alert, emerg)\n"
1191                "     --log-color[=BOOL]          Highlight important log messages\n"
1192                "     --log-location[=BOOL]       Include code location in log messages\n"
1193                "     --log-time[=BOOL]           Prefix log messages with current time\n"
1194                "     --default-standard-output=  Set default standard output for services\n"
1195                "     --default-standard-error=   Set default standard error output for services\n"
1196                "     --no-pager                  Do not pipe output into a pager\n"
1197                "\nSee the %s for details.\n",
1198                program_invocation_short_name,
1199                ansi_highlight(),
1200                ansi_normal(),
1201                ansi_underline(),
1202                ansi_normal(),
1203                link);
1204
1205         return 0;
1206 }
1207
1208 static int prepare_reexecute(
1209                 Manager *m,
1210                 FILE **ret_f,
1211                 FDSet **ret_fds,
1212                 bool switching_root) {
1213
1214         _cleanup_fdset_free_ FDSet *fds = NULL;
1215         _cleanup_fclose_ FILE *f = NULL;
1216         int r;
1217
1218         assert(m);
1219         assert(ret_f);
1220         assert(ret_fds);
1221
1222         /* Make sure nothing is really destructed when we shut down */
1223         m->n_reloading++;
1224         bus_manager_send_reloading(m, true);
1225
1226         r = manager_open_serialization(m, &f);
1227         if (r < 0)
1228                 return log_error_errno(r, "Failed to create serialization file: %m");
1229
1230         fds = fdset_new();
1231         if (!fds)
1232                 return log_oom();
1233
1234         r = manager_serialize(m, f, fds, switching_root);
1235         if (r < 0)
1236                 return r;
1237
1238         r = finish_serialization_file(f);
1239         if (r < 0)
1240                 return log_error_errno(r, "Failed to finish serialization file: %m");
1241
1242         r = fd_cloexec(fileno(f), false);
1243         if (r < 0)
1244                 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
1245
1246         r = fdset_cloexec(fds, false);
1247         if (r < 0)
1248                 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
1249
1250         *ret_f = TAKE_PTR(f);
1251         *ret_fds = TAKE_PTR(fds);
1252
1253         return 0;
1254 }
1255
static void bump_file_max_and_nr_open(void) {

        /* Raise fs.file-max and fs.nr_open as far as the kernel lets us. On current kernels a large number
         * of file descriptors is no longer a performance problem and their memory is tracked by memcg, so
         * these two additional layers of limits only complicate things. With both out of the way,
         * RLIMIT_NOFILE (soft + hard) is the limit that really matters. Every failure in here is logged
         * and otherwise ignored. */

#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
        int r;
#endif

#if BUMP_PROC_SYS_FS_FILE_MAX
        /* Since kernel 5.2 the largest value accepted here is LONG_MAX, hence simply ask for that. (Older
         * kernels behaved differently, but the operation would fail silently there.) */
        r = sysctl_write("fs/file-max", LONG_MAX_STR);
        if (r < 0) {
                /* Read-only or inaccessible sysctl trees are expected in some setups, don't be loud then. */
                int level = IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING;

                log_full_errno(level, r, "Failed to bump fs.file-max, ignoring: %m");
        }
#endif

#if BUMP_PROC_SYS_FS_NR_OPEN
        int candidate = INT_MAX;

        /* The kernel enforces both a minimum and a maximum on fs.nr_open, but does not tell us what they
         * are. The maximum depends on the architecture and on kernel implementation details we do not want
         * to replicate in userspace. So we probe: start at INT_MAX and halve the candidate each time the
         * kernel rejects it with EINVAL, until a value sticks or we run out of sensible candidates. */

        for (;;) {
                int current;

                /* Round down to the next multiple of the pointer size */
                candidate &= ~(__SIZEOF_POINTER__ - 1);
                if (candidate < 1024) {
                        log_warning("Can't bump fs.nr_open, value too small.");
                        break;
                }

                current = read_nr_open();
                if (current < 0) {
                        log_error_errno(current, "Failed to read fs.nr_open: %m");
                        break;
                }
                if (current >= candidate) {
                        /* What is configured is already at least as large as what we would write. */
                        log_debug("Skipping bump, value is already larger.");
                        break;
                }

                r = sysctl_writef("fs/nr_open", "%i", candidate);
                if (r >= 0) {
                        log_debug("Successfully bumped fs.nr_open to %i", candidate);
                        break;
                }
                if (r != -EINVAL) {
                        /* Any failure other than "value rejected" will not be cured by a smaller value. */
                        log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
                        break;
                }

                log_debug("Couldn't write fs.nr_open as %i, halving it.", candidate);
                candidate /= 2;
        }
#endif
}
1322
/* Raises our own RLIMIT_NOFILE (soft and hard) to the value reported by read_nr_open(), but never below
 * the limits passed in via saved_rlimit. Returns 0 if the limit was set or nothing had to change;
 * otherwise returns whatever log_warning_errno() yields for the setrlimit_closest() error. */
static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) {
        int r;

        /* The absolute ceiling the kernel enforces.
         * NOTE(review): the value is cast to rlim_t without a sign check — presumably read_nr_open()
         * never returns a negative value on this path; confirm. */
        rlim_t kernel_limit = (rlim_t) read_nr_open();

        /* What we would like to have. Never go below what we inherited. */
        struct rlimit wanted = {
                .rlim_cur = MAX(kernel_limit, saved_rlimit->rlim_cur),
                .rlim_max = MAX(kernel_limit, saved_rlimit->rlim_max),
        };

        /* Nothing to do if the inherited limits already cover both the hard and the soft target. */
        bool hard_sufficient = saved_rlimit->rlim_max >= wanted.rlim_max;
        bool soft_sufficient = saved_rlimit->rlim_cur >= wanted.rlim_cur;

        if (hard_sufficient && soft_sufficient) {
                log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
                return 0;
        }

        /* Go all the way up to what the kernel permits, for the hard and the soft limit alike. */
        r = setrlimit_closest(RLIMIT_NOFILE, &wanted);
        if (r < 0)
                return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");

        return 0;
}
1351
1352 static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) {
1353         struct rlimit new_rlimit;
1354         uint64_t mm;
1355         int r;
1356
1357         /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK
1358          * which should normally disable such checks. We need them to implement IPAddressAllow= and
1359          * IPAddressDeny=, hence let's bump the value high enough for our user. */
1360
1361         /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
1362          * must be unsigned, hence this is a given, but let's make this clear here. */
1363         assert_cc(RLIM_INFINITY > 0);
1364
1365         mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of
1366                                            * physical RAM. We allow an eighth to be locked by us, just to
1367                                            * pick a value. */
1368
1369         new_rlimit = (struct rlimit) {
1370                 .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
1371                 .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
1372         };
1373
1374         if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
1375             saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
1376                 log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
1377                 return 0;
1378         }
1379
1380         r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
1381         if (r < 0)
1382                 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
1383
1384         return 0;
1385 }
1386
1387 static int enforce_syscall_archs(Set *archs) {
1388 #if HAVE_SECCOMP
1389         int r;
1390
1391         if (!is_seccomp_available())
1392                 return 0;
1393
1394         r = seccomp_restrict_archs(arg_syscall_archs);
1395         if (r < 0)
1396                 return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
1397 #endif
1398         return 0;
1399 }
1400
1401 static int os_release_status(void) {
1402         _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL,
1403                             *ansi_color = NULL, *support_end = NULL;
1404         int r;
1405
1406         r = parse_os_release(NULL,
1407                              "PRETTY_NAME", &pretty_name,
1408                              "NAME",        &name,
1409                              "VERSION",     &version,
1410                              "ANSI_COLOR",  &ansi_color,
1411                              "SUPPORT_END", &support_end);
1412         if (r < 0)
1413                 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1414                                       "Failed to read os-release file, ignoring: %m");
1415
1416         const char *label = os_release_pretty_name(pretty_name, name);
1417         const char *color = empty_to_null(ansi_color) ?: "1";
1418
1419         if (show_status_on(arg_show_status)) {
1420                 if (in_initrd()) {
1421                         if (log_get_show_color())
1422                                 status_printf(NULL, 0,
1423                                               ANSI_HIGHLIGHT "Booting initrd of " ANSI_NORMAL "\x1B[%sm%s" ANSI_NORMAL ANSI_HIGHLIGHT "." ANSI_NORMAL,
1424                                               color, label);
1425                         else
1426                                 status_printf(NULL, 0,
1427                                               "Booting initrd of %s...", label);
1428                 } else {
1429                         if (log_get_show_color())
1430                                 status_printf(NULL, 0,
1431                                               "\n" ANSI_HIGHLIGHT "Welcome to " ANSI_NORMAL "\x1B[%sm%s" ANSI_NORMAL ANSI_HIGHLIGHT "!" ANSI_NORMAL "\n",
1432                                               color, label);
1433                         else
1434                                 status_printf(NULL, 0,
1435                                               "\nWelcome to %s!\n",
1436                                               label);
1437                 }
1438         }
1439
1440         if (support_end && os_release_support_ended(support_end, /* quiet */ false, NULL) > 0)
1441                 /* pretty_name may include the version already, so we'll print the version only if we
1442                  * have it and we're not using pretty_name. */
1443                 status_printf(ANSI_HIGHLIGHT_RED "  !!  " ANSI_NORMAL, 0,
1444                               "This OS version (%s%s%s) is past its end-of-support date (%s)",
1445                               label,
1446                               (pretty_name || !version) ? "" : " version ",
1447                               (pretty_name || !version) ? "" : version,
1448                               support_end);
1449
1450         return 0;
1451 }
1452
1453 static int setup_os_release(RuntimeScope scope) {
1454         char os_release_dst[STRLEN("/run/user//systemd/propagate/.os-release-stage/os-release") + DECIMAL_STR_MAX(uid_t)] =
1455                 "/run/systemd/propagate/.os-release-stage/os-release";
1456         const char *os_release_src = "/etc/os-release";
1457         int r;
1458
1459         assert(IN_SET(scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER));
1460
1461         if (access("/etc/os-release", F_OK) < 0) {
1462                 if (errno != ENOENT)
1463                         log_debug_errno(errno, "Failed to check if /etc/os-release exists, ignoring: %m");
1464
1465                 os_release_src = "/usr/lib/os-release";
1466         }
1467
1468         if (scope == RUNTIME_SCOPE_USER)
1469                 xsprintf(os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid());
1470
1471         r = mkdir_parents_label(os_release_dst, 0755);
1472         if (r < 0)
1473                 return log_debug_errno(r, "Failed to create parent directory of '%s', ignoring: %m", os_release_dst);
1474
1475         r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
1476         if (r < 0)
1477                 return log_debug_errno(r, "Failed to copy '%s' to '%s', ignoring: %m",
1478                                        os_release_src, os_release_dst);
1479
1480         return 0;
1481 }
1482
1483 static int write_container_id(void) {
1484         const char *c;
1485         int r = 0;  /* avoid false maybe-uninitialized warning */
1486
1487         c = getenv("container");
1488         if (isempty(c))
1489                 return 0;
1490
1491         WITH_UMASK(0022)
1492                 r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
1493         if (r < 0)
1494                 return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
1495
1496         return 1;
1497 }
1498
1499 static int write_boot_or_shutdown_osc(const char *type) {
1500         int r;
1501
1502         assert(STRPTR_IN_SET(type, "boot", "shutdown"));
1503
1504         if (getenv_terminal_is_dumb())
1505                 return 0;
1506
1507         _cleanup_close_ int fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
1508         if (fd < 0)
1509                 return log_debug_errno(fd, "Failed to open /dev/console to print %s OSC, ignoring: %m", type);
1510
1511         _cleanup_free_ char *seq = NULL;
1512         if (streq(type, "boot"))
1513                 r = osc_context_open_boot(&seq);
1514         else
1515                 r = osc_context_close(SD_ID128_ALLF, &seq);
1516         if (r < 0)
1517                 return log_debug_errno(r, "Failed to acquire %s OSC sequence, ignoring: %m", type);
1518
1519         r = loop_write(fd, seq, SIZE_MAX);
1520         if (r < 0)
1521                 return log_debug_errno(r, "Failed to write %s OSC sequence, ignoring: %m", type);
1522
1523         if (DEBUG_LOGGING) {
1524                 _cleanup_free_ char *h = cescape(seq);
1525                 log_debug("OSC sequence for %s successfully written: %s", type, strna(h));
1526         }
1527
1528         return 0;
1529 }
1530
1531 static int bump_unix_max_dgram_qlen(void) {
1532         _cleanup_free_ char *qlen = NULL;
1533         unsigned long v;
1534         int r;
1535
1536         /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set
1537          * the value really really early during boot, so that it is actually applied to all our sockets,
1538          * including the $NOTIFY_SOCKET one. */
1539
1540         r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
1541         if (r < 0)
1542                 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1543                                       "Failed to read AF_UNIX datagram queue length, ignoring: %m");
1544
1545         r = safe_atolu(qlen, &v);
1546         if (r < 0)
1547                 return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
1548
1549         if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
1550                 return 0;
1551
1552         r = sysctl_write("net/unix/max_dgram_qlen", STRINGIFY(DEFAULT_UNIX_MAX_DGRAM_QLEN));
1553         if (r < 0)
1554                 return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1555                                       "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
1556
1557         return 1;
1558 }
1559
1560 static int fixup_environment(void) {
1561         int r;
1562
1563         /* Only fix up the environment when we are started as PID 1 */
1564         if (getpid_cached() != 1)
1565                 return 0;
1566
1567         /* We expect the environment to be set correctly if run inside a container. */
1568         if (detect_container() > 0)
1569                 return 0;
1570
1571         /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the
1572          * backend device used by the console. We try to make a better guess here since some consoles might
1573          * not have support for color mode for example.
1574          *
1575          * However if TERM was configured through the kernel command line then leave it alone. */
1576         _cleanup_free_ char *term = NULL;
1577         r = proc_cmdline_get_key("TERM", 0, &term);
1578         if (r < 0)
1579                 return r;
1580         if (r > 0) {
1581                 /* If we pick up $TERM, then also pick up $COLORTERM, $NO_COLOR */
1582                 FOREACH_STRING(v, "COLORTERM", "NO_COLOR") {
1583                         _cleanup_free_ char *vv = NULL;
1584                         r = proc_cmdline_get_key(v, 0, &vv);
1585                         if (r < 0)
1586                                 return r;
1587                         if (r > 0 && setenv(v, vv, /* overwrite= */ true) < 0)
1588                                 return -errno;
1589                 }
1590         } else {
1591                 /* If no $TERM is set then look for the per-tty variable instead */
1592                 r = proc_cmdline_get_key("systemd.tty.term.console", 0, &term);
1593                 if (r < 0)
1594                         return r;
1595         }
1596
1597         if (!term)
1598                 (void) query_term_for_tty("/dev/console", &term);
1599
1600         if (setenv("TERM", term ?: FALLBACK_TERM, /* overwrite= */ true) < 0)
1601                 return -errno;
1602
1603         /* The kernels sets HOME=/ for init. Let's undo this. */
1604         if (path_equal(getenv("HOME"), "/"))
1605                 assert_se(unsetenv("HOME") == 0);
1606
1607         return 0;
1608 }
1609
static void redirect_telinit(int argc, char *argv[]) {

        /* SysV compatibility: invoking "init" as a regular process (i.e. not as PID 1) is supposed to
         * behave like telinit. In that case we replace ourselves with SYSTEMCTL_BINARY_PATH, passing argv
         * through unchanged. Compiled out entirely without HAVE_SYSV_COMPAT. */

#if HAVE_SYSV_COMPAT
        /* Carry on normally if we are the real init, or were not invoked under the name "init". */
        if (getpid_cached() == 1 || !invoked_as(argv, "init"))
                return;

        execv(SYSTEMCTL_BINARY_PATH, argv);

        /* We only get here if execv() failed. */
        log_error_errno(errno, "Failed to execute %s: %m", SYSTEMCTL_BINARY_PATH);
        exit(EXIT_FAILURE);
#endif
}
1626
/* Replaces the current process with SYSTEMD_SHUTDOWN_BINARY_PATH via execve(), passing the shutdown verb
 * matching 'objective', the configured stop timeout, the current logging configuration and 'retval' as
 * --exit-code=. Before that, the watchdog is reconfigured and the environment is extended with the
 * WATCHDOG_* variables. On success this function does not return; if execve() fails, -errno is returned. */
static int become_shutdown(int objective, int retval) {
        /* Maps the objectives handled here to the verb passed to the shutdown binary. Objectives without
         * an entry are NULL and trip the assert() below. */
        static const char* const table[_MANAGER_OBJECTIVE_MAX] = {
                [MANAGER_EXIT]     = "exit",
                [MANAGER_REBOOT]   = "reboot",
                [MANAGER_POWEROFF] = "poweroff",
                [MANAGER_HALT]     = "halt",
                [MANAGER_KEXEC]    = "kexec",
        };

        /* NOTE(review): exit_code[] is sized for a uint8_t value while retval is an int formatted with %d;
         * presumably callers only pass values in the 0…255 range — confirm. */
        char timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
             exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)];

        _cleanup_strv_free_ char **env_block = NULL;
        _cleanup_free_ char *max_log_levels = NULL;
        usec_t watchdog_timer = 0;
        int r;

        assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
        assert(table[objective]);

        xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec);

        /* Room for the 3 fixed entries below plus up to 7 optional ones appended further down (2 for the
         * log level, 1 for the log target, 3 for the color/location/time switches, 1 for the exit code),
         * plus the terminating NULL. */
        const char* command_line[11] = {
                SYSTEMD_SHUTDOWN_BINARY_PATH,
                table[objective],
                timeout,
                /* Note that the last position is a terminator and must contain NULL. */
        };
        size_t pos = 3;

        /* Sanity check that 'pos' really points right behind the last initialized entry. */
        assert(command_line[pos-1]);
        assert(!command_line[pos]);

        /* Best effort: if the levels cannot be formatted we simply do not pass --log-level. */
        (void) log_max_levels_to_string(log_get_max_level(), &max_log_levels);

        if (max_log_levels) {
                command_line[pos++] = "--log-level";
                command_line[pos++] = max_log_levels;
        }

        /* Translate our log target: the kmsg-capable targets become kmsg, null stays null, everything
         * else is sent to the console. */
        switch (log_get_target()) {

        case LOG_TARGET_KMSG:
        case LOG_TARGET_JOURNAL_OR_KMSG:
        case LOG_TARGET_SYSLOG_OR_KMSG:
                command_line[pos++] = "--log-target=kmsg";
                break;

        case LOG_TARGET_NULL:
                command_line[pos++] = "--log-target=null";
                break;

        case LOG_TARGET_CONSOLE:
        default:
                command_line[pos++] = "--log-target=console";
        };

        if (log_get_show_color())
                command_line[pos++] = "--log-color";

        if (log_get_show_location())
                command_line[pos++] = "--log-location";

        if (log_get_show_time())
                command_line[pos++] = "--log-time";

        xsprintf(exit_code, "--exit-code=%d", retval);
        command_line[pos++] = exit_code;

        /* Strictly less: the slot at 'pos' must remain the NULL terminator. */
        assert(pos < ELEMENTSOF(command_line));

        /* The watchdog: */

        /* Only reboot and kexec get a non-zero timeout; for all other objectives it stays 0. */
        if (objective == MANAGER_REBOOT)
                watchdog_timer = arg_reboot_watchdog;
        else if (objective == MANAGER_KEXEC)
                watchdog_timer = arg_kexec_watchdog;

        /* If we reboot or kexec let's set the shutdown watchdog and tell the
         * shutdown binary to repeatedly ping it.
         * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */
        (void) watchdog_setup_pretimeout(0);
        (void) watchdog_setup_pretimeout_governor(NULL);
        r = watchdog_setup(watchdog_timer);
        /* Close our handle on the watchdog; ask for it to be disarmed only if setting it up failed. */
        watchdog_close(/* disarm= */ r < 0);

        /* The environment block: */

        /* NOTE(review): the result of strv_copy() is not checked; presumably the strv_extendf() calls
         * below cope with a NULL list — confirm. */
        env_block = strv_copy(environ);

        /* Tell the binary how often to ping, ignore failure */
        (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer);

        /* Make sure that tools that look for $WATCHDOG_USEC (and might get started by the exitrd) don't get
         * confused by the variable, because the sd_watchdog_enabled() protocol uses the same variable for
         * the same purposes. */
        (void) strv_extendf(&env_block, "WATCHDOG_PID=" PID_FMT, getpid_cached());

        if (arg_watchdog_device)
                (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device);

        /* Best effort, a failure to write the sequence must not hold up shutdown. */
        (void) write_boot_or_shutdown_osc("shutdown");

        execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
        /* Only reached if execve() failed. */
        return -errno;
}
1733
1734 static void initialize_clock_timewarp(void) {
1735         int r;
1736
1737         /* This is called very early on, before we parse the kernel command line or otherwise figure out why
1738          * we are running, but only once. */
1739
1740         if (clock_is_localtime(NULL) > 0) {
1741                 int min;
1742
1743                 /* The very first call of settimeofday() also does a time warp in the kernel.
1744                  *
1745                  * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to
1746                  * take care of maintaining the RTC and do all adjustments. This matches the behavior of
1747                  * Windows, which leaves the RTC alone if the registry tells that the RTC runs in UTC.
1748                  */
1749                 r = clock_set_timezone(&min);
1750                 if (r < 0)
1751                         log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
1752                 else
1753                         log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
1754
1755         } else if (!in_initrd())
1756                 /*
1757                  * Do a dummy very first call to seal the kernel's time warp magic.
1758                  *
1759                  * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with
1760                  * LOCAL, but the real system could be set up that way. In such case, we need to delay the
1761                  * time-warp or the sealing until we reach the real system.
1762                  *
1763                  * Do no set the kernel's timezone. The concept of local time cannot be supported reliably,
1764                  * the time will jump or be incorrect at every daylight saving time change. All kernel local
1765                  * time concepts will be treated as UTC that way.
1766                  */
1767                 (void) clock_reset_timewarp();
1768 }
1769
1770 static void apply_clock_update(void) {
1771         /* This is called later than clock_apply_epoch(), i.e. after we have parsed
1772          * configuration files/kernel command line and such. */
1773
1774         if (arg_clock_usec == 0)
1775                 return;
1776
1777         if (getpid_cached() != 1)
1778                 return;
1779
1780         if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) < 0)
1781                 log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
1782         else
1783                 log_info("Set system clock to %s, as specified on the kernel command line.",
1784                          FORMAT_TIMESTAMP(arg_clock_usec));
1785 }
1786
1787 static void cmdline_take_random_seed(void) {
1788         size_t suggested;
1789         int r;
1790
1791         if (arg_random_seed_size == 0)
1792                 return;
1793
1794         if (getpid_cached() != 1)
1795                 return;
1796
1797         assert(arg_random_seed);
1798         suggested = random_pool_size();
1799
1800         if (arg_random_seed_size < suggested)
1801                 log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
1802                             arg_random_seed_size, suggested);
1803
1804         r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
1805         if (r < 0) {
1806                 log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
1807                 return;
1808         }
1809
1810         log_notice("Successfully credited entropy passed on kernel command line.\n"
1811                    "Note that the seed provided this way is accessible to unprivileged programs. "
1812                    "This functionality should not be used outside of testing environments.");
1813 }
1814
1815 static void initialize_coredump(bool skip_setup) {
1816         if (getpid_cached() != 1)
1817                 return;
1818
1819         /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour
1820          * the limit) will process core dumps for system services by default. */
1821         if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
1822                 log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");
1823
1824         /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored
1825          * until the systemd-coredump tool is enabled via sysctl. However it can be changed via the kernel
1826          * command line later so core dumps can still be generated during early startup and in initrd. */
1827         if (!skip_setup)
1828                 disable_coredumps();
1829 }
1830
1831 static void initialize_core_pattern(bool skip_setup) {
1832         int r;
1833
1834         if (skip_setup || !arg_early_core_pattern)
1835                 return;
1836
1837         if (getpid_cached() != 1)
1838                 return;
1839
1840         r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
1841         if (r < 0)
1842                 log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m",
1843                                   arg_early_core_pattern);
1844 }
1845
1846 static void apply_protect_system(bool skip_setup) {
1847         int r;
1848
1849         if (skip_setup || getpid_cached() != 1 || arg_protect_system == 0)
1850                 return;
1851
1852         if (arg_protect_system < 0 && !in_initrd()) {
1853                 log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping.");
1854                 return;
1855         }
1856
1857         r = make_mount_point("/usr");
1858         if (r < 0) {
1859                 log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m");
1860                 return;
1861         }
1862
1863         if (mount_nofollow_verbose(
1864                         LOG_WARNING,
1865                         /* what= */ NULL,
1866                         "/usr",
1867                         /* fstype= */ NULL,
1868                         MS_BIND|MS_REMOUNT|MS_RDONLY,
1869                         /* options= */ NULL) < 0)
1870                 return;
1871
1872         log_info("Successfully made /usr/ read-only.");
1873 }
1874
1875 static void update_cpu_affinity(bool skip_setup) {
1876         _cleanup_free_ char *mask = NULL;
1877
1878         if (skip_setup || !arg_cpu_affinity.set)
1879                 return;
1880
1881         assert(arg_cpu_affinity.allocated > 0);
1882
1883         mask = cpu_set_to_range_string(&arg_cpu_affinity);
1884         log_debug("Setting CPU affinity to {%s}.", strnull(mask));
1885
1886         if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
1887                 log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m");
1888 }
1889
1890 static void update_numa_policy(bool skip_setup) {
1891         int r;
1892         _cleanup_free_ char *nodes = NULL;
1893         const char * policy = NULL;
1894
1895         if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
1896                 return;
1897
1898         if (DEBUG_LOGGING) {
1899                 policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
1900                 nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
1901                 log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(policy), strnull(nodes));
1902         }
1903
1904         r = apply_numa_policy(&arg_numa_policy);
1905         if (r == -EOPNOTSUPP)
1906                 log_debug_errno(r, "NUMA support not available, ignoring.");
1907         else if (r < 0)
1908                 log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
1909 }
1910
/* Appends the entries src[1] … src[argc-1] to dst[], starting at index *dst_index and advancing it for each
 * entry copied, leaving out the options listed below. The strings themselves are not duplicated. The
 * caller must make sure dst[] is large enough. */
static void filter_args(
                const char* dst[],
                size_t *dst_index,
                char **src,
                int argc) {

        assert(dst);
        assert(dst_index);

        for (int i = 1; i < argc; i++) {
                const char *arg = src[i];

                /* Options that carry their value in the following argument: drop both. This covers
                 * deserialization data and target unit designators. We already acted upon the latter and
                 * have queued appropriate jobs, which we don't want to redo after reexecution. */
                if (STR_IN_SET(arg, "--deserialize", "--unit")) {
                        i++;
                        continue;
                }

                /* Stand-alone switches, and the "--option=value" spellings of the two options above. */
                bool drop =
                        STR_IN_SET(arg,
                                   "--switched-root",
                                   "--system",
                                   "--user") ||
                        startswith(arg, "--deserialize=") ||
                        startswith(arg, "--unit=");
                if (drop)
                        continue;

                /* Anything else is a regular option, hand it over to the new instance. */
                dst[*dst_index] = src[i];
                (*dst_index)++;
        }
}
1948
1949 static void finish_remaining_processes(ManagerObjective objective) {
1950         assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
1951
1952         /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
1953          * SIGCHLD for them after deserializing. */
1954         if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
1955                 broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
1956
1957         /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
1958          * of units that were configured with SurviveFinalKillSignal=yes. */
1959         if (objective == MANAGER_SOFT_REBOOT)
1960                 broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
1961 }
1962
/* Replaces the running manager with a new process image. Used for MANAGER_REEXECUTE,
 * MANAGER_SWITCH_ROOT and MANAGER_SOFT_REBOOT. The candidates are tried in this order:
 *
 *   1. SYSTEMD_BINARY_PATH, with --deserialize= referring to arg_serialization (skipped entirely if
 *      switch_root_init is set)
 *   2. switch_root_init, if set, without serialization
 *   3. /sbin/init, without serialization
 *   4. /bin/sh, but only if executing /sbin/init failed with ENOENT
 *
 * For MANAGER_REEXECUTE a failure of step 1 is final, no fallback is attempted.
 *
 * Every successful path ends in an exec() call, hence this function only ever returns on failure. In
 * that case *ret_error_message points to a constant string describing the problem, and the value of the
 * respective log_error_errno() call is returned. */
static int do_reexecute(
                ManagerObjective objective,
                int argc,
                char* argv[],
                const struct rlimit *saved_rlimit_nofile,
                const struct rlimit *saved_rlimit_memlock,
                FDSet *fds,
                const char *switch_root_dir,
                const char *switch_root_init,
                uint64_t saved_capability_ambient_set,
                const char **ret_error_message) {

        size_t i, args_size;
        const char **args;
        int r;

        assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT));
        assert(argc >= 0);
        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);
        assert(ret_error_message);

        /* Close and disarm the watchdog, so that the new instance can reinitialize it, but the machine
         * doesn't get rebooted while we do that. */
        watchdog_close(/* disarm= */ true);

        if (!switch_root_dir && objective == MANAGER_SOFT_REBOOT) {
                /* If no switch root dir is specified, then check if /run/nextroot/ qualifies and use that */
                r = path_is_os_tree("/run/nextroot");
                if (r < 0 && r != -ENOENT)
                        log_debug_errno(r, "Failed to determine if /run/nextroot/ is a valid OS tree, ignoring: %m");
                else if (r > 0)
                        switch_root_dir = "/run/nextroot";
        }

        if (switch_root_dir) {
                /* If we're supposed to switch root, preemptively check the existence of a usable init.
                 * Otherwise the system might end up in a completely undebuggable state afterwards. */
                if (switch_root_init) {
                        r = chase_and_access(switch_root_init, switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
                        if (r < 0)
                                log_warning_errno(r, "Failed to chase configured init %s/%s: %m",
                                                  switch_root_dir, switch_root_init);
                } else {
                        r = chase_and_access(SYSTEMD_BINARY_PATH, switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
                        if (r < 0)
                                log_debug_errno(r, "Failed to chase our own binary %s/%s: %m",
                                                switch_root_dir, SYSTEMD_BINARY_PATH);
                }

                /* The preferred candidate is not usable in the new root. Check /sbin/init as the last
                 * resort, and refuse to go on if that is missing too: at this point nothing destructive
                 * has happened yet, so we can still return to the caller. */
                if (r < 0) {
                        r = chase_and_access("/sbin/init", switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
                        if (r < 0) {
                                *ret_error_message = "Switch root target contains no usable init";
                                return log_error_errno(r, "Failed to chase %s/sbin/init", switch_root_dir);
                        }
                }
        }

        /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
         * the kernel default to its child processes */
        if (saved_rlimit_nofile->rlim_cur != 0)
                (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
                (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);

        finish_remaining_processes(objective);

        if (switch_root_dir) {
                /* A failure here is only logged: we still try to execute the new binary below. */
                r = switch_root(/* new_root= */ switch_root_dir,
                                /* old_root_after= */ NULL,
                                /* flags= */ (objective == MANAGER_SWITCH_ROOT ? SWITCH_ROOT_DESTROY_OLD_ROOT : 0) |
                                             (objective == MANAGER_SOFT_REBOOT ? 0 : SWITCH_ROOT_RECURSIVE_RUN));
                if (r < 0)
                        log_error_errno(r, "Failed to switch root, trying to continue: %m");
        }

        r = capability_ambient_set_apply(saved_capability_ambient_set, /* also_inherit= */ false);
        if (r < 0)
                log_warning_errno(r, "Failed to apply the starting ambient set, ignoring: %m");

        /* Room for args[0], up to three options of our own, the terminating NULL, plus the entries taken
         * over from argv[] (NOTE(review): assumes filter_args() appends at most argc entries — confirm).
         * The array lives on the stack, and is shared by all exec attempts below. */
        args_size = argc + 5;
        args = newa(const char*, args_size);

        if (!switch_root_init) {
                char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)];

                /* First try to spawn ourselves with the right path, and with full serialization. We do this
                 * only if the user didn't specify an explicit init to spawn. */

                assert(arg_serialization);
                assert(fds);

                xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization));

                i = 1;         /* Leave args[0] empty for now. */

                /* Put our stuff first to make sure it always gets parsed in case
                 * we get weird stuff from the kernel cmdline (like --) */
                if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
                        args[i++] = "--switched-root";
                args[i++] = runtime_scope_cmdline_option_to_string(arg_runtime_scope);
                args[i++] = sfd;

                filter_args(args, &i, argv, argc);

                args[i++] = NULL;

                assert(i <= args_size);

                /*
                 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do
                 * this on its own on exec(), but it will do it on exit(). Hence, to ensure we get a
                 * summary here, fork() off a child, let it exit() cleanly, so that it prints the summary,
                 * and wait() for it in the parent, before proceeding into the exec().
                 */
                valgrind_summary_hack();

                args[0] = SYSTEMD_BINARY_PATH;
                (void) execv(args[0], (char* const*) args);

                /* If we get here the execv() failed. A plain reexecution has no fallback. */
                if (objective == MANAGER_REEXECUTE) {
                        *ret_error_message = "Failed to execute our own binary";
                        return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]);
                }

                log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]);
        }

        /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and
         * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[],
         * but let's hope that doesn't matter.) */

        arg_serialization = safe_fclose(arg_serialization);
        fds = fdset_free(fds);

        /* Drop /run/systemd directory. Some of its content can be used as a flag indicating that systemd is
         * the init system but we might be replacing it with something different. If systemd is used again it
         * will recreate the directory and its content anyway. */
        r = rm_rf("/run/systemd.pre-switch-root", REMOVE_ROOT|REMOVE_MISSING_OK);
        if (r < 0)
                log_warning_errno(r, "Failed to prepare /run/systemd.pre-switch-root/, ignoring: %m");

        r = RET_NERRNO(rename("/run/systemd", "/run/systemd.pre-switch-root"));
        if (r < 0)
                log_warning_errno(r, "Failed to move /run/systemd/ to /run/systemd.pre-switch-root/, ignoring: %m");

        /* Reopen the console */
        (void) make_console_stdio();

        /* Note the "<=": this also copies argv[argc], i.e. what is by the usual argv[] convention the
         * terminating NULL entry of the vector. */
        i = 1;         /* Leave args[0] empty for now. */
        for (int j = 1; j <= argc; j++)
                args[i++] = argv[j];
        assert(i <= args_size);

        /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */
        (void) reset_all_signal_handlers();
        (void) reset_signal_mask();
        (void) rlimit_nofile_safe();

        if (switch_root_init) {
                args[0] = switch_root_init;
                (void) execve(args[0], (char* const*) args, saved_env);
                log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]);
        }

        args[0] = "/sbin/init";
        (void) execv(args[0], (char* const*) args);
        r = -errno;
        *ret_error_message = "Failed to execute /sbin/init";

        /* Only a missing /sbin/init makes us try the shell; any other execv() error is reported as is. */
        if (r == -ENOENT) {
                manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
                                      ANSI_HIGHLIGHT_RED "  !!  " ANSI_NORMAL,
                                      "%s", *ret_error_message);

                log_warning_errno(r, "No /sbin/init, trying fallback shell");

                /* The shell is invoked without any of the arguments collected above. */
                args[0] = "/bin/sh";
                args[1] = NULL;
                (void) execve(args[0], (char* const*) args, saved_env);
                r = -errno;
                *ret_error_message = "Failed to execute fallback shell";
        }

        return log_error_errno(r, "%s, giving up: %m", *ret_error_message);
}
2150
/* Runs manager_loop() until it yields an objective that cannot be handled in place. MANAGER_RELOAD is
 * processed right here, after which the loop is entered again. For every other objective the return
 * parameters are filled in and the objective is handed back to the caller.
 *
 * Returns the (non-negative) objective on success. If the main loop or the preparation for reexecution
 * fails, a negative error is returned and *ret_error_message points to a constant string; the other
 * return parameters are not filled in by the code in this function in that case. */
static int invoke_main_loop(
                Manager *m,
                const struct rlimit *saved_rlimit_nofile,
                const struct rlimit *saved_rlimit_memlock,
                int *ret_retval,                   /* Return parameters relevant for shutting down */
                FDSet **ret_fds,                   /* Return parameters for reexecuting */
                char **ret_switch_root_dir,        /* … */
                char **ret_switch_root_init,       /* … */
                const char **ret_error_message) {

        int r;

        assert(m);
        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);
        assert(ret_retval);
        assert(ret_fds);
        assert(ret_switch_root_dir);
        assert(ret_switch_root_init);
        assert(ret_error_message);

        for (;;) {
                int objective = manager_loop(m);
                if (objective < 0) {
                        *ret_error_message = "Failed to run main loop";
                        return log_struct_errno(LOG_EMERG, objective,
                                                LOG_MESSAGE("Failed to run main loop: %m"),
                                                LOG_MESSAGE_ID(SD_MESSAGE_CORE_MAINLOOP_FAILED_STR));
                }

                /* Ensure shutdown timestamp is taken even when bypassing the job engine */
                if (IN_SET(objective,
                           MANAGER_SOFT_REBOOT,
                           MANAGER_REBOOT,
                           MANAGER_KEXEC,
                           MANAGER_HALT,
                           MANAGER_POWEROFF) &&
                    !dual_timestamp_is_set(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START))
                        dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START);

                switch (objective) {

                case MANAGER_RELOAD: {
                        LogTarget saved_log_target;
                        int saved_log_level;

                        manager_send_reloading(m);

                        log_info("Reloading...");

                        /* First, save any overridden log level/target, then parse the configuration file,
                         * which might change the log level to new settings. */

                        saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
                        saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;

                        (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);

                        set_manager_defaults(m);
                        set_manager_settings(m);

                        update_cpu_affinity(false);
                        update_numa_policy(false);

                        /* Reinstate the overrides saved above. The ">= 0" checks assume that
                         * _LOG_TARGET_INVALID is negative, like the -1 used for the log level. */
                        if (saved_log_level >= 0)
                                manager_override_log_level(m, saved_log_level);
                        if (saved_log_target >= 0)
                                manager_override_log_target(m, saved_log_target);

                        if (manager_reload(m) < 0)
                                /* Reloading failed before the point of no return.
                                 * Let's continue running as if nothing happened. */
                                m->objective = MANAGER_OK;
                        else
                                log_info("Reloading finished in " USEC_FMT " ms.",
                                         usec_sub_unsigned(now(CLOCK_MONOTONIC), m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].monotonic) / USEC_PER_MSEC);

                        /* In either case, go back into the main loop. */
                        continue;
                }

                case MANAGER_REEXECUTE:

                        manager_send_reloading(m); /* From the perspective of the manager calling us this is
                                                    * pretty much the same as a reload */

                        r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
                        if (r < 0) {
                                *ret_error_message = "Failed to prepare for reexecution";
                                return r;
                        }

                        log_notice("Reexecuting.");

                        /* NOTE(review): presumably the exit status to use in case the reexecution does not
                         * succeed — confirm against the caller. */
                        *ret_retval = EXIT_FAILURE;
                        *ret_switch_root_dir = *ret_switch_root_init = NULL;

                        return objective;

                case MANAGER_SWITCH_ROOT:

                        manager_send_reloading(m); /* From the perspective of the manager calling us this is
                                                    * pretty much the same as a reload */

                        manager_set_switching_root(m, true);

                        /* State is only serialized if no explicit init was requested for the new root.
                         * With an explicit init no fds are handed over. This check has to happen before
                         * m->switch_root_init is stolen below. */
                        if (!m->switch_root_init) {
                                r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
                                if (r < 0) {
                                        *ret_error_message = "Failed to prepare for reexecution";
                                        return r;
                                }
                        } else
                                *ret_fds = NULL;

                        log_notice("Switching root.");

                        *ret_retval = EXIT_FAILURE;

                        /* Steal the switch root parameters */
                        *ret_switch_root_dir = TAKE_PTR(m->switch_root);
                        *ret_switch_root_init = TAKE_PTR(m->switch_root_init);

                        return objective;

                case MANAGER_SOFT_REBOOT:
                        manager_send_reloading(m);
                        manager_set_switching_root(m, true);

                        r = prepare_reexecute(m, &arg_serialization, ret_fds, /* switching_root= */ true);
                        if (r < 0) {
                                *ret_error_message = "Failed to prepare for reexecution";
                                return r;
                        }

                        log_notice("Soft-rebooting.");

                        /* Unlike for MANAGER_SWITCH_ROOT only the directory is passed on, never an init. */
                        *ret_retval = EXIT_FAILURE;
                        *ret_switch_root_dir = TAKE_PTR(m->switch_root);
                        *ret_switch_root_init = NULL;

                        return objective;

                case MANAGER_EXIT:
                        /* A user manager simply exits. For any other manager MANAGER_EXIT is treated like
                         * the shutdown objectives below. */
                        if (MANAGER_IS_USER(m)) {
                                log_debug("Exit.");

                                *ret_retval = m->return_value;
                                *ret_fds = NULL;
                                *ret_switch_root_dir = *ret_switch_root_init = NULL;

                                return objective;
                        }

                        _fallthrough_;
                case MANAGER_REBOOT:
                case MANAGER_POWEROFF:
                case MANAGER_HALT:
                case MANAGER_KEXEC: {
                        log_notice("Shutting down.");

                        *ret_retval = m->return_value;
                        *ret_fds = NULL;
                        *ret_switch_root_dir = *ret_switch_root_init = NULL;

                        return objective;
                }

                default:
                        assert_not_reached();
                }
        }
}
2323
2324 static void log_execution_mode(bool *ret_first_boot) {
2325         bool first_boot = false;
2326         int r;
2327
2328         assert(ret_first_boot);
2329
2330         switch (arg_runtime_scope) {
2331
2332         case RUNTIME_SCOPE_SYSTEM: {
2333                 struct utsname uts;
2334                 int v;
2335
2336                 log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)",
2337                          arg_action == ACTION_TEST ? "test " : "",
2338                          systemd_features);
2339
2340                 v = detect_virtualization();
2341                 if (v > 0)
2342                         log_info("Detected virtualization %s.", virtualization_to_string(v));
2343
2344                 v = detect_confidential_virtualization();
2345                 if (v > 0)
2346                         log_info("Detected confidential virtualization %s.", confidential_virtualization_to_string(v));
2347
2348                 log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
2349
2350                 if (in_initrd())
2351                         log_info("Running in initrd.");
2352                 else {
2353                         _cleanup_free_ char *id_text = NULL;
2354
2355                         /* Let's check whether we are in first boot. First, check if an override was
2356                          * specified on the kernel command line. If yes, we honour that. */
2357
2358                         r = proc_cmdline_get_bool("systemd.condition_first_boot", /* flags = */ 0, &first_boot);
2359                         if (r < 0)
2360                                 log_debug_errno(r, "Failed to parse systemd.condition_first_boot= kernel command line argument, ignoring: %m");
2361
2362                         if (r > 0)
2363                                 log_full(first_boot ? LOG_INFO : LOG_DEBUG,
2364                                          "Kernel command line argument says we are %s first boot.",
2365                                          first_boot ? "in" : "not in");
2366                         else {
2367                                 /* Second, perform autodetection. We use /etc/machine-id as flag file for
2368                                  * this: If it is missing or contains the value "uninitialized", this is the
2369                                  * first boot. In other cases, it is not. This allows container managers and
2370                                  * installers to provision a couple of files in /etc but still permit the
2371                                  * first-boot initialization to occur. If the container manager wants to
2372                                  * provision the machine ID it should pass $container_uuid to PID 1. */
2373
2374                                 r = read_one_line_file("/etc/machine-id", &id_text);
2375                                 if (r < 0 || streq(id_text, "uninitialized")) {
2376                                         if (r < 0 && r != -ENOENT)
2377                                                 log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m");
2378
2379                                         first_boot = true;
2380                                         log_info("Detected first boot.");
2381                                 } else
2382                                         log_debug("Detected initialized system, this is not the first boot.");
2383                         }
2384                 }
2385
2386                 assert_se(uname(&uts) >= 0);
2387
2388                 if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
2389                         log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. "
2390                                     "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION);
2391                 else
2392                         log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION);
2393
2394                 break;
2395         }
2396
2397         case RUNTIME_SCOPE_USER:
2398                 if (DEBUG_LOGGING) {
2399                         _cleanup_free_ char *t = NULL;
2400
2401                         t = uid_to_name(getuid());
2402                         log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
2403                                   arg_action == ACTION_TEST ? " test" : "",
2404                                   getuid(), strna(t), systemd_features);
2405                 }
2406
2407                 break;
2408
2409         default:
2410                 assert_not_reached();
2411         }
2412
2413         *ret_first_boot = first_boot;
2414 }
2415
/* Applies the process-wide runtime settings of the manager. This is a no-op unless arg_action is
 * ACTION_RUN.
 *
 *   skip_setup            if true, the parts that are bracketed by "if (!skip_setup)" below are left
 *                         out; it is also passed on to update_cpu_affinity()/update_numa_policy()
 *   first_boot            if true, MACHINE_ID_SETUP_FORCE_TRANSIENT is passed to machine_id_setup()
 *   saved_rlimit_nofile,
 *   saved_rlimit_memlock  handed to bump_rlimit_nofile()/bump_rlimit_memlock() at the very end
 *   saved_ambient_set     receives the ambient capability set as queried before it is cleared
 *   ret_error_message     set to a constant string on those failures that make this function fail
 *
 * Returns 0 on success. Most problems are only logged and ignored. The fatal ones are: an unpopulated
 * /usr/ (-ENOEXEC), failure to drop the capability bounding set, failure to set PR_SET_NO_NEW_PRIVS,
 * failure to determine the user runtime directory and failure to enforce the system call
 * architectures. */
static int initialize_runtime(
                bool skip_setup,
                bool first_boot,
                struct rlimit *saved_rlimit_nofile,
                struct rlimit *saved_rlimit_memlock,
                uint64_t *saved_ambient_set,
                const char **ret_error_message) {

        int r;

        assert(saved_ambient_set);
        assert(ret_error_message);

        /* Sets up various runtime parameters. Many of these initializations are conditionalized:
         *
         * - Some only apply to --system instances
         * - Some only apply to --user instances
         * - Some only apply when we first start up, but not when we reexecute
         */

        if (arg_action != ACTION_RUN)
                return 0;

        update_cpu_affinity(skip_setup);
        update_numa_policy(skip_setup);

        switch (arg_runtime_scope) {

        case RUNTIME_SCOPE_SYSTEM:
                /* Make sure we leave a core dump without panicking the kernel. */
                install_crash_handler();

                if (!skip_setup) {
                        /* Check that /usr/ is either on the same file system as / or mounted already. */
                        if (dir_is_empty("/usr", /* ignore_hidden_or_backup = */ true) > 0) {
                                *ret_error_message = "Refusing to run in unsupported environment where /usr/ is not populated";
                                return -ENOEXEC;
                        }

                        /* Pull credentials from various sources into a common credential directory (we do
                         * this here, before setting up the machine ID, so that we can use credential info
                         * for setting up the machine ID) */
                        (void) import_credentials();

                        (void) os_release_status();
                        (void) machine_id_setup(/* root = */ NULL, arg_machine_id,
                                                (first_boot ? MACHINE_ID_SETUP_FORCE_TRANSIENT : 0) |
                                                (arg_machine_id_from_firmware ? MACHINE_ID_SETUP_FORCE_FIRMWARE : 0),
                                                /* ret = */ NULL);
                        (void) hostname_setup(/* really = */ true);
                        (void) loopback_setup();

                        bump_unix_max_dgram_qlen();
                        bump_file_max_and_nr_open();

                        write_container_id();

                        (void) write_boot_or_shutdown_osc("boot");

                        /* Copy os-release to the propagate directory, so that we update it for services running
                         * under RootDirectory=/RootImage= when we do a soft reboot. */
                        r = setup_os_release(RUNTIME_SCOPE_SYSTEM);
                        if (r < 0)
                                log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
                }

                /* Everything from here on is done regardless of skip_setup. */
                r = watchdog_set_device(arg_watchdog_device);
                if (r < 0)
                        log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);

                /* Nothing to drop if the configured bounding set contains all capabilities. */
                if (!cap_test_all(arg_capability_bounding_set)) {
                        r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
                        if (r < 0) {
                                *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
                                return log_struct_errno(LOG_EMERG, r,
                                                        LOG_MESSAGE("Failed to drop capability bounding set of usermode helpers: %m"),
                                                        LOG_MESSAGE_ID(SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR));
                        }

                        r = capability_bounding_set_drop(arg_capability_bounding_set, true);
                        if (r < 0) {
                                *ret_error_message = "Failed to drop capability bounding set";
                                return log_struct_errno(LOG_EMERG, r,
                                                        LOG_MESSAGE("Failed to drop capability bounding set: %m"),
                                                        LOG_MESSAGE_ID(SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR));
                        }
                }

                if (arg_no_new_privs) {
                        if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                *ret_error_message = "Failed to disable new privileges";
                                return log_struct_errno(LOG_EMERG, errno,
                                                        LOG_MESSAGE("Failed to disable new privileges: %m"),
                                                        LOG_MESSAGE_ID(SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR));
                        }
                }

                break;

        case RUNTIME_SCOPE_USER: {
                _cleanup_free_ char *p = NULL;

                /* Create the runtime directory and place the inaccessible device nodes there, if we run in
                 * user mode. In system mode mount_setup() already did that. */

                r = xdg_user_runtime_dir("/systemd", &p);
                if (r < 0) {
                        *ret_error_message = "$XDG_RUNTIME_DIR is not set";
                        return log_struct_errno(LOG_EMERG, r,
                                                LOG_MESSAGE("Failed to determine $XDG_RUNTIME_DIR path: %m"),
                                                LOG_MESSAGE_ID(SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR));
                }

                if (!skip_setup) {
                        (void) mkdir_p_label(p, 0755);
                        (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);

                        r = setup_os_release(RUNTIME_SCOPE_USER);
                        if (r < 0)
                                log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
                }

                break;
        }

        default:
                assert_not_reached();
        }

        /* The two operations on the ambient set are meant for a user session manager. They do not affect
         * system manager operation, because by default it starts with an empty ambient set.
         *
         * Preserve the ambient set for later use with sd-executor processes. */
        r = capability_get_ambient(saved_ambient_set);
        if (r < 0)
                log_warning_errno(r, "Failed to save ambient capabilities, ignoring: %m");

        /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does
         * not affect the permitted and effective sets which are important for the manager itself to
         * operate. */
        r = capability_ambient_set_apply(0, /* also_inherit= */ false);
        if (r < 0)
                log_warning_errno(r, "Failed to reset ambient capability set, ignoring: %m");

        /* NSEC_INFINITY means that no timer slack was configured, in which case it is left alone. */
        if (arg_timer_slack_nsec != NSEC_INFINITY)
                if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
                        log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");

        if (arg_syscall_archs) {
                r = enforce_syscall_archs(arg_syscall_archs);
                if (r < 0) {
                        *ret_error_message = "Failed to set syscall architectures";
                        return r;
                }
        }

        r = make_reaper_process(true);
        if (r < 0)
                log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");

        /* Bump up RLIMIT_NOFILE for systemd itself */
        (void) bump_rlimit_nofile(saved_rlimit_nofile);
        (void) bump_rlimit_memlock(saved_rlimit_memlock);

        return 0;
}
2582
2583 static int do_queue_default_job(
2584                 Manager *m,
2585                 const char **ret_error_message) {
2586
2587         _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2588         const char *unit;
2589         Job *job;
2590         Unit *target;
2591         int r;
2592
2593         if (arg_default_unit)
2594                 unit = arg_default_unit;
2595         else if (in_initrd())
2596                 unit = SPECIAL_INITRD_TARGET;
2597         else
2598                 unit = SPECIAL_DEFAULT_TARGET;
2599
2600         log_debug("Activating default unit: %s", unit);
2601
2602         r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
2603         if (r < 0 && in_initrd() && !arg_default_unit) {
2604                 /* Fall back to default.target, which we used to always use by default. Only do this if no
2605                  * explicit configuration was given. */
2606
2607                 log_info("Falling back to %s.", SPECIAL_DEFAULT_TARGET);
2608
2609                 r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
2610         }
2611         if (r < 0) {
2612                 log_info("Falling back to %s.", SPECIAL_RESCUE_TARGET);
2613
2614                 r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
2615                 if (r < 0) {
2616                         *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
2617                                                            : "Failed to load " SPECIAL_RESCUE_TARGET;
2618                         return r;
2619                 }
2620         }
2621
2622         assert(target->load_state == UNIT_LOADED);
2623
2624         r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, &error, &job);
2625         if (r == -EPERM) {
2626                 log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
2627
2628                 sd_bus_error_free(&error);
2629
2630                 r = manager_add_job(m, JOB_START, target, JOB_REPLACE, &error, &job);
2631                 if (r < 0) {
2632                         *ret_error_message = "Failed to start default target";
2633                         return log_struct_errno(LOG_EMERG, r,
2634                                                 LOG_MESSAGE("Failed to start default target: %s", bus_error_message(&error, r)),
2635                                                 LOG_MESSAGE_ID(SD_MESSAGE_CORE_START_TARGET_FAILED_STR));
2636                 }
2637
2638         } else if (r < 0) {
2639                 *ret_error_message = "Failed to isolate default target";
2640                 return log_struct_errno(LOG_EMERG, r,
2641                                         LOG_MESSAGE("Failed to isolate default target: %s", bus_error_message(&error, r)),
2642                                         LOG_MESSAGE_ID(SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR));
2643         } else
2644                 log_info("Queued %s job for default target %s.",
2645                          job_type_to_string(job->type),
2646                          unit_status_string(job->unit, NULL));
2647
2648         m->default_unit_job_id = job->id;
2649
2650         return 0;
2651 }
2652
/* Queries the RLIMIT_NOFILE and RLIMIT_MEMLOCK settings currently in effect for this process and stores
 * them in the two caller-provided structures. A failing getrlimit() is merely warned about; this
 * function does not write to the respective structure itself in that case. */
static void save_rlimits(struct rlimit *saved_rlimit_nofile,
                         struct rlimit *saved_rlimit_memlock) {
        int r;

        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);

        r = getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (r < 0)
                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");

        r = getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
        if (r < 0)
                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
}
2665
2666 static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
2667         struct rlimit *rl;
2668
2669         if (arg_defaults.rlimit[RLIMIT_NOFILE])
2670                 return;
2671
2672         /* Make sure forked processes get limits based on the original kernel setting */
2673
2674         rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
2675         if (!rl) {
2676                 log_oom();
2677                 return;
2678         }
2679
2680         /* Bump the hard limit for system services to a substantially higher value. The default
2681          * hard limit current kernels set is pretty low (4K), mostly for historical
2682          * reasons. According to kernel developers, the fd handling in recent kernels has been
2683          * optimized substantially enough, so that we can bump the limit now, without paying too
2684          * high a price in memory or performance. Note however that we only bump the hard limit,
2685          * not the soft limit. That's because select() works the way it works, and chokes on fds
2686          * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
2687          * unexpecting programs that they get fds higher than what they can process using
2688          * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
2689          * this pitfall:  programs that are written by folks aware of the select() problem in mind
2690          * (and thus use poll()/epoll instead of select(), the way everybody should) can
2691          * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
2692          * we pass. */
2693         if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2694                 int nr;
2695
2696                 /* Get the underlying absolute limit the kernel enforces */
2697                 nr = read_nr_open();
2698
2699                 rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
2700         }
2701
2702         /* If for some reason we were invoked with a soft limit above 1024 (which should never
2703          * happen!, but who knows what we get passed in from pam_limit when invoked as --user
2704          * instance), then lower what we pass on to not confuse our children */
2705         rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
2706
2707         arg_defaults.rlimit[RLIMIT_NOFILE] = rl;
2708 }
2709
2710 static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
2711         struct rlimit *rl;
2712
2713         /* Pass the original value down to invoked processes */
2714
2715         if (arg_defaults.rlimit[RLIMIT_MEMLOCK])
2716                 return;
2717
2718         rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
2719         if (!rl) {
2720                 log_oom();
2721                 return;
2722         }
2723
2724         if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)  {
2725                 /* Raise the default limit to 8M also on old kernels and in containers (8M is the kernel
2726                  * default for this since kernel 5.16) */
2727                 rl->rlim_max = MAX(rl->rlim_max, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
2728                 rl->rlim_cur = MAX(rl->rlim_cur, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
2729         }
2730
2731         arg_defaults.rlimit[RLIMIT_MEMLOCK] = rl;
2732 }
2733
2734 static void setenv_manager_environment(void) {
2735         int r;
2736
2737         STRV_FOREACH(p, arg_manager_environment) {
2738                 log_debug("Setting '%s' in our own environment.", *p);
2739
2740                 r = putenv_dup(*p, true);
2741                 if (r < 0)
2742                         log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p);
2743         }
2744 }
2745
2746 static void reset_arguments(void) {
2747         /* Frees/resets arg_* variables, with a few exceptions commented below. */
2748
2749         arg_default_unit = mfree(arg_default_unit);
2750
2751         /* arg_runtime_scope — ignore */
2752
2753         arg_dump_core = true;
2754         arg_crash_chvt = -1;
2755         arg_crash_shell = false;
2756         arg_crash_action = CRASH_FREEZE;
2757         arg_confirm_spawn = mfree(arg_confirm_spawn);
2758         arg_show_status = _SHOW_STATUS_INVALID;
2759         arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
2760         arg_switched_root = false;
2761         arg_pager_flags = 0;
2762         arg_service_watchdogs = true;
2763
2764         unit_defaults_done(&arg_defaults);
2765         unit_defaults_init(&arg_defaults, arg_runtime_scope);
2766
2767         arg_runtime_watchdog = 0;
2768         arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
2769         arg_kexec_watchdog = 0;
2770         arg_pretimeout_watchdog = 0;
2771         arg_early_core_pattern = mfree(arg_early_core_pattern);
2772         arg_watchdog_device = mfree(arg_watchdog_device);
2773         arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
2774
2775         arg_default_environment = strv_free(arg_default_environment);
2776         arg_manager_environment = strv_free(arg_manager_environment);
2777
2778         arg_capability_bounding_set = CAP_MASK_UNSET;
2779         arg_no_new_privs = false;
2780         arg_protect_system = -1;
2781         arg_timer_slack_nsec = NSEC_INFINITY;
2782
2783         arg_syscall_archs = set_free(arg_syscall_archs);
2784
2785         /* arg_serialization — ignore */
2786
2787         arg_machine_id = (sd_id128_t) {};
2788         arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
2789
2790         cpu_set_done(&arg_cpu_affinity);
2791         numa_policy_reset(&arg_numa_policy);
2792
2793         arg_random_seed = mfree(arg_random_seed);
2794         arg_random_seed_size = 0;
2795         arg_clock_usec = 0;
2796
2797         arg_reload_limit_interval_sec = 0;
2798         arg_reload_limit_burst = 0;
2799 }
2800
2801 static void determine_default_oom_score_adjust(void) {
2802         int r, a, b;
2803
2804         /* Run our services at slightly higher OOM score than ourselves. But let's be conservative here, and
2805          * do this only if we don't run as root (i.e. only if we are run in user mode, for an unprivileged
2806          * user). */
2807
2808         if (arg_defaults.oom_score_adjust_set)
2809                 return;
2810
2811         if (getuid() == 0)
2812                 return;
2813
2814         r = get_oom_score_adjust(&a);
2815         if (r < 0)
2816                 return (void) log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m");
2817
2818         assert_cc(100 <= OOM_SCORE_ADJ_MAX);
2819         b = a >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : a + 100;
2820
2821         if (a == b)
2822                 return;
2823
2824         arg_defaults.oom_score_adjust = b;
2825         arg_defaults.oom_score_adjust_set = true;
2826 }
2827
2828 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
2829                                const struct rlimit *saved_rlimit_memlock) {
2830         int r;
2831
2832         assert(saved_rlimit_nofile);
2833         assert(saved_rlimit_memlock);
2834
2835         /* Assign configuration defaults */
2836         reset_arguments();
2837
2838         r = parse_config_file();
2839         if (r < 0)
2840                 log_warning_errno(r, "Failed to parse config file, ignoring: %m");
2841
2842         if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2843                 r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
2844                 if (r < 0)
2845                         log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2846         }
2847
2848         /* Initialize some default rlimits for services if they haven't been configured */
2849         fallback_rlimit_nofile(saved_rlimit_nofile);
2850         fallback_rlimit_memlock(saved_rlimit_memlock);
2851
2852         /* Note that this also parses bits from the kernel command line, including "debug". */
2853         log_parse_environment();
2854
2855         /* Initialize the show status setting if it hasn't been set explicitly yet */
2856         if (arg_show_status == _SHOW_STATUS_INVALID)
2857                 arg_show_status = SHOW_STATUS_YES;
2858
2859         /* Slightly raise the OOM score for our services if we are running for unprivileged users. */
2860         determine_default_oom_score_adjust();
2861
2862         /* Push variables into the manager environment block */
2863         setenv_manager_environment();
2864
2865         /* Parse log environment variables again to take into account any new environment variables. */
2866         log_parse_environment();
2867
2868         return 0;
2869 }
2870
2871 static int safety_checks(void) {
2872
2873         if (getpid_cached() == 1 &&
2874             arg_action != ACTION_RUN)
2875                 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2876                                        "Unsupported execution mode while PID 1.");
2877
2878         if (getpid_cached() == 1 &&
2879             arg_runtime_scope == RUNTIME_SCOPE_USER)
2880                 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2881                                        "Can't run --user mode as PID 1.");
2882
2883         if (arg_action == ACTION_RUN &&
2884             arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
2885             getpid_cached() != 1)
2886                 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2887                                        "Can't run system mode unless PID 1.");
2888
2889         if (arg_action == ACTION_TEST &&
2890             geteuid() == 0)
2891                 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2892                                        "Don't run test mode as root.");
2893
2894         switch (arg_runtime_scope) {
2895
2896         case RUNTIME_SCOPE_USER:
2897
2898                 if (arg_action == ACTION_RUN &&
2899                     sd_booted() <= 0)
2900                         return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2901                                                "Trying to run as user instance, but the system has not been booted with systemd.");
2902
2903                 if (arg_action == ACTION_RUN &&
2904                     !getenv("XDG_RUNTIME_DIR"))
2905                         return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
2906                                                "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
2907
2908                 break;
2909
2910         case RUNTIME_SCOPE_SYSTEM:
2911                 if (arg_action == ACTION_RUN &&
2912                     running_in_chroot() > 0)
2913                         return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2914                                                "Cannot be run in a chroot() environment.");
2915                 break;
2916
2917         default:
2918                 assert_not_reached();
2919         }
2920
2921         return 0;
2922 }
2923
2924 static int initialize_security(
2925                 bool *loaded_policy,
2926                 dual_timestamp *security_start_timestamp,
2927                 dual_timestamp *security_finish_timestamp,
2928                 const char **ret_error_message) {
2929
2930         int r;
2931
2932         assert(loaded_policy);
2933         assert(security_start_timestamp);
2934         assert(security_finish_timestamp);
2935         assert(ret_error_message);
2936
2937         dual_timestamp_now(security_start_timestamp);
2938
2939         r = mac_selinux_setup(loaded_policy);
2940         if (r < 0) {
2941                 *ret_error_message = "Failed to load SELinux policy";
2942                 return r;
2943         }
2944
2945         r = mac_smack_setup(loaded_policy);
2946         if (r < 0) {
2947                 *ret_error_message = "Failed to load SMACK policy";
2948                 return r;
2949         }
2950
2951         r = mac_apparmor_setup();
2952         if (r < 0) {
2953                 *ret_error_message = "Failed to load AppArmor policy";
2954                 return r;
2955         }
2956
2957         r = ima_setup();
2958         if (r < 0) {
2959                 *ret_error_message = "Failed to load IMA policy";
2960                 return r;
2961         }
2962
2963         r = ipe_setup();
2964         if (r < 0) {
2965                 *ret_error_message = "Failed to load IPE policy";
2966                 return r;
2967         }
2968
2969         dual_timestamp_now(security_finish_timestamp);
2970         return 0;
2971 }
2972
2973 static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
2974         int r;
2975
2976         assert(ret_fds);
2977         assert(ret_error_message);
2978
2979         /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
2980          * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies
2981          * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe
2982          * mechanism to distinguish passed in fds from our own.
2983          *
2984          * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our
2985          * process behind our back. We should not take possession of that (and then accidentally close
2986          * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */
2987         r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds);
2988         if (r < 0) {
2989                 *ret_error_message = "Failed to allocate fd set";
2990                 return log_struct_errno(LOG_EMERG, r,
2991                                         LOG_MESSAGE("Failed to allocate fd set: %m"),
2992                                         LOG_MESSAGE_ID(SD_MESSAGE_CORE_FD_SET_FAILED_STR));
2993         }
2994
2995         /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */
2996         assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
2997
2998         return 0;
2999 }
3000
3001 static void setup_console_terminal(bool skip_setup) {
3002
3003         if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM)
3004                 return;
3005
3006         /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a
3007          * controlling tty. */
3008         terminal_detach_session();
3009
3010         /* Reset the console, but only if this is really init and we are freshly booted */
3011         if (!skip_setup)
3012                 (void) console_setup();
3013 }
3014
static bool early_skip_setup_check(int argc, char *argv[]) {
        bool reexecuting = false;

        /* Takes a quick peek at the command line to find out whether this is a reexecution or a normal
         * boot-up; the full command line parsing only happens much later. A reexecution is recognized
         * by a --deserialize switch. After switching root we deserialize too, but still want the
         * special setup to take place, hence --switched-root overrides everything else. */

        for (int i = 1; i < argc; i++) {
                const char *arg = argv[i];

                if (streq(arg, "--switched-root"))
                        return false; /* Switched root: never skip the setup */

                if (streq(arg, "--deserialize") || startswith(arg, "--deserialize="))
                        reexecuting = true;
        }

        /* If we are deserializing we are reexecuting, and the extensive setup can be avoided */
        return reexecuting;
}
3030
3031 static int save_env(void) {
3032         char **l;
3033
3034         l = strv_copy(environ);
3035         if (!l)
3036                 return log_oom();
3037
3038         strv_free_and_replace(saved_env, l);
3039         return 0;
3040 }
3041
3042 int main(int argc, char *argv[]) {
3043         dual_timestamp
3044                 initrd_timestamp = DUAL_TIMESTAMP_NULL,
3045                 userspace_timestamp = DUAL_TIMESTAMP_NULL,
3046                 kernel_timestamp = DUAL_TIMESTAMP_NULL,
3047                 security_start_timestamp = DUAL_TIMESTAMP_NULL,
3048                 security_finish_timestamp = DUAL_TIMESTAMP_NULL;
3049         struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
3050                 saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
3051                                                                           * in. Note we use different values
3052                                                                           * for the two that indicate whether
3053                                                                           * these fields are initialized! */
3054         bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
3055         char *switch_root_dir = NULL, *switch_root_init = NULL;
3056         usec_t before_startup, after_startup;
3057         static char systemd[] = "systemd";
3058         const char *error_message = NULL;
3059         uint64_t saved_ambient_set = 0;
3060         int r, retval = EXIT_FAILURE;
3061         Manager *m = NULL;
3062         FDSet *fds = NULL;
3063
3064         assert_se(argc > 0 && !isempty(argv[0]));
3065
3066         /* SysV compatibility: redirect init → telinit */
3067         redirect_telinit(argc, argv);
3068
3069         /* Take timestamps early on */
3070         dual_timestamp_from_monotonic(&kernel_timestamp, 0);
3071         dual_timestamp_now(&userspace_timestamp);
3072
3073         /* Figure out whether we need to initialize the system, or if we already did that because we are
3074          * reexecuting. */
3075         skip_setup = early_skip_setup_check(argc, argv);
3076
3077         /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent
3078          * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd
3079          * right-away. */
3080         program_invocation_short_name = systemd;
3081         (void) prctl(PR_SET_NAME, systemd);
3082
3083         /* Save the original command line */
3084         save_argc_argv(argc, argv);
3085
3086         /* Save the original environment as we might need to restore it if we're requested to execute another
3087          * system manager later. */
3088         r = save_env();
3089         if (r < 0) {
3090                 error_message = "Failed to copy environment block";
3091                 goto finish;
3092         }
3093
3094         /* Make sure that if the user says "syslog" we actually log to the journal. */
3095         log_set_upgrade_syslog_to_journal(true);
3096
3097         if (getpid_cached() == 1) {
3098                 /* When we run as PID 1 force system mode */
3099                 arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
3100
3101                 /* Disable the umask logic */
3102                 umask(0);
3103
3104                 /* Make sure that at least initially we do not ever log to journald/syslogd, because it might
3105                  * not be activated yet (even though the log socket for it exists). */
3106                 log_set_prohibit_ipc(true);
3107
3108                 /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This
3109                  * is important so that we never end up logging to any foreign stderr, for example if we have
3110                  * to log in a child process right before execve()'ing the actual binary, at a point in time
3111                  * where socket activation stderr/stdout are already set up. */
3112                 log_set_always_reopen_console(true);
3113
3114                 if (detect_container() <= 0) {
3115
3116                         /* Running outside of a container as PID 1 */
3117                         log_set_target_and_open(LOG_TARGET_KMSG);
3118
3119                         if (in_initrd())
3120                                 initrd_timestamp = userspace_timestamp;
3121
3122                         if (!skip_setup) {
3123                                 r = mount_setup_early();
3124                                 if (r < 0) {
3125                                         error_message = "Failed to mount early API filesystems";
3126                                         goto finish;
3127                                 }
3128                         }
3129
3130                         /* We might have just mounted /proc, so let's try to parse the kernel
3131                          * command line log arguments immediately. */
3132                         log_parse_environment();
3133
3134                         /* Let's open the log backend a second time, in case the first time didn't
3135                          * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
3136                          * available, and it previously wasn't. */
3137                         log_open();
3138
3139                         if (!skip_setup) {
3140                                 disable_printk_ratelimit();
3141
3142                                 r = initialize_security(
3143                                                 &loaded_policy,
3144                                                 &security_start_timestamp,
3145                                                 &security_finish_timestamp,
3146                                                 &error_message);
3147                                 if (r < 0)
3148                                         goto finish;
3149                         }
3150
3151                         r = mac_init();
3152                         if (r < 0) {
3153                                 error_message = "Failed to initialize MAC support";
3154                                 goto finish;
3155                         }
3156
3157                         if (!skip_setup)
3158                                 initialize_clock_timewarp();
3159
3160                         clock_apply_epoch(/* allow_backwards= */ !skip_setup);
3161
3162                         /* Set the default for later on, but don't actually open the logs like this for
3163                          * now. Note that if we are transitioning from the initrd there might still be
3164                          * journal fd open, and we shouldn't attempt opening that before we parsed
3165                          * /proc/cmdline which might redirect output elsewhere. */
3166                         log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
3167
3168                 } else {
3169                         /* Running inside a container, as PID 1 */
3170                         log_set_target_and_open(LOG_TARGET_CONSOLE);
3171
3172                         /* For later on, see above... */
3173                         log_set_target(LOG_TARGET_JOURNAL);
3174
3175                         /* clear the kernel timestamp, because we are in a container */
3176                         kernel_timestamp = DUAL_TIMESTAMP_NULL;
3177                 }
3178
3179                 initialize_coredump(skip_setup);
3180
3181                 r = fixup_environment();
3182                 if (r < 0) {
3183                         log_struct_errno(LOG_EMERG, r,
3184                                          LOG_MESSAGE("Failed to fix up PID 1 environment: %m"),
3185                                          LOG_MESSAGE_ID(SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR));
3186                         error_message = "Failed to fix up PID1 environment";
3187                         goto finish;
3188                 }
3189
3190                 /* Try to figure out if we can use colors with the console. No need to do that for user
3191                  * instances since they never log into the console. */
3192                 log_show_color(colors_enabled());
3193
3194                 r = make_null_stdio();
3195                 if (r < 0)
3196                         log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
3197
3198                 /* Load the kernel modules early. */
3199                 if (!skip_setup)
3200                         (void) kmod_setup();
3201
3202                 /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
3203                 r = mount_setup(loaded_policy, skip_setup);
3204                 if (r < 0) {
3205                         error_message = "Failed to mount API filesystems";
3206                         goto finish;
3207                 }
3208
3209                 /* The efivarfs is now mounted, let's lock down the system token. */
3210                 lock_down_efi_variables();
3211         } else {
3212                 /* Running as user instance */
3213                 arg_runtime_scope = RUNTIME_SCOPE_USER;
3214                 log_set_always_reopen_console(true);
3215                 log_set_target_and_open(LOG_TARGET_AUTO);
3216
3217                 /* clear the kernel timestamp, because we are not PID 1 */
3218                 kernel_timestamp = DUAL_TIMESTAMP_NULL;
3219
3220                 r = mac_init();
3221                 if (r < 0) {
3222                         error_message = "Failed to initialize MAC support";
3223                         goto finish;
3224                 }
3225         }
3226
3227         /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
3228          * transitioning from the initrd to the main systemd or suchlike. */
3229         save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
3230
3231         /* Reset all signal handlers. */
3232         (void) reset_all_signal_handlers();
3233         (void) ignore_signals(SIGNALS_IGNORE);
3234
3235         (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
3236
3237         r = parse_argv(argc, argv);
3238         if (r < 0) {
3239                 error_message = "Failed to parse command line arguments";
3240                 goto finish;
3241         }
3242
3243         r = safety_checks();
3244         if (r < 0)
3245                 goto finish;
3246
3247         if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
3248                 pager_open(arg_pager_flags);
3249
3250         if (arg_action != ACTION_RUN)
3251                 skip_setup = true;
3252
3253         if (arg_action == ACTION_HELP) {
3254                 retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
3255                 goto finish;
3256         } else if (arg_action == ACTION_VERSION) {
3257                 retval = version();
3258                 goto finish;
3259         } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
3260                 unit_dump_config_items(stdout);
3261                 retval = EXIT_SUCCESS;
3262                 goto finish;
3263         } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
3264                 dump_bus_properties(stdout);
3265                 retval = EXIT_SUCCESS;
3266                 goto finish;
3267         } else if (arg_action == ACTION_BUS_INTROSPECT) {
3268                 r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
3269                 retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
3270                 goto finish;
3271         }
3272
3273         assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
3274
3275         /* Move out of the way, so that we won't block unmounts */
3276         assert_se(chdir("/") == 0);
3277
3278         if (arg_action == ACTION_RUN) {
3279                 if (!skip_setup) {
3280                         /* Apply the systemd.clock_usec= kernel command line switch */
3281                         apply_clock_update();
3282
3283                         /* Apply random seed from kernel command line */
3284                         cmdline_take_random_seed();
3285                 }
3286
3287                 /* A core pattern might have been specified via the cmdline. */
3288                 initialize_core_pattern(skip_setup);
3289
3290                 /* Make /usr/ read-only */
3291                 apply_protect_system(skip_setup);
3292
3293                 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
3294                 log_close();
3295
3296                 /* Remember open file descriptors for later deserialization */
3297                 r = collect_fds(&fds, &error_message);
3298                 if (r < 0)
3299                         goto finish;
3300
3301                 /* Give up any control of the console, but make sure it's initialized. */
3302                 setup_console_terminal(skip_setup);
3303
3304                 /* Open the logging devices, if possible and necessary */
3305                 log_open();
3306         }
3307
3308         log_execution_mode(&first_boot);
3309
3310         r = cg_has_legacy();
3311         if (r < 0) {
3312                 error_message = "Failed to check cgroup hierarchy";
3313                 goto finish;
3314         }
3315         if (r > 0) {
3316                 r = log_full_errno(LOG_EMERG, SYNTHETIC_ERRNO(EPROTO),
3317                                    "Detected cgroup v1 hierarchy at /sys/fs/cgroup/, which is no longer supported by current version of systemd.\n"
3318                                    "Please instruct your initrd to mount cgroup v2 (unified) hierarchy,\n"
3319                                    "possibly by removing any stale kernel command line options, such as:\n"
3320                                    "  systemd.legacy_systemd_cgroup_controller=1\n"
3321                                    "  systemd.unified_cgroup_hierarchy=0");
3322
3323                 error_message = "Detected unsupported legacy cgroup hierarchy, refusing execution";
3324                 goto finish;
3325         }
3326
3327         r = initialize_runtime(skip_setup,
3328                                first_boot,
3329                                &saved_rlimit_nofile,
3330                                &saved_rlimit_memlock,
3331                                &saved_ambient_set,
3332                                &error_message);
3333         if (r < 0)
3334                 goto finish;
3335
3336         r = manager_new(arg_runtime_scope,
3337                         arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
3338                         &m);
3339         if (r < 0) {
3340                 log_struct_errno(LOG_EMERG, r,
3341                                  LOG_MESSAGE("Failed to allocate manager object: %m"),
3342                                  LOG_MESSAGE_ID(SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR));
3343                 error_message = "Failed to allocate manager object";
3344                 goto finish;
3345         }
3346
3347         m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
3348         m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
3349         m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
3350         m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
3351         m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
3352
3353         m->saved_ambient_set = saved_ambient_set;
3354
3355         set_manager_defaults(m);
3356         set_manager_settings(m);
3357         manager_set_first_boot(m, first_boot);
3358         manager_set_switching_root(m, arg_switched_root);
3359
3360         /* Remember whether we should queue the default job */
3361         queue_default_job = !arg_serialization || arg_switched_root;
3362
3363         before_startup = now(CLOCK_MONOTONIC);
3364
3365         r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
3366         if (r < 0) {
3367                 error_message = "Failed to start up manager";
3368                 goto finish;
3369         }
3370
3371         /* This will close all file descriptors that were opened, but not claimed by any unit. */
3372         fds = fdset_free(fds);
3373         arg_serialization = safe_fclose(arg_serialization);
3374
3375         if (queue_default_job) {
3376                 r = do_queue_default_job(m, &error_message);
3377                 if (r < 0)
3378                         goto finish;
3379         }
3380
3381         after_startup = now(CLOCK_MONOTONIC);
3382
3383         log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
3384                  "Loaded units and determined initial transaction in %s.",
3385                  FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC));
3386
3387         if (arg_action == ACTION_TEST) {
3388                 manager_test_summary(m);
3389                 retval = EXIT_SUCCESS;
3390                 goto finish;
3391         }
3392
3393         r = invoke_main_loop(m,
3394                              &saved_rlimit_nofile,
3395                              &saved_rlimit_memlock,
3396                              &retval,
3397                              &fds,
3398                              &switch_root_dir,
3399                              &switch_root_init,
3400                              &error_message);
3401         /* MANAGER_OK and MANAGER_RELOAD are not expected here. */
3402         assert(r < 0 || IN_SET(r, MANAGER_REEXECUTE, MANAGER_EXIT) ||
3403                (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
3404                 IN_SET(r, MANAGER_REBOOT,
3405                           MANAGER_SOFT_REBOOT,
3406                           MANAGER_POWEROFF,
3407                           MANAGER_HALT,
3408                           MANAGER_KEXEC,
3409                           MANAGER_SWITCH_ROOT)));
3410
3411 finish:
3412         pager_close();
3413
3414         if (m) {
3415                 arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
3416                 arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
3417                 m = manager_free(m);
3418         }
3419
3420         mac_selinux_finish();
3421
3422         if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
3423                 r = do_reexecute(r,
3424                                  argc, argv,
3425                                  &saved_rlimit_nofile,
3426                                  &saved_rlimit_memlock,
3427                                  fds,
3428                                  switch_root_dir,
3429                                  switch_root_init,
3430                                  saved_ambient_set,
3431                                  &error_message); /* This only returns if reexecution failed */
3432
3433         arg_serialization = safe_fclose(arg_serialization);
3434         fds = fdset_free(fds);
3435
3436         saved_env = strv_free(saved_env);
3437
3438 #if HAVE_VALGRIND_VALGRIND_H
3439         /* If we are PID 1 and running under valgrind, then let's exit
3440          * here explicitly. valgrind will only generate nice output on
3441          * exit(), not on exec(), hence let's do the former not the
3442          * latter here. */
3443         if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
3444                 /* Cleanup watchdog_device strings for valgrind. We need them
3445                  * in become_shutdown() so normally we cannot free them yet. */
3446                 watchdog_free_device();
3447                 reset_arguments();
3448                 return retval;
3449         }
3450 #endif
3451
3452 #if HAS_FEATURE_ADDRESS_SANITIZER
3453         /* At this stage we most likely don't have stdio/stderr open, so the following
3454          * LSan check would not print any actionable information and would just crash
3455          * PID 1. To make this a bit more helpful, let's try to open /dev/console,
3456          * and if we succeed redirect LSan's report there. */
3457         if (getpid_cached() == 1) {
3458                 _cleanup_close_ int tty_fd = -EBADF;
3459
3460                 tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
3461                 if (tty_fd >= 0)
3462                         __sanitizer_set_report_fd((void*) (intptr_t) tty_fd);
3463
3464                 __lsan_do_leak_check();
3465         }
3466 #endif
3467
3468         if (r < 0)
3469                 (void) sd_notifyf(/* unset_environment= */ false,
3470                                   "ERRNO=%i", -r);
3471
3472         /* Try to invoke the shutdown binary unless we already failed.
3473          * If we failed above, we want to freeze after finishing cleanup. */
3474         if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
3475             IN_SET(r, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)) {
3476                 r = become_shutdown(r, retval);
3477                 log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
3478                 error_message = "Failed to execute shutdown binary";
3479         }
3480
3481         /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with
3482          * a mechanism to pick up systemd's exit status in the VM. */
3483         (void) sd_notifyf(/* unset_environment= */ false,
3484                           "EXIT_STATUS=%i", retval);
3485
3486         watchdog_free_device();
3487         arg_watchdog_device = mfree(arg_watchdog_device);
3488
3489         if (getpid_cached() == 1) {
3490                 if (error_message)
3491                         manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
3492                                               ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
3493                                               "%s.", error_message);
3494                 freeze_or_exit_or_reboot();
3495         }
3496
3497         reset_arguments();
3498         return retval;
3499 }