]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/main.c
cab8e0094118e3e598e5cf1d53b1dc88d16cd1b0
[thirdparty/systemd.git] / src / core / main.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <fcntl.h>
4 #include <getopt.h>
5 #include <linux/oom.h>
6 #include <linux/vt.h>
7 #include <stdlib.h>
8 #include <sys/mount.h>
9 #include <sys/prctl.h>
10 #include <sys/utsname.h>
11 #include <unistd.h>
12
13 #if HAVE_VALGRIND_VALGRIND_H
14 # include <valgrind/valgrind.h>
15 #endif
16
17 #include "sd-bus.h"
18 #include "sd-daemon.h"
19 #include "sd-messages.h"
20
21 #include "alloc-util.h"
22 #include "apparmor-setup.h"
23 #include "architecture.h"
24 #include "argv-util.h"
25 #include "build.h"
26 #include "bus-error.h"
27 #include "capability-util.h"
28 #include "cgroup-setup.h"
29 #include "chase.h"
30 #include "clock-util.h"
31 #include "clock-warp.h"
32 #include "conf-parser.h"
33 #include "confidential-virt.h"
34 #include "constants.h"
35 #include "copy.h"
36 #include "coredump-util.h"
37 #include "cpu-set-util.h"
38 #include "crash-handler.h"
39 #include "dbus.h"
40 #include "dbus-manager.h"
41 #include "dev-setup.h"
42 #include "efi-random.h"
43 #include "emergency-action.h"
44 #include "env-util.h"
45 #include "escape.h"
46 #include "fd-util.h"
47 #include "fdset.h"
48 #include "fileio.h"
49 #include "format-util.h"
50 #include "getopt-defs.h"
51 #include "hexdecoct.h"
52 #include "hostname-setup.h"
53 #include "id128-util.h"
54 #include "ima-setup.h"
55 #include "import-creds.h"
56 #include "initrd-util.h"
57 #include "io-util.h"
58 #include "ipe-setup.h"
59 #include "killall.h"
60 #include "kmod-setup.h"
61 #include "label-util.h"
62 #include "libmount-util.h"
63 #include "limits-util.h"
64 #include "load-fragment.h"
65 #include "log.h"
66 #include "loopback-setup.h"
67 #include "machine-id-setup.h"
68 #include "main.h"
69 #include "manager.h"
70 #include "manager-dump.h"
71 #include "manager-serialize.h"
72 #include "mkdir-label.h"
73 #include "mount-setup.h"
74 #include "mount-util.h"
75 #include "os-util.h"
76 #include "osc-context.h"
77 #include "pager.h"
78 #include "parse-argument.h"
79 #include "parse-util.h"
80 #include "path-util.h"
81 #include "pretty-print.h"
82 #include "proc-cmdline.h"
83 #include "process-util.h"
84 #include "random-util.h"
85 #include "rlimit-util.h"
86 #include "rm-rf.h"
87 #include "seccomp-util.h"
88 #include "selinux-setup.h"
89 #include "selinux-util.h"
90 #include "serialize.h"
91 #include "set.h"
92 #include "signal-util.h"
93 #include "smack-setup.h"
94 #include "special.h"
95 #include "stat-util.h"
96 #include "stdio-util.h"
97 #include "strv.h"
98 #include "switch-root.h"
99 #include "sysctl-util.h"
100 #include "terminal-util.h"
101 #include "time-util.h"
102 #include "umask-util.h"
103 #include "unit-name.h"
104 #include "user-util.h"
105 #include "version.h"
106 #include "virt.h"
107 #include "watchdog.h"
108
109 #if HAS_FEATURE_ADDRESS_SANITIZER
110 #include <sanitizer/lsan_interface.h>
111 #endif
112
/* Operation selected for this invocation. ACTION_RUN is both the explicit initializer and the first
 * enumerator. */
static enum {
        ACTION_RUN = 0,
        ACTION_HELP,
        ACTION_VERSION,
        ACTION_TEST,
        ACTION_DUMP_CONFIGURATION_ITEMS,
        ACTION_DUMP_BUS_PROPERTIES,
        ACTION_BUS_INTROSPECT,
} arg_action = ACTION_RUN;

/* String argument that accompanies ACTION_BUS_INTROSPECT (judging by the name; the code that sets it is
 * not in this part of the file). */
static const char *arg_bus_introspect = NULL;
124
125 /* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
126 * defaults are assigned in reset_arguments() below. */
127 static char *arg_default_unit;
128 static RuntimeScope arg_runtime_scope;
129 bool arg_dump_core;
130 int arg_crash_chvt;
131 bool arg_crash_shell;
132 CrashAction arg_crash_action;
133 static char *arg_confirm_spawn;
134 static ShowStatus arg_show_status;
135 static StatusUnitFormat arg_status_unit_format;
136 static bool arg_switched_root;
137 static PagerFlags arg_pager_flags;
138 static bool arg_service_watchdogs;
139 static UnitDefaults arg_defaults;
140 static usec_t arg_runtime_watchdog;
141 static usec_t arg_reboot_watchdog;
142 static usec_t arg_kexec_watchdog;
143 static usec_t arg_pretimeout_watchdog;
144 static char *arg_early_core_pattern;
145 static char *arg_watchdog_pretimeout_governor;
146 static char *arg_watchdog_device;
147 static char **arg_default_environment;
148 static char **arg_manager_environment;
149 static uint64_t arg_capability_bounding_set;
150 static bool arg_no_new_privs;
151 static int arg_protect_system;
152 static nsec_t arg_timer_slack_nsec;
153 static Set* arg_syscall_archs;
154 static FILE* arg_serialization;
155 static sd_id128_t arg_machine_id;
156 static bool arg_machine_id_from_firmware = false;
157 static EmergencyAction arg_cad_burst_action;
158 static CPUSet arg_cpu_affinity;
159 static NUMAPolicy arg_numa_policy;
160 static usec_t arg_clock_usec;
161 static void *arg_random_seed;
162 static size_t arg_random_seed_size;
163 static usec_t arg_reload_limit_interval_sec;
164 static unsigned arg_reload_limit_burst;
165
/* The environment block as it was originally handed to us, kept as a copy */
static char **saved_env = NULL;

/* Forward declaration; the definition is further down in this file. */
static int parse_configuration(
                const struct rlimit *saved_rlimit_nofile,
                const struct rlimit *saved_rlimit_memlock);

/* Instantiates config_parse_crash_action() for CrashAction values, with CRASH_FREEZE as the default. */
static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(
                config_parse_crash_action,
                crash_action,
                CrashAction,
                CRASH_FREEZE);
173
174 static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
175 _cleanup_free_ char *base = NULL;
176 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
177 int r;
178
179 r = xdg_user_config_dir("/systemd", &base);
180 if (r < 0)
181 return r;
182
183 r = strv_extendf(&files, "%s/user.conf", base);
184 if (r < 0)
185 return r;
186
187 r = strv_extend(&files, PKGSYSCONFDIR "/user.conf");
188 if (r < 0)
189 return r;
190
191 r = strv_consume(&dirs, TAKE_PTR(base));
192 if (r < 0)
193 return r;
194
195 r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false);
196 if (r < 0)
197 return r;
198
199 *ret_files = TAKE_PTR(files);
200 *ret_dirs = TAKE_PTR(dirs);
201 return 0;
202 }
203
/* Queries the window size of the terminal behind tty_fd and exports it as $COLUMNS and $LINES.
 *
 * Returns 1 if both variables were set. On any failure, or if the terminal reports neither a column nor
 * a row count, both variables are removed from the environment and 0 is returned. Failures are logged at
 * debug level only. */
static int save_console_winsize_in_environment(int tty_fd) {
        int r;

        assert(tty_fd >= 0);

        struct winsize ws = {};
        if (ioctl(tty_fd, TIOCGWINSZ, &ws) < 0) {
                /* Include the errno text, so that the reason for the failure is visible in the log. */
                log_debug_errno(errno, "Failed to acquire console window size, ignoring: %m");
                goto unset;
        }

        /* Only treat the size as unset if both dimensions are missing. */
        if (ws.ws_col <= 0 && ws.ws_row <= 0) {
                log_debug("No console window size set, ignoring.");
                goto unset;
        }

        r = setenvf("COLUMNS", /* overwrite= */ true, "%u", ws.ws_col);
        if (r < 0) {
                log_debug_errno(r, "Failed to set $COLUMNS, ignoring: %m");
                goto unset;
        }

        r = setenvf("LINES", /* overwrite= */ true, "%u", ws.ws_row);
        if (r < 0) {
                log_debug_errno(r, "Failed to set $LINES, ignoring: %m");
                goto unset;
        }

        log_debug("Recorded console dimensions in environment: $COLUMNS=%u $LINES=%u.", ws.ws_col, ws.ws_row);
        return 1;

unset:
        /* Never leave a half-updated pair behind: drop both variables. */
        (void) unsetenv("COLUMNS");
        (void) unsetenv("LINES");
        return 0;
}
240
241 static int console_setup(void) {
242
243 if (getpid_cached() != 1)
244 return 0;
245
246 _cleanup_close_ int tty_fd = -EBADF;
247
248 tty_fd = open_terminal("/dev/console", O_RDWR|O_NOCTTY|O_CLOEXEC);
249 if (tty_fd < 0)
250 return log_error_errno(tty_fd, "Failed to open %s: %m", "/dev/console");
251
252 /* We don't want to force text mode. Plymouth may be showing pictures already from initrd. */
253 reset_dev_console_fd(tty_fd, /* switch_to_text= */ false);
254
255 save_console_winsize_in_environment(tty_fd);
256
257 return 0;
258 }
259
260 static int parse_timeout(const char *value, usec_t *ret) {
261 int r = 0;
262
263 assert(value);
264 assert(ret);
265
266 if (streq(value, "default"))
267 *ret = USEC_INFINITY;
268 else if (streq(value, "off"))
269 *ret = 0;
270 else
271 r = parse_sec(value, ret);
272
273 return r;
274 }
275
276 static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
277 int r;
278
279 assert(key);
280
281 if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
282
283 if (proc_cmdline_value_missing(key, value))
284 return 0;
285
286 if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
287 log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
288 else if (in_initrd() == !!startswith(key, "rd."))
289 return free_and_strdup_warn(&arg_default_unit, value);
290
291 } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
292
293 r = value ? parse_boolean(value) : true;
294 if (r < 0)
295 log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
296 else
297 arg_dump_core = r;
298
299 } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
300
301 if (proc_cmdline_value_missing(key, value))
302 return 0;
303
304 if (path_is_absolute(value))
305 (void) parse_path_argument(value, false, &arg_early_core_pattern);
306 else
307 log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
308
309 } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
310
311 if (!value)
312 arg_crash_chvt = 0; /* turn on */
313 else {
314 r = parse_crash_chvt(value, &arg_crash_chvt);
315 if (r < 0)
316 log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
317 }
318
319 } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
320
321 r = value ? parse_boolean(value) : true;
322 if (r < 0)
323 log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
324 else
325 arg_crash_shell = r;
326
327 } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
328
329 r = value ? parse_boolean(value) : true;
330 if (r < 0)
331 log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
332 else
333 arg_crash_action = r ? CRASH_REBOOT : CRASH_FREEZE;
334
335 } else if (proc_cmdline_key_streq(key, "systemd.crash_action")) {
336
337 if (proc_cmdline_value_missing(key, value))
338 return 0;
339
340 r = crash_action_from_string(value);
341 if (r < 0)
342 log_warning_errno(r, "Failed to parse crash action switch %s, ignoring: %m", value);
343 else
344 arg_crash_action = r;
345
346 } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
347 char *s;
348
349 r = parse_confirm_spawn(value, &s);
350 if (r < 0)
351 log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
352 else
353 free_and_replace(arg_confirm_spawn, s);
354
355 } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
356
357 r = value ? parse_boolean(value) : true;
358 if (r < 0)
359 log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
360 else
361 arg_service_watchdogs = r;
362
363 } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
364
365 if (value) {
366 r = parse_show_status(value, &arg_show_status);
367 if (r < 0)
368 log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
369 } else
370 arg_show_status = SHOW_STATUS_YES;
371
372 } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
373
374 if (proc_cmdline_value_missing(key, value))
375 return 0;
376
377 r = status_unit_format_from_string(value);
378 if (r < 0)
379 log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
380 else
381 arg_status_unit_format = r;
382
383 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
384
385 if (proc_cmdline_value_missing(key, value))
386 return 0;
387
388 r = exec_output_from_string(value);
389 if (r < 0)
390 log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
391 else
392 arg_defaults.std_output = r;
393
394 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
395
396 if (proc_cmdline_value_missing(key, value))
397 return 0;
398
399 r = exec_output_from_string(value);
400 if (r < 0)
401 log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
402 else
403 arg_defaults.std_error = r;
404
405 } else if (streq(key, "systemd.setenv")) {
406
407 if (proc_cmdline_value_missing(key, value))
408 return 0;
409
410 if (!env_assignment_is_valid(value))
411 log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
412 else {
413 r = strv_env_replace_strdup(&arg_default_environment, value);
414 if (r < 0)
415 return log_oom();
416 }
417
418 } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
419
420 if (proc_cmdline_value_missing(key, value))
421 return 0;
422
423 if (streq(value, "firmware"))
424 arg_machine_id_from_firmware = true;
425 else {
426 r = id128_from_string_nonzero(value, &arg_machine_id);
427 if (r < 0)
428 log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
429 else
430 arg_machine_id_from_firmware = false;
431 }
432 } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
433
434 if (proc_cmdline_value_missing(key, value))
435 return 0;
436
437 r = parse_sec(value, &arg_defaults.timeout_start_usec);
438 if (r < 0)
439 log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
440
441 if (arg_defaults.timeout_start_usec <= 0)
442 arg_defaults.timeout_start_usec = USEC_INFINITY;
443
444 } else if (proc_cmdline_key_streq(key, "systemd.default_device_timeout_sec")) {
445
446 if (proc_cmdline_value_missing(key, value))
447 return 0;
448
449 r = parse_sec(value, &arg_defaults.device_timeout_usec);
450 if (r < 0)
451 log_warning_errno(r, "Failed to parse default device timeout '%s', ignoring: %m", value);
452
453 if (arg_defaults.device_timeout_usec <= 0)
454 arg_defaults.device_timeout_usec = USEC_INFINITY;
455
456 } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
457
458 if (proc_cmdline_value_missing(key, value))
459 return 0;
460
461 r = parse_cpu_set(value, &arg_cpu_affinity);
462 if (r < 0)
463 log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
464
465 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
466
467 if (proc_cmdline_value_missing(key, value))
468 return 0;
469
470 (void) parse_path_argument(value, false, &arg_watchdog_device);
471
472 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) {
473
474 if (proc_cmdline_value_missing(key, value))
475 return 0;
476
477 r = parse_timeout(value, &arg_runtime_watchdog);
478 if (r < 0) {
479 log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value);
480 return 0;
481 }
482
483 arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog;
484
485 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) {
486
487 if (proc_cmdline_value_missing(key, value))
488 return 0;
489
490 r = parse_timeout(value, &arg_pretimeout_watchdog);
491 if (r < 0) {
492 log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value);
493 return 0;
494 }
495
496 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) {
497
498 if (proc_cmdline_value_missing(key, value) || isempty(value)) {
499 arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
500 return 0;
501 }
502
503 if (!string_is_safe(value)) {
504 log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value);
505 return 0;
506 }
507
508 return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value);
509
510 } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
511
512 if (proc_cmdline_value_missing(key, value))
513 return 0;
514
515 r = safe_atou64(value, &arg_clock_usec);
516 if (r < 0)
517 log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
518
519 } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
520 void *p;
521 size_t sz;
522
523 if (proc_cmdline_value_missing(key, value))
524 return 0;
525
526 r = unbase64mem(value, &p, &sz);
527 if (r < 0)
528 log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
529
530 free(arg_random_seed);
531 arg_random_seed = sz > 0 ? p : mfree(p);
532 arg_random_seed_size = sz;
533
534 } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_interval_sec")) {
535
536 if (proc_cmdline_value_missing(key, value))
537 return 0;
538
539 r = parse_sec(value, &arg_reload_limit_interval_sec);
540 if (r < 0) {
541 log_warning_errno(r, "Failed to parse systemd.reload_limit_interval_sec= argument '%s', ignoring: %m", value);
542 return 0;
543 }
544
545 } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_burst")) {
546
547 if (proc_cmdline_value_missing(key, value))
548 return 0;
549
550 r = safe_atou(value, &arg_reload_limit_burst);
551 if (r < 0) {
552 log_warning_errno(r, "Failed to parse systemd.reload_limit_burst= argument '%s', ignoring: %m", value);
553 return 0;
554 }
555
556 } else if (streq(key, "quiet") && !value) {
557
558 if (arg_show_status == _SHOW_STATUS_INVALID)
559 arg_show_status = SHOW_STATUS_ERROR;
560
561 } else if (streq(key, "debug") && !value) {
562
563 /* Note that log_parse_environment() handles 'debug'
564 * too, and sets the log level to LOG_DEBUG. */
565
566 if (detect_container() > 0)
567 log_set_target(LOG_TARGET_CONSOLE);
568
569 } else if (!value) {
570 const char *target;
571
572 /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
573 target = runlevel_to_target(key);
574 if (target)
575 return free_and_strdup_warn(&arg_default_unit, target);
576 }
577
578 return 0;
579 }
580
/* Defines a config parser callback called 'name' that passes the right-hand side of an assignment to
 * func(). If func() returns a negative value a syntax error mentioning 'descr' is logged. The generated
 * callback always returns 0, i.e. invalid values are never fatal. */
#define DEFINE_SETTER(name, func, descr)                                \
        static int name(const char *unit,                               \
                        const char *filename,                           \
                        unsigned line,                                  \
                        const char *section,                            \
                        unsigned section_line,                          \
                        const char *lvalue,                             \
                        int ltype,                                      \
                        const char *rvalue,                             \
                        void *data,                                     \
                        void *userdata) {                               \
                                                                        \
                int r;                                                  \
                                                                        \
                assert(filename);                                       \
                assert(lvalue);                                         \
                assert(rvalue);                                         \
                                                                        \
                r = func(rvalue);                                       \
                if (r < 0)                                              \
                        log_syntax(unit, LOG_ERR, filename, line, r,    \
                                   "Invalid " descr " '%s': %m",        \
                                   rvalue);                             \
                                                                        \
                return 0;                                               \
        }

/* Parsers for the logging related settings */
DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
613
614 static int config_parse_default_timeout_abort(
615 const char *unit,
616 const char *filename,
617 unsigned line,
618 const char *section,
619 unsigned section_line,
620 const char *lvalue,
621 int ltype,
622 const char *rvalue,
623 void *data,
624 void *userdata) {
625 int r;
626
627 r = config_parse_timeout_abort(
628 unit,
629 filename,
630 line,
631 section,
632 section_line,
633 lvalue,
634 ltype,
635 rvalue,
636 &arg_defaults.timeout_abort_usec,
637 userdata);
638 if (r >= 0)
639 arg_defaults.timeout_abort_set = r;
640 return 0;
641 }
642
643 static int config_parse_oom_score_adjust(
644 const char *unit,
645 const char *filename,
646 unsigned line,
647 const char *section,
648 unsigned section_line,
649 const char *lvalue,
650 int ltype,
651 const char *rvalue,
652 void *data,
653 void *userdata) {
654
655 int oa, r;
656
657 if (isempty(rvalue)) {
658 arg_defaults.oom_score_adjust_set = false;
659 return 0;
660 }
661
662 r = parse_oom_score_adjust(rvalue, &oa);
663 if (r < 0)
664 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
665
666 arg_defaults.oom_score_adjust = oa;
667 arg_defaults.oom_score_adjust_set = true;
668
669 return 0;
670 }
671
/* Config parser callback for a tri-state value. 'data' must point to an int, which is set to -1 for an
 * empty value or "auto", and to the result of parse_boolean() otherwise. Unparsable values are reported
 * via log_syntax_parse_error() and leave the int untouched. */
static int config_parse_protect_system_pid1(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        int *setting = ASSERT_PTR(data);
        int b;

        /* Modelled after the per-service ProtectSystem= setting, though more restricted in one respect
         * and more automatic in another: only yes/no is supported for now (neither "strict" nor
         * "full"), and the protection is enabled automatically in the initrd unless configured
         * otherwise.
         *
         * This might be extended later to match the per-service ProtectSystem= more closely, but that
         * is not trivial because of ordering constraints: besides /usr/ not much is mounted yet at the
         * time this logic is enabled. */

        if (isempty(rvalue) || streq(rvalue, "auto"))
                b = -1;
        else {
                b = parse_boolean(rvalue);
                if (b < 0)
                        return log_syntax_parse_error(unit, filename, line, b, lvalue, rvalue);
        }

        *setting = b;
        return 0;
}
706
707 static int config_parse_crash_reboot(
708 const char *unit,
709 const char *filename,
710 unsigned line,
711 const char *section,
712 unsigned section_line,
713 const char *lvalue,
714 int ltype,
715 const char *rvalue,
716 void *data,
717 void *userdata) {
718
719 CrashAction *v = ASSERT_PTR(data);
720 int r;
721
722 if (isempty(rvalue)) {
723 *v = CRASH_REBOOT;
724 return 0;
725 }
726
727 r = parse_boolean(rvalue);
728 if (r < 0)
729 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
730
731 *v = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
732 return 0;
733 }
734
735 static int parse_config_file(void) {
736 const ConfigTableItem items[] = {
737 { "Manager", "LogLevel", config_parse_level2, 0, NULL },
738 { "Manager", "LogTarget", config_parse_target, 0, NULL },
739 { "Manager", "LogColor", config_parse_color, 0, NULL },
740 { "Manager", "LogLocation", config_parse_location, 0, NULL },
741 { "Manager", "LogTime", config_parse_time, 0, NULL },
742 { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
743 { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
744 { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
745 { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
746 { "Manager", "CrashReboot", config_parse_crash_reboot, 0, &arg_crash_action },
747 { "Manager", "CrashAction", config_parse_crash_action, 0, &arg_crash_action },
748 { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
749 { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
750 { "Manager", "CPUAffinity", config_parse_cpu_set, 0, &arg_cpu_affinity },
751 { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
752 { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy.nodes },
753 { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_LEGACY, NULL },
754 { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog },
755 { "Manager", "RuntimeWatchdogPreSec", config_parse_watchdog_sec, 0, &arg_pretimeout_watchdog },
756 { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog },
757 { "Manager", "ShutdownWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
758 { "Manager", "KExecWatchdogSec", config_parse_watchdog_sec, 0, &arg_kexec_watchdog },
759 { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
760 { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
761 { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
762 { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
763 { "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system },
764 #if HAVE_SECCOMP
765 { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
766 #else
767 { "Manager", "SystemCallArchitectures", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
768
769 #endif
770 { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
771 { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_defaults.timer_accuracy_usec },
772 { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_defaults.std_output },
773 { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_defaults.std_error },
774 { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_defaults.timeout_start_usec },
775 { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_defaults.timeout_stop_usec },
776 { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
777 { "Manager", "DefaultDeviceTimeoutSec", config_parse_sec, 0, &arg_defaults.device_timeout_usec },
778 { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_defaults.restart_usec },
779 { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_defaults.start_limit.interval}, /* obsolete alias */
780 { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_defaults.start_limit.interval},
781 { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_defaults.start_limit.burst },
782 { "Manager", "DefaultRestrictSUIDSGID", config_parse_bool, 0, &arg_defaults.restrict_suid_sgid },
783 { "Manager", "DefaultEnvironment", config_parse_environ, arg_runtime_scope, &arg_default_environment },
784 { "Manager", "ManagerEnvironment", config_parse_environ, arg_runtime_scope, &arg_manager_environment },
785 { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_defaults.rlimit },
786 { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_defaults.rlimit },
787 { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_defaults.rlimit },
788 { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_defaults.rlimit },
789 { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_defaults.rlimit },
790 { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_defaults.rlimit },
791 { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_defaults.rlimit },
792 { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_defaults.rlimit },
793 { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_defaults.rlimit },
794 { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_defaults.rlimit },
795 { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_defaults.rlimit },
796 { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_defaults.rlimit },
797 { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_defaults.rlimit },
798 { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_defaults.rlimit },
799 { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_defaults.rlimit },
800 { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_defaults.rlimit },
801 { "Manager", "DefaultCPUAccounting", config_parse_warn_compat, DISABLED_LEGACY, NULL },
802 { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_defaults.io_accounting },
803 { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_defaults.ip_accounting },
804 { "Manager", "DefaultBlockIOAccounting", config_parse_warn_compat, DISABLED_LEGACY, NULL },
805 { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_defaults.memory_accounting },
806 { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_defaults.tasks_accounting },
807 { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_defaults.tasks_max },
808 { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_defaults.memory_pressure_threshold_usec },
809 { "Manager", "DefaultMemoryPressureWatch", config_parse_memory_pressure_watch, 0, &arg_defaults.memory_pressure_watch },
810 { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action },
811 { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy },
812 { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL },
813 { "Manager", "ReloadLimitIntervalSec", config_parse_sec, 0, &arg_reload_limit_interval_sec },
814 { "Manager", "ReloadLimitBurst", config_parse_unsigned, 0, &arg_reload_limit_burst },
815 #if ENABLE_SMACK
816 { "Manager", "DefaultSmackProcessLabel", config_parse_string, 0, &arg_defaults.smack_process_label },
817 #else
818 { "Manager", "DefaultSmackProcessLabel", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
819 #endif
820 {}
821 };
822
823 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
824 (void) config_parse_standard_file_with_dropins(
825 "systemd/system.conf",
826 "Manager\0",
827 config_item_table_lookup, items,
828 CONFIG_PARSE_WARN,
829 /* userdata= */ NULL);
830 else {
831 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
832 int r;
833
834 assert(arg_runtime_scope == RUNTIME_SCOPE_USER);
835
836 r = manager_find_user_config_paths(&files, &dirs);
837 if (r < 0)
838 return log_error_errno(r, "Failed to determine config file paths: %m");
839
840 (void) config_parse_many(
841 (const char* const*) files,
842 (const char* const*) dirs,
843 "user.conf.d",
844 /* root = */ NULL,
845 "Manager\0",
846 config_item_table_lookup, items,
847 CONFIG_PARSE_WARN,
848 NULL, NULL, NULL);
849 }
850
851 /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
852 * USEC_INFINITY like everywhere else. */
853 if (arg_defaults.timeout_start_usec <= 0)
854 arg_defaults.timeout_start_usec = USEC_INFINITY;
855 if (arg_defaults.timeout_stop_usec <= 0)
856 arg_defaults.timeout_stop_usec = USEC_INFINITY;
857
858 return 0;
859 }
860
861 static void set_manager_defaults(Manager *m) {
862 int r;
863
864 assert(m);
865
866 /* Propagates the various default unit property settings into the manager object, i.e. properties
867 * that do not affect the manager itself, but are just what newly allocated units will have set if
868 * they haven't set anything else. (Also see set_manager_settings() for the settings that affect the
869 * manager's own behaviour) */
870
871 r = manager_set_unit_defaults(m, &arg_defaults);
872 if (r < 0)
873 log_warning_errno(r, "Failed to set manager defaults, ignoring: %m");
874
875 r = manager_default_environment(m);
876 if (r < 0)
877 log_warning_errno(r, "Failed to set manager default environment, ignoring: %m");
878
879 r = manager_transient_environment_add(m, arg_default_environment);
880 if (r < 0)
881 log_warning_errno(r, "Failed to add to transient environment, ignoring: %m");
882 }
883
884 static void set_manager_settings(Manager *m) {
885 int r;
886
887 assert(m);
888
889 /* Propagates the various manager settings into the manager object, i.e. properties that
890 * affect the manager itself (as opposed to just being inherited into newly allocated
891 * units, see set_manager_defaults() above). */
892
893 m->confirm_spawn = arg_confirm_spawn;
894 m->service_watchdogs = arg_service_watchdogs;
895 m->cad_burst_action = arg_cad_burst_action;
896 /* Note that we don't do structured initialization here, otherwise it will reset the rate limit
897 * counter on every daemon-reload. */
898 m->reload_reexec_ratelimit.interval = arg_reload_limit_interval_sec;
899 m->reload_reexec_ratelimit.burst = arg_reload_limit_burst;
900
901 manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
902 manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
903 manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
904 manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog);
905 r = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor);
906 if (r < 0)
907 log_warning_errno(r, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor);
908
909 manager_set_show_status(m, arg_show_status, "command line");
910 m->status_unit_format = arg_status_unit_format;
911 }
912
913 static int parse_argv(int argc, char *argv[]) {
914 enum {
915 COMMON_GETOPT_ARGS,
916 SYSTEMD_GETOPT_ARGS,
917 };
918
919 static const struct option options[] = {
920 COMMON_GETOPT_OPTIONS,
921 SYSTEMD_GETOPT_OPTIONS,
922 {}
923 };
924
925 int c, r;
926 bool user_arg_seen = false;
927
928 assert(argc >= 1);
929 assert(argv);
930
931 if (getpid_cached() == 1)
932 opterr = 0;
933
934 while ((c = getopt_long(argc, argv, SYSTEMD_GETOPT_SHORT_OPTIONS, options, NULL)) >= 0)
935
936 switch (c) {
937
938 case ARG_LOG_LEVEL:
939 r = log_set_max_level_from_string(optarg);
940 if (r < 0)
941 return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
942
943 break;
944
945 case ARG_LOG_TARGET:
946 r = log_set_target_from_string(optarg);
947 if (r < 0)
948 return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
949
950 break;
951
952 case ARG_LOG_COLOR:
953
954 if (optarg) {
955 r = log_show_color_from_string(optarg);
956 if (r < 0)
957 return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
958 optarg);
959 } else
960 log_show_color(true);
961
962 break;
963
964 case ARG_LOG_LOCATION:
965 if (optarg) {
966 r = log_show_location_from_string(optarg);
967 if (r < 0)
968 return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
969 optarg);
970 } else
971 log_show_location(true);
972
973 break;
974
975 case ARG_LOG_TIME:
976
977 if (optarg) {
978 r = log_show_time_from_string(optarg);
979 if (r < 0)
980 return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
981 optarg);
982 } else
983 log_show_time(true);
984
985 break;
986
987 case ARG_DEFAULT_STD_OUTPUT:
988 r = exec_output_from_string(optarg);
989 if (r < 0)
990 return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
991 optarg);
992 arg_defaults.std_output = r;
993 break;
994
995 case ARG_DEFAULT_STD_ERROR:
996 r = exec_output_from_string(optarg);
997 if (r < 0)
998 return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
999 optarg);
1000 arg_defaults.std_error = r;
1001 break;
1002
1003 case ARG_UNIT:
1004 r = free_and_strdup(&arg_default_unit, optarg);
1005 if (r < 0)
1006 return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
1007
1008 break;
1009
1010 case ARG_SYSTEM:
1011 arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
1012 break;
1013
1014 case ARG_USER:
1015 arg_runtime_scope = RUNTIME_SCOPE_USER;
1016 user_arg_seen = true;
1017 break;
1018
1019 case ARG_TEST:
1020 arg_action = ACTION_TEST;
1021 break;
1022
1023 case ARG_NO_PAGER:
1024 arg_pager_flags |= PAGER_DISABLE;
1025 break;
1026
1027 case ARG_VERSION:
1028 arg_action = ACTION_VERSION;
1029 break;
1030
1031 case ARG_DUMP_CONFIGURATION_ITEMS:
1032 arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
1033 break;
1034
1035 case ARG_DUMP_BUS_PROPERTIES:
1036 arg_action = ACTION_DUMP_BUS_PROPERTIES;
1037 break;
1038
1039 case ARG_BUS_INTROSPECT:
1040 arg_bus_introspect = optarg;
1041 arg_action = ACTION_BUS_INTROSPECT;
1042 break;
1043
1044 case ARG_DUMP_CORE:
1045 r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
1046 if (r < 0)
1047 return r;
1048 break;
1049
1050 case ARG_CRASH_CHVT:
1051 r = parse_crash_chvt(optarg, &arg_crash_chvt);
1052 if (r < 0)
1053 return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
1054 optarg);
1055 break;
1056
1057 case ARG_CRASH_SHELL:
1058 r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
1059 if (r < 0)
1060 return r;
1061 break;
1062
1063 case ARG_CRASH_REBOOT:
1064 r = parse_boolean_argument("--crash-reboot", optarg, NULL);
1065 if (r < 0)
1066 return r;
1067 arg_crash_action = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
1068 break;
1069
1070 case ARG_CRASH_ACTION:
1071 r = crash_action_from_string(optarg);
1072 if (r < 0)
1073 return log_error_errno(r, "Failed to parse crash action \"%s\": %m", optarg);
1074 arg_crash_action = r;
1075 break;
1076
1077 case ARG_CONFIRM_SPAWN:
1078 arg_confirm_spawn = mfree(arg_confirm_spawn);
1079
1080 r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
1081 if (r < 0)
1082 return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
1083 optarg);
1084 break;
1085
1086 case ARG_SERVICE_WATCHDOGS:
1087 r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
1088 if (r < 0)
1089 return r;
1090 break;
1091
1092 case ARG_SHOW_STATUS:
1093 if (optarg) {
1094 r = parse_show_status(optarg, &arg_show_status);
1095 if (r < 0)
1096 return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
1097 optarg);
1098 } else
1099 arg_show_status = SHOW_STATUS_YES;
1100 break;
1101
1102 case ARG_DESERIALIZE: {
1103 int fd;
1104 FILE *f;
1105
1106 fd = parse_fd(optarg);
1107 if (fd < 0)
1108 return log_error_errno(fd, "Failed to parse serialization fd \"%s\": %m", optarg);
1109
1110 (void) fd_cloexec(fd, true);
1111
1112 f = fdopen(fd, "r");
1113 if (!f)
1114 return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
1115
1116 safe_fclose(arg_serialization);
1117 arg_serialization = f;
1118
1119 break;
1120 }
1121
1122 case ARG_SWITCHED_ROOT:
1123 arg_switched_root = true;
1124 break;
1125
1126 case ARG_MACHINE_ID:
1127 r = id128_from_string_nonzero(optarg, &arg_machine_id);
1128 if (r < 0)
1129 return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
1130 break;
1131
1132 case 'h':
1133 arg_action = ACTION_HELP;
1134 break;
1135
1136 case 'D':
1137 log_set_max_level(LOG_DEBUG);
1138 break;
1139
1140 case 'b':
1141 case 's':
1142 case 'z':
1143 /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
1144 * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
1145 */
1146 case '?':
1147 if (getpid_cached() != 1)
1148 return -EINVAL;
1149 else
1150 return 0;
1151
1152 default:
1153 assert_not_reached();
1154 }
1155
1156 if (optind < argc && getpid_cached() != 1)
1157 /* Hmm, when we aren't run as init system let's complain about excess arguments */
1158 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
1159
1160 if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_USER && !user_arg_seen)
1161 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1162 "Explicit --user argument required to run as user manager.");
1163
1164 return 0;
1165 }
1166
1167 static int help(void) {
1168 _cleanup_free_ char *link = NULL;
1169 int r;
1170
1171 r = terminal_urlify_man("systemd", "1", &link);
1172 if (r < 0)
1173 return log_oom();
1174
1175 printf("%s [OPTIONS...]\n\n"
1176 "%sStarts and monitors system and user services.%s\n\n"
1177 "This program takes no positional arguments.\n\n"
1178 "%sOptions%s:\n"
1179 " -h --help Show this help\n"
1180 " --version Show version\n"
1181 " --test Determine initial transaction, dump it and exit\n"
1182 " --system Combined with --test: operate in system mode\n"
1183 " --user Combined with --test: operate in user mode\n"
1184 " --dump-configuration-items Dump understood unit configuration items\n"
1185 " --dump-bus-properties Dump exposed bus properties\n"
1186 " --bus-introspect=PATH Write XML introspection data\n"
1187 " --unit=UNIT Set default unit\n"
1188 " --dump-core[=BOOL] Dump core on crash\n"
1189 " --crash-vt=NR Change to specified VT on crash\n"
1190 " --crash-action=ACTION Specify what to do on crash\n"
1191 " --crash-shell[=BOOL] Run shell on crash\n"
1192 " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
1193 " --show-status[=BOOL] Show status updates on the console during boot\n"
1194 " --log-target=TARGET Set log target (console, journal, kmsg,\n"
1195 " journal-or-kmsg, null)\n"
1196 " --log-level=LEVEL Set log level (debug, info, notice, warning,\n"
1197 " err, crit, alert, emerg)\n"
1198 " --log-color[=BOOL] Highlight important log messages\n"
1199 " --log-location[=BOOL] Include code location in log messages\n"
1200 " --log-time[=BOOL] Prefix log messages with current time\n"
1201 " --default-standard-output= Set default standard output for services\n"
1202 " --default-standard-error= Set default standard error output for services\n"
1203 " --no-pager Do not pipe output into a pager\n"
1204 "\nSee the %s for details.\n",
1205 program_invocation_short_name,
1206 ansi_highlight(),
1207 ansi_normal(),
1208 ansi_underline(),
1209 ansi_normal(),
1210 link);
1211
1212 return 0;
1213 }
1214
1215 static int prepare_reexecute(
1216 Manager *m,
1217 FILE **ret_f,
1218 FDSet **ret_fds,
1219 bool switching_root) {
1220
1221 _cleanup_fdset_free_ FDSet *fds = NULL;
1222 _cleanup_fclose_ FILE *f = NULL;
1223 int r;
1224
1225 assert(m);
1226 assert(ret_f);
1227 assert(ret_fds);
1228
1229 /* Make sure nothing is really destructed when we shut down */
1230 m->n_reloading++;
1231 bus_manager_send_reloading(m, true);
1232
1233 r = manager_open_serialization(m, &f);
1234 if (r < 0)
1235 return log_error_errno(r, "Failed to create serialization file: %m");
1236
1237 fds = fdset_new();
1238 if (!fds)
1239 return log_oom();
1240
1241 r = manager_serialize(m, f, fds, switching_root);
1242 if (r < 0)
1243 return r;
1244
1245 r = finish_serialization_file(f);
1246 if (r < 0)
1247 return log_error_errno(r, "Failed to finish serialization file: %m");
1248
1249 r = fd_cloexec(fileno(f), false);
1250 if (r < 0)
1251 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
1252
1253 r = fdset_cloexec(fds, false);
1254 if (r < 0)
1255 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
1256
1257 *ret_f = TAKE_PTR(f);
1258 *ret_fds = TAKE_PTR(fds);
1259
1260 return 0;
1261 }
1262
/* Raises the fs.file-max and fs.nr_open sysctls as far as possible, depending on which of the two
 * BUMP_PROC_SYS_FS_* build options are enabled. All failures are logged and otherwise ignored. */
static void bump_file_max_and_nr_open(void) {

        /* On current kernels large numbers of file descriptors are no longer a performance problem and
         * their memory is properly tracked by memcg, so counting and limiting them in two further layers
         * is unnecessary and just complicates things. This function hence turns off 2 of the 4 levels of
         * limits on file descriptors, and makes RLIMIT_NOFILE (soft + hard) the only ones that really
         * matter. */

#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
        int r;
#endif

#if BUMP_PROC_SYS_FS_FILE_MAX
        /* Since kernel 5.2 the maximum accepted here is LONG_MAX, hence use that. (Previously things
         * were different, but the operation would fail silently.) */
        r = sysctl_write("fs/file-max", LONG_MAX_STR);
        if (r < 0)
                log_full_errno(ERRNO_IS_NEG_FS_WRITE_REFUSED(r) ? LOG_DEBUG : LOG_WARNING, r,
                               "Failed to bump fs.file-max, ignoring: %m");
#endif

#if BUMP_PROC_SYS_FS_NR_OPEN
        /* The kernel enforces a maximum and a minimum on fs.nr_open, but does not expose them directly;
         * they are hardcoded in fs/file.c. Start out with the hardcoded maximum, and halve the value
         * every time the kernel refuses it with EINVAL. */
        unsigned candidate = NR_OPEN_MAXIMUM;

        while (candidate >= NR_OPEN_MINIMUM) {
                unsigned current = read_nr_open();

                if (current >= candidate) { /* Already larger */
                        log_debug("Skipping bump, value is already larger.");
                        break;
                }

                r = sysctl_writef("fs/nr_open", "%u", candidate);
                if (r == -EINVAL) {
                        log_debug("Couldn't write fs.nr_open as %u, halving it.", candidate);
                        candidate /= 2;
                        continue;
                }

                /* Any other outcome, good or bad, ends the search. */
                if (r < 0)
                        log_full_errno(ERRNO_IS_NEG_FS_WRITE_REFUSED(r) ? LOG_DEBUG : LOG_WARNING, r,
                                       "Failed to bump fs.nr_open, ignoring: %m");
                else
                        log_debug("Successfully bumped fs.nr_open to %u", candidate);

                break;
        }
#endif
}
1312
/* Raises our own RLIMIT_NOFILE (soft and hard) up to the value reported by read_nr_open(), without
 * ever going below the limits passed in saved_rlimit.
 *
 * Returns 0 if the limit was raised or did not need raising, and the logged error otherwise. */
static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) {
        /* The absolute ceiling the kernel enforces */
        unsigned nr = read_nr_open();

        /* Target limits: the ceiling, unless what we inherited is already higher. */
        struct rlimit wanted = {
                .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur),
                .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max),
        };

        /* Nothing to do if neither the hard nor the soft limit would go up. */
        if (wanted.rlim_max <= saved_rlimit->rlim_max &&
            wanted.rlim_cur <= saved_rlimit->rlim_cur) {
                log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
                return 0;
        }

        /* Go all the way to the maximum the kernel allows, for both hard and soft. */
        int r = setrlimit_closest(RLIMIT_NOFILE, &wanted);
        if (r < 0)
                return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");

        return 0;
}
1341
1342 static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) {
1343 struct rlimit new_rlimit;
1344 uint64_t mm;
1345 int r;
1346
1347 /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK
1348 * which should normally disable such checks. We need them to implement IPAddressAllow= and
1349 * IPAddressDeny=, hence let's bump the value high enough for our user. */
1350
1351 /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
1352 * must be unsigned, hence this is a given, but let's make this clear here. */
1353 assert_cc(RLIM_INFINITY > 0);
1354
1355 mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of
1356 * physical RAM. We allow an eighth to be locked by us, just to
1357 * pick a value. */
1358
1359 new_rlimit = (struct rlimit) {
1360 .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
1361 .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
1362 };
1363
1364 if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
1365 saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
1366 log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
1367 return 0;
1368 }
1369
1370 r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
1371 if (r < 0)
1372 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
1373
1374 return 0;
1375 }
1376
1377 static int enforce_syscall_archs(Set *archs) {
1378 #if HAVE_SECCOMP
1379 int r;
1380
1381 if (!is_seccomp_available())
1382 return 0;
1383
1384 r = seccomp_restrict_archs(arg_syscall_archs);
1385 if (r < 0)
1386 return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
1387 #endif
1388 return 0;
1389 }
1390
1391 static int os_release_status(void) {
1392 _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL,
1393 *ansi_color = NULL, *support_end = NULL;
1394 int r;
1395
1396 r = parse_os_release(NULL,
1397 "PRETTY_NAME", &pretty_name,
1398 "NAME", &name,
1399 "VERSION", &version,
1400 "ANSI_COLOR", &ansi_color,
1401 "SUPPORT_END", &support_end);
1402 if (r < 0)
1403 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1404 "Failed to read os-release file, ignoring: %m");
1405
1406 const char *label = os_release_pretty_name(pretty_name, name);
1407 const char *color = empty_to_null(ansi_color) ?: "1";
1408
1409 if (show_status_on(arg_show_status)) {
1410 if (in_initrd()) {
1411 if (log_get_show_color())
1412 status_printf(NULL, 0,
1413 ANSI_HIGHLIGHT "Booting initrd of " ANSI_NORMAL "\x1B[%sm%s" ANSI_NORMAL ANSI_HIGHLIGHT "." ANSI_NORMAL,
1414 color, label);
1415 else
1416 status_printf(NULL, 0,
1417 "Booting initrd of %s...", label);
1418 } else {
1419 if (log_get_show_color())
1420 status_printf(NULL, 0,
1421 "\n" ANSI_HIGHLIGHT "Welcome to " ANSI_NORMAL "\x1B[%sm%s" ANSI_NORMAL ANSI_HIGHLIGHT "!" ANSI_NORMAL "\n",
1422 color, label);
1423 else
1424 status_printf(NULL, 0,
1425 "\nWelcome to %s!\n",
1426 label);
1427 }
1428 }
1429
1430 if (support_end && os_release_support_ended(support_end, /* quiet = */ false, /* ret_eol = */ NULL) > 0)
1431 /* pretty_name may include the version already, so we'll print the version only if we
1432 * have it and we're not using pretty_name. */
1433 status_printf(ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, 0,
1434 "This OS version (%s%s%s) is past its end-of-support date (%s)",
1435 label,
1436 (pretty_name || !version) ? "" : " version ",
1437 (pretty_name || !version) ? "" : version,
1438 support_end);
1439
1440 return 0;
1441 }
1442
1443 static int setup_os_release(RuntimeScope scope) {
1444 char os_release_dst[STRLEN("/run/user//systemd/propagate/.os-release-stage/os-release") + DECIMAL_STR_MAX(uid_t)] =
1445 "/run/systemd/propagate/.os-release-stage/os-release";
1446 const char *os_release_src = "/etc/os-release";
1447 int r;
1448
1449 assert(IN_SET(scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER));
1450
1451 if (access("/etc/os-release", F_OK) < 0) {
1452 if (errno != ENOENT)
1453 log_debug_errno(errno, "Failed to check if /etc/os-release exists, ignoring: %m");
1454
1455 os_release_src = "/usr/lib/os-release";
1456 }
1457
1458 if (scope == RUNTIME_SCOPE_USER)
1459 xsprintf(os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid());
1460
1461 r = mkdir_parents_label(os_release_dst, 0755);
1462 if (r < 0)
1463 return log_debug_errno(r, "Failed to create parent directory of '%s', ignoring: %m", os_release_dst);
1464
1465 r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
1466 if (r < 0)
1467 return log_debug_errno(r, "Failed to copy '%s' to '%s', ignoring: %m",
1468 os_release_src, os_release_dst);
1469
1470 return 0;
1471 }
1472
1473 static int write_container_id(void) {
1474 const char *c;
1475 int r = 0; /* avoid false maybe-uninitialized warning */
1476
1477 c = getenv("container");
1478 if (isempty(c))
1479 return 0;
1480
1481 WITH_UMASK(0022)
1482 r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
1483 if (r < 0)
1484 return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
1485
1486 return 1;
1487 }
1488
1489 static int write_boot_or_shutdown_osc(const char *type) {
1490 int r;
1491
1492 assert(STRPTR_IN_SET(type, "boot", "shutdown"));
1493
1494 if (getenv_terminal_is_dumb())
1495 return 0;
1496
1497 _cleanup_close_ int fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
1498 if (fd < 0)
1499 return log_debug_errno(fd, "Failed to open /dev/console to print %s OSC, ignoring: %m", type);
1500
1501 _cleanup_free_ char *seq = NULL;
1502 if (streq(type, "boot"))
1503 r = osc_context_open_boot(&seq);
1504 else
1505 r = osc_context_close(SD_ID128_ALLF, &seq);
1506 if (r < 0)
1507 return log_debug_errno(r, "Failed to acquire %s OSC sequence, ignoring: %m", type);
1508
1509 r = loop_write(fd, seq, SIZE_MAX);
1510 if (r < 0)
1511 return log_debug_errno(r, "Failed to write %s OSC sequence, ignoring: %m", type);
1512
1513 if (DEBUG_LOGGING) {
1514 _cleanup_free_ char *h = cescape(seq);
1515 log_debug("OSC sequence for %s successfully written: %s", type, strna(h));
1516 }
1517
1518 return 0;
1519 }
1520
1521 static int bump_unix_max_dgram_qlen(void) {
1522 _cleanup_free_ char *qlen = NULL;
1523 unsigned long v;
1524 int r;
1525
1526 /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set
1527 * the value really really early during boot, so that it is actually applied to all our sockets,
1528 * including the $NOTIFY_SOCKET one. */
1529
1530 r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
1531 if (r < 0)
1532 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1533 "Failed to read AF_UNIX datagram queue length, ignoring: %m");
1534
1535 r = safe_atolu(qlen, &v);
1536 if (r < 0)
1537 return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
1538
1539 if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
1540 return 0;
1541
1542 r = sysctl_write("net/unix/max_dgram_qlen", STRINGIFY(DEFAULT_UNIX_MAX_DGRAM_QLEN));
1543 if (r < 0)
1544 return log_full_errno(ERRNO_IS_NEG_FS_WRITE_REFUSED(r) ? LOG_DEBUG : LOG_WARNING, r,
1545 "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
1546
1547 return 1;
1548 }
1549
1550 static int fixup_environment(void) {
1551 int r;
1552
1553 /* Only fix up the environment when we are started as PID 1 */
1554 if (getpid_cached() != 1)
1555 return 0;
1556
1557 /* We expect the environment to be set correctly if run inside a container. */
1558 if (detect_container() > 0)
1559 return 0;
1560
1561 /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the
1562 * backend device used by the console. We try to make a better guess here since some consoles might
1563 * not have support for color mode for example.
1564 *
1565 * However if TERM was configured through the kernel command line then leave it alone. */
1566 _cleanup_free_ char *term = NULL;
1567 r = proc_cmdline_get_key("TERM", 0, &term);
1568 if (r < 0)
1569 return r;
1570 if (r > 0) {
1571 /* If we pick up $TERM, then also pick up $COLORTERM, $NO_COLOR */
1572 FOREACH_STRING(v, "COLORTERM", "NO_COLOR") {
1573 _cleanup_free_ char *vv = NULL;
1574 r = proc_cmdline_get_key(v, 0, &vv);
1575 if (r < 0)
1576 return r;
1577 if (r > 0 && setenv(v, vv, /* overwrite= */ true) < 0)
1578 return -errno;
1579 }
1580 } else {
1581 /* If no $TERM is set then look for the per-tty variable instead */
1582 r = proc_cmdline_get_key("systemd.tty.term.console", 0, &term);
1583 if (r < 0)
1584 return r;
1585 }
1586
1587 if (!term)
1588 (void) query_term_for_tty("/dev/console", &term);
1589
1590 if (setenv("TERM", term ?: FALLBACK_TERM, /* overwrite= */ true) < 0)
1591 return -errno;
1592
1593 /* The kernels sets HOME=/ for init. Let's undo this. */
1594 if (path_equal(getenv("HOME"), "/"))
1595 assert_se(unsetenv("HOME") == 0);
1596
1597 return 0;
1598 }
1599
1600 static int become_shutdown(int objective, int retval) {
1601 static const char* const table[_MANAGER_OBJECTIVE_MAX] = {
1602 [MANAGER_EXIT] = "exit",
1603 [MANAGER_REBOOT] = "reboot",
1604 [MANAGER_POWEROFF] = "poweroff",
1605 [MANAGER_HALT] = "halt",
1606 [MANAGER_KEXEC] = "kexec",
1607 };
1608
1609 char timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
1610 exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)];
1611
1612 _cleanup_strv_free_ char **env_block = NULL;
1613 _cleanup_free_ char *max_log_levels = NULL;
1614 usec_t watchdog_timer = 0;
1615 int r;
1616
1617 assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
1618 assert(table[objective]);
1619
1620 xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec);
1621
1622 const char* command_line[11] = {
1623 SYSTEMD_SHUTDOWN_BINARY_PATH,
1624 table[objective],
1625 timeout,
1626 /* Note that the last position is a terminator and must contain NULL. */
1627 };
1628 size_t pos = 3;
1629
1630 assert(command_line[pos-1]);
1631 assert(!command_line[pos]);
1632
1633 (void) log_max_levels_to_string(log_get_max_level(), &max_log_levels);
1634
1635 if (max_log_levels) {
1636 command_line[pos++] = "--log-level";
1637 command_line[pos++] = max_log_levels;
1638 }
1639
1640 switch (log_get_target()) {
1641
1642 case LOG_TARGET_KMSG:
1643 case LOG_TARGET_JOURNAL_OR_KMSG:
1644 case LOG_TARGET_SYSLOG_OR_KMSG:
1645 command_line[pos++] = "--log-target=kmsg";
1646 break;
1647
1648 case LOG_TARGET_NULL:
1649 command_line[pos++] = "--log-target=null";
1650 break;
1651
1652 case LOG_TARGET_CONSOLE:
1653 default:
1654 command_line[pos++] = "--log-target=console";
1655 };
1656
1657 if (log_get_show_color())
1658 command_line[pos++] = "--log-color";
1659
1660 if (log_get_show_location())
1661 command_line[pos++] = "--log-location";
1662
1663 if (log_get_show_time())
1664 command_line[pos++] = "--log-time";
1665
1666 xsprintf(exit_code, "--exit-code=%d", retval);
1667 command_line[pos++] = exit_code;
1668
1669 assert(pos < ELEMENTSOF(command_line));
1670
1671 /* The watchdog: */
1672
1673 if (objective == MANAGER_REBOOT)
1674 watchdog_timer = arg_reboot_watchdog;
1675 else if (objective == MANAGER_KEXEC)
1676 watchdog_timer = arg_kexec_watchdog;
1677
1678 /* If we reboot or kexec let's set the shutdown watchdog and tell the
1679 * shutdown binary to repeatedly ping it.
1680 * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */
1681 (void) watchdog_setup_pretimeout(0);
1682 (void) watchdog_setup_pretimeout_governor(NULL);
1683 r = watchdog_setup(watchdog_timer);
1684 watchdog_close(/* disarm= */ r < 0);
1685
1686 /* The environment block: */
1687
1688 env_block = strv_copy(environ);
1689
1690 /* Tell the binary how often to ping, ignore failure */
1691 (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer);
1692
1693 /* Make sure that tools that look for $WATCHDOG_USEC (and might get started by the exitrd) don't get
1694 * confused by the variable, because the sd_watchdog_enabled() protocol uses the same variable for
1695 * the same purposes. */
1696 (void) strv_extendf(&env_block, "WATCHDOG_PID=" PID_FMT, getpid_cached());
1697
1698 if (arg_watchdog_device)
1699 (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device);
1700
1701 (void) write_boot_or_shutdown_osc("shutdown");
1702
1703 execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
1704 return -errno;
1705 }
1706
/* Either applies the kernel timezone (if the RTC runs in local time) or, outside of the initrd,
 * resets the kernel's time warp. Failures are logged or ignored, never propagated. */
static void initialize_clock_timewarp(void) {
        int r, min;

        /* This is called very early on, before we parse the kernel command line or otherwise figure
         * out why we are running, but only once. */

        if (clock_is_localtime(NULL) <= 0) {
                /* Do a dummy very first call to seal the kernel's time warp magic.
                 *
                 * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime
                 * with LOCAL, but the real system could be set up that way. In such case, we need to
                 * delay the time-warp or the sealing until we reach the real system.
                 *
                 * Do not set the kernel's timezone. The concept of local time cannot be supported
                 * reliably, the time will jump or be incorrect at every daylight saving time change.
                 * All kernel local time concepts will be treated as UTC that way. */
                if (!in_initrd())
                        (void) clock_reset_timewarp();

                return;
        }

        /* The very first call of settimeofday() also does a time warp in the kernel.
         *
         * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to
         * take care of maintaining the RTC and do all adjustments. This matches the behavior of
         * Windows, which leaves the RTC alone if the registry tells that the RTC runs in UTC. */
        r = clock_set_timezone(&min);
        if (r < 0)
                log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
        else
                log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
}
1742
1743 static void apply_clock_update(void) {
1744 /* This is called later than clock_apply_epoch(), i.e. after we have parsed
1745 * configuration files/kernel command line and such. */
1746
1747 if (arg_clock_usec == 0)
1748 return;
1749
1750 if (getpid_cached() != 1)
1751 return;
1752
1753 if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) < 0)
1754 log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
1755 else
1756 log_info("Set system clock to %s, as specified on the kernel command line.",
1757 FORMAT_TIMESTAMP(arg_clock_usec));
1758 }
1759
1760 static void cmdline_take_random_seed(void) {
1761 size_t suggested;
1762 int r;
1763
1764 if (arg_random_seed_size == 0)
1765 return;
1766
1767 if (getpid_cached() != 1)
1768 return;
1769
1770 assert(arg_random_seed);
1771 suggested = random_pool_size();
1772
1773 if (arg_random_seed_size < suggested)
1774 log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
1775 arg_random_seed_size, suggested);
1776
1777 r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
1778 if (r < 0) {
1779 log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
1780 return;
1781 }
1782
1783 log_notice("Successfully credited entropy passed on kernel command line.\n"
1784 "Note that the seed provided this way is accessible to unprivileged programs. "
1785 "This functionality should not be used outside of testing environments.");
1786 }
1787
1788 static void initialize_coredump(bool skip_setup) {
1789 if (getpid_cached() != 1)
1790 return;
1791
1792 /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour
1793 * the limit) will process core dumps for system services by default. */
1794 if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
1795 log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");
1796
1797 /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored
1798 * until the systemd-coredump tool is enabled via sysctl. However it can be changed via the kernel
1799 * command line later so core dumps can still be generated during early startup and in initrd. */
1800 if (!skip_setup)
1801 disable_coredumps();
1802 }
1803
1804 static void initialize_core_pattern(bool skip_setup) {
1805 int r;
1806
1807 if (skip_setup || !arg_early_core_pattern)
1808 return;
1809
1810 if (getpid_cached() != 1)
1811 return;
1812
1813 r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
1814 if (r < 0)
1815 log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m",
1816 arg_early_core_pattern);
1817 }
1818
1819 static void apply_protect_system(bool skip_setup) {
1820 int r;
1821
1822 if (skip_setup || getpid_cached() != 1 || arg_protect_system == 0)
1823 return;
1824
1825 if (arg_protect_system < 0 && !in_initrd()) {
1826 log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping.");
1827 return;
1828 }
1829
1830 r = make_mount_point("/usr");
1831 if (r < 0) {
1832 log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m");
1833 return;
1834 }
1835
1836 if (mount_nofollow_verbose(
1837 LOG_WARNING,
1838 /* what= */ NULL,
1839 "/usr",
1840 /* fstype= */ NULL,
1841 MS_BIND|MS_REMOUNT|MS_RDONLY,
1842 /* options= */ NULL) < 0)
1843 return;
1844
1845 log_info("Successfully made /usr/ read-only.");
1846 }
1847
1848 static void update_cpu_affinity(bool skip_setup) {
1849 _cleanup_free_ char *mask = NULL;
1850
1851 if (skip_setup || !arg_cpu_affinity.set)
1852 return;
1853
1854 assert(arg_cpu_affinity.allocated > 0);
1855
1856 mask = cpu_set_to_range_string(&arg_cpu_affinity);
1857 log_debug("Setting CPU affinity to {%s}.", strnull(mask));
1858
1859 if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
1860 log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m");
1861 }
1862
1863 static void update_numa_policy(bool skip_setup) {
1864 int r;
1865 _cleanup_free_ char *nodes = NULL;
1866 const char * policy = NULL;
1867
1868 if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
1869 return;
1870
1871 if (DEBUG_LOGGING) {
1872 policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
1873 nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
1874 log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(policy), strnull(nodes));
1875 }
1876
1877 r = apply_numa_policy(&arg_numa_policy);
1878 if (r == -EOPNOTSUPP)
1879 log_debug_errno(r, "NUMA support not available, ignoring.");
1880 else if (r < 0)
1881 log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
1882 }
1883
/* Appends the entries src[1] … src[argc-1] to dst[], starting at index *dst_index and advancing it, while
 * dropping the options that must not be handed to a re-executed instance:
 *
 *   --switched-root, --system, --user
 *   --deserialize=… / --deserialize <arg>
 *   --unit=… / --unit <arg>
 *
 * The copied pointers alias the strings of src[]; nothing is duplicated. The caller has to make sure dst[]
 * is large enough. */
static void filter_args(
                const char* dst[],
                size_t *dst_index,
                char **src,
                int argc) {

        assert(dst);
        assert(dst_index);

        for (int i = 1; i < argc; i++) {
                const char *arg = src[i];

                if (STR_IN_SET(arg,
                               "--switched-root",
                               "--system",
                               "--user"))
                        continue;

                /* "--option=value" forms: the value is part of the same argument. Target unit designators
                 * are dropped because we already acted upon them and queued the appropriate jobs, and we
                 * don't want to redo that after reexecution. */
                if (startswith(arg, "--deserialize=") ||
                    startswith(arg, "--unit="))
                        continue;

                /* "--option value" forms: the value is the next argument, skip it as well. */
                if (streq(arg, "--deserialize") ||
                    streq(arg, "--unit")) {
                        i++;
                        continue;
                }

                /* Anything else is a regular option, hand it to the new instance. */
                dst[(*dst_index)++] = arg;
        }
}
1921
1922 static void finish_remaining_processes(ManagerObjective objective) {
1923 assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
1924
1925 /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
1926 * SIGCHLD for them after deserializing. */
1927 if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
1928 broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
1929
1930 /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
1931 * of units that were configured with SurviveFinalKillSignal=yes. */
1932 if (objective == MANAGER_SOFT_REBOOT)
1933 broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
1934 }
1935
1936 static void reduce_vt(ManagerObjective objective) {
1937 int r;
1938
1939 if (objective != MANAGER_SOFT_REBOOT)
1940 return;
1941
1942 /* Switches back to VT 1, and releases all other VTs, in an attempt to return to a situation similar
1943 * to how it was during the original kernel initialization. This is important because if some random
1944 * TTY is in foreground, /dev/console will end up pointing to it, where the future init system will
1945 * then write its status output to, but where it probably shouldn't be writing to. */
1946
1947 r = chvt(1);
1948 if (r < 0)
1949 log_debug_errno(r, "Failed to switch to VT TTY 1, ignoring: %m");
1950
1951 _cleanup_close_ int tty0_fd = open_terminal("/dev/tty0", O_RDWR|O_NOCTTY|O_CLOEXEC|O_NONBLOCK);
1952 if (tty0_fd < 0)
1953 return (void) log_debug_errno(tty0_fd, "Failed to open '/dev/tty0', ignoring: %m");
1954
1955 for (int ttynr = 2; ttynr <= VTNR_MAX; ttynr++)
1956 if (ioctl(tty0_fd, VT_DISALLOCATE, ttynr) < 0)
1957 log_debug_errno(errno, "Failed to disallocate VT TTY %i, ignoring: %m", ttynr);
1958 else
1959 log_debug("Successfully disallocated VT TTY %i.", ttynr);
1960 }
1961
1962 static int do_reexecute(
1963 ManagerObjective objective,
1964 int argc,
1965 char* argv[],
1966 const struct rlimit *saved_rlimit_nofile,
1967 const struct rlimit *saved_rlimit_memlock,
1968 FDSet *fds,
1969 const char *switch_root_dir,
1970 const char *switch_root_init,
1971 uint64_t saved_capability_ambient_set,
1972 const char **ret_error_message) {
1973
1974 size_t i, args_size;
1975 const char **args;
1976 int r;
1977
1978 assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT));
1979 assert(argc >= 0);
1980 assert(saved_rlimit_nofile);
1981 assert(saved_rlimit_memlock);
1982 assert(ret_error_message);
1983
1984 /* Close and disarm the watchdog, so that the new instance can reinitialize it, but the machine
1985 * doesn't get rebooted while we do that. */
1986 watchdog_close(/* disarm= */ true);
1987
1988 if (!switch_root_dir && objective == MANAGER_SOFT_REBOOT) {
1989 /* If no switch root dir is specified, then check if /run/nextroot/ qualifies and use that */
1990 r = path_is_os_tree("/run/nextroot");
1991 if (r < 0 && r != -ENOENT)
1992 log_debug_errno(r, "Failed to determine if /run/nextroot/ is a valid OS tree, ignoring: %m");
1993 else if (r > 0)
1994 switch_root_dir = "/run/nextroot";
1995 }
1996
1997 if (switch_root_dir) {
1998 /* If we're supposed to switch root, preemptively check the existence of a usable init.
1999 * Otherwise the system might end up in a completely undebuggable state afterwards. */
2000 if (switch_root_init) {
2001 r = chase_and_access(switch_root_init, switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
2002 if (r < 0)
2003 log_warning_errno(r, "Failed to chase configured init %s/%s: %m",
2004 switch_root_dir, switch_root_init);
2005 } else {
2006 r = chase_and_access(SYSTEMD_BINARY_PATH, switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
2007 if (r < 0)
2008 log_debug_errno(r, "Failed to chase our own binary %s/%s: %m",
2009 switch_root_dir, SYSTEMD_BINARY_PATH);
2010 }
2011
2012 if (r < 0) {
2013 r = chase_and_access("/sbin/init", switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
2014 if (r < 0) {
2015 *ret_error_message = "Switch root target contains no usable init";
2016 return log_error_errno(r, "Failed to chase %s/sbin/init", switch_root_dir);
2017 }
2018 }
2019 }
2020
2021 /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
2022 * the kernel default to its child processes */
2023 if (saved_rlimit_nofile->rlim_cur != 0)
2024 (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
2025 if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
2026 (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
2027
2028 finish_remaining_processes(objective);
2029 reduce_vt(objective);
2030
2031 if (switch_root_dir) {
2032 r = switch_root(/* new_root= */ switch_root_dir,
2033 /* old_root_after= */ NULL,
2034 /* flags= */ (objective == MANAGER_SWITCH_ROOT ? SWITCH_ROOT_DESTROY_OLD_ROOT : 0) |
2035 (objective == MANAGER_SOFT_REBOOT ? 0 : SWITCH_ROOT_RECURSIVE_RUN));
2036 if (r < 0)
2037 log_error_errno(r, "Failed to switch root, trying to continue: %m");
2038 }
2039
2040 r = capability_ambient_set_apply(saved_capability_ambient_set, /* also_inherit= */ false);
2041 if (r < 0)
2042 log_warning_errno(r, "Failed to apply the starting ambient set, ignoring: %m");
2043
2044 args_size = argc + 5;
2045 args = newa(const char*, args_size);
2046
2047 if (!switch_root_init) {
2048 char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)];
2049
2050 /* First try to spawn ourselves with the right path, and with full serialization. We do this
2051 * only if the user didn't specify an explicit init to spawn. */
2052
2053 assert(arg_serialization);
2054 assert(fds);
2055
2056 xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization));
2057
2058 i = 1; /* Leave args[0] empty for now. */
2059
2060 /* Put our stuff first to make sure it always gets parsed in case
2061 * we get weird stuff from the kernel cmdline (like --) */
2062 if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
2063 args[i++] = "--switched-root";
2064 args[i++] = runtime_scope_cmdline_option_to_string(arg_runtime_scope);
2065 args[i++] = sfd;
2066
2067 filter_args(args, &i, argv, argc);
2068
2069 args[i++] = NULL;
2070
2071 assert(i <= args_size);
2072
2073 /*
2074 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do
2075 * this is on its own on exec(), but it will do it on exit(). Hence, to ensure we get a
2076 * summary here, fork() off a child, let it exit() cleanly, so that it prints the summary,
2077 * and wait() for it in the parent, before proceeding into the exec().
2078 */
2079 valgrind_summary_hack();
2080
2081 args[0] = SYSTEMD_BINARY_PATH;
2082 (void) execv(args[0], (char* const*) args);
2083
2084 if (objective == MANAGER_REEXECUTE) {
2085 *ret_error_message = "Failed to execute our own binary";
2086 return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]);
2087 }
2088
2089 log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]);
2090 }
2091
2092 /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and
2093 * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[],
2094 * but let's hope that doesn't matter.) */
2095
2096 arg_serialization = safe_fclose(arg_serialization);
2097 fds = fdset_free(fds);
2098
2099 /* Drop /run/systemd directory. Some of its content can be used as a flag indicating that systemd is
2100 * the init system but we might be replacing it with something different. If systemd is used again it
2101 * will recreate the directory and its content anyway. */
2102 r = rm_rf("/run/systemd.pre-switch-root", REMOVE_ROOT|REMOVE_MISSING_OK);
2103 if (r < 0)
2104 log_warning_errno(r, "Failed to prepare /run/systemd.pre-switch-root/, ignoring: %m");
2105
2106 r = RET_NERRNO(rename("/run/systemd", "/run/systemd.pre-switch-root"));
2107 if (r < 0)
2108 log_warning_errno(r, "Failed to move /run/systemd/ to /run/systemd.pre-switch-root/, ignoring: %m");
2109
2110 /* Reopen the console */
2111 (void) make_console_stdio();
2112
2113 i = 1; /* Leave args[0] empty for now. */
2114 for (int j = 1; j <= argc; j++)
2115 args[i++] = argv[j];
2116 assert(i <= args_size);
2117
2118 /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */
2119 (void) reset_all_signal_handlers();
2120 (void) reset_signal_mask();
2121 (void) rlimit_nofile_safe();
2122
2123 if (switch_root_init) {
2124 args[0] = switch_root_init;
2125 (void) execve(args[0], (char* const*) args, saved_env);
2126 log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]);
2127 }
2128
2129 args[0] = "/sbin/init";
2130 (void) execv(args[0], (char* const*) args);
2131 r = -errno;
2132 *ret_error_message = "Failed to execute /sbin/init";
2133
2134 if (r == -ENOENT) {
2135 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
2136 ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
2137 "%s", *ret_error_message);
2138
2139 log_warning_errno(r, "No /sbin/init, trying fallback shell");
2140
2141 args[0] = "/bin/sh";
2142 args[1] = NULL;
2143 (void) execve(args[0], (char* const*) args, saved_env);
2144 r = -errno;
2145 *ret_error_message = "Failed to execute fallback shell";
2146 }
2147
2148 return log_error_errno(r, "%s, giving up: %m", *ret_error_message);
2149 }
2150
2151 static int invoke_main_loop(
2152 Manager *m,
2153 const struct rlimit *saved_rlimit_nofile,
2154 const struct rlimit *saved_rlimit_memlock,
2155 int *ret_retval, /* Return parameters relevant for shutting down */
2156 FDSet **ret_fds, /* Return parameters for reexecuting */
2157 char **ret_switch_root_dir, /* … */
2158 char **ret_switch_root_init, /* … */
2159 const char **ret_error_message) {
2160
2161 int r;
2162
2163 assert(m);
2164 assert(saved_rlimit_nofile);
2165 assert(saved_rlimit_memlock);
2166 assert(ret_retval);
2167 assert(ret_fds);
2168 assert(ret_switch_root_dir);
2169 assert(ret_switch_root_init);
2170 assert(ret_error_message);
2171
2172 for (;;) {
2173 int objective = manager_loop(m);
2174 if (objective < 0) {
2175 *ret_error_message = "Failed to run main loop";
2176 return log_struct_errno(LOG_EMERG, objective,
2177 LOG_MESSAGE("Failed to run main loop: %m"),
2178 LOG_MESSAGE_ID(SD_MESSAGE_CORE_MAINLOOP_FAILED_STR));
2179 }
2180
2181 /* Ensure shutdown timestamp is taken even when bypassing the job engine */
2182 if (IN_SET(objective,
2183 MANAGER_SOFT_REBOOT,
2184 MANAGER_REBOOT,
2185 MANAGER_KEXEC,
2186 MANAGER_HALT,
2187 MANAGER_POWEROFF) &&
2188 !dual_timestamp_is_set(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START))
2189 dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START);
2190
2191 switch (objective) {
2192
2193 case MANAGER_RELOAD: {
2194 LogTarget saved_log_target;
2195 int saved_log_level;
2196
2197 manager_send_reloading(m);
2198
2199 log_info("Reloading...");
2200
2201 /* First, save any overridden log level/target, then parse the configuration file,
2202 * which might change the log level to new settings. */
2203
2204 saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
2205 saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
2206
2207 (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
2208
2209 set_manager_defaults(m);
2210 set_manager_settings(m);
2211
2212 update_cpu_affinity(false);
2213 update_numa_policy(false);
2214
2215 if (saved_log_level >= 0)
2216 manager_override_log_level(m, saved_log_level);
2217 if (saved_log_target >= 0)
2218 manager_override_log_target(m, saved_log_target);
2219
2220 if (manager_reload(m) < 0)
2221 /* Reloading failed before the point of no return.
2222 * Let's continue running as if nothing happened. */
2223 m->objective = MANAGER_OK;
2224 else
2225 log_info("Reloading finished in " USEC_FMT " ms.",
2226 usec_sub_unsigned(now(CLOCK_MONOTONIC), m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].monotonic) / USEC_PER_MSEC);
2227
2228 continue;
2229 }
2230
2231 case MANAGER_REEXECUTE:
2232
2233 manager_send_reloading(m); /* From the perspective of the manager calling us this is
2234 * pretty much the same as a reload */
2235
2236 r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
2237 if (r < 0) {
2238 *ret_error_message = "Failed to prepare for reexecution";
2239 return r;
2240 }
2241
2242 log_notice("Reexecuting.");
2243
2244 *ret_retval = EXIT_FAILURE;
2245 *ret_switch_root_dir = *ret_switch_root_init = NULL;
2246
2247 return objective;
2248
2249 case MANAGER_SWITCH_ROOT:
2250
2251 manager_send_reloading(m); /* From the perspective of the manager calling us this is
2252 * pretty much the same as a reload */
2253
2254 manager_set_switching_root(m, true);
2255
2256 if (!m->switch_root_init) {
2257 r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
2258 if (r < 0) {
2259 *ret_error_message = "Failed to prepare for reexecution";
2260 return r;
2261 }
2262 } else
2263 *ret_fds = NULL;
2264
2265 log_notice("Switching root.");
2266
2267 *ret_retval = EXIT_FAILURE;
2268
2269 /* Steal the switch root parameters */
2270 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
2271 *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
2272
2273 return objective;
2274
2275 case MANAGER_SOFT_REBOOT:
2276 manager_send_reloading(m);
2277 manager_set_switching_root(m, true);
2278
2279 r = prepare_reexecute(m, &arg_serialization, ret_fds, /* switching_root= */ true);
2280 if (r < 0) {
2281 *ret_error_message = "Failed to prepare for reexecution";
2282 return r;
2283 }
2284
2285 log_notice("Soft-rebooting.");
2286
2287 *ret_retval = EXIT_FAILURE;
2288 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
2289 *ret_switch_root_init = NULL;
2290
2291 return objective;
2292
2293 case MANAGER_EXIT:
2294 if (MANAGER_IS_USER(m)) {
2295 log_debug("Exit.");
2296
2297 *ret_retval = m->return_value;
2298 *ret_fds = NULL;
2299 *ret_switch_root_dir = *ret_switch_root_init = NULL;
2300
2301 return objective;
2302 }
2303
2304 _fallthrough_;
2305 case MANAGER_REBOOT:
2306 case MANAGER_POWEROFF:
2307 case MANAGER_HALT:
2308 case MANAGER_KEXEC: {
2309 log_notice("Shutting down.");
2310
2311 *ret_retval = m->return_value;
2312 *ret_fds = NULL;
2313 *ret_switch_root_dir = *ret_switch_root_init = NULL;
2314
2315 return objective;
2316 }
2317
2318 default:
2319 assert_not_reached();
2320 }
2321 }
2322 }
2323
2324 static void log_execution_mode(bool *ret_first_boot) {
2325 bool first_boot = false;
2326 int r;
2327
2328 assert(ret_first_boot);
2329
2330 switch (arg_runtime_scope) {
2331
2332 case RUNTIME_SCOPE_SYSTEM: {
2333 struct utsname uts;
2334 int v;
2335
2336 log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)",
2337 arg_action == ACTION_TEST ? "test " : "",
2338 systemd_features);
2339
2340 v = detect_virtualization();
2341 if (v > 0)
2342 log_info("Detected virtualization %s.", virtualization_to_string(v));
2343
2344 v = detect_confidential_virtualization();
2345 if (v > 0)
2346 log_info("Detected confidential virtualization %s.", confidential_virtualization_to_string(v));
2347
2348 log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
2349
2350 if (in_initrd())
2351 log_info("Running in initrd.");
2352 else {
2353 _cleanup_free_ char *id_text = NULL;
2354
2355 /* Let's check whether we are in first boot. First, check if an override was
2356 * specified on the kernel command line. If yes, we honour that. */
2357
2358 r = proc_cmdline_get_bool("systemd.condition_first_boot", /* flags = */ 0, &first_boot);
2359 if (r < 0)
2360 log_debug_errno(r, "Failed to parse systemd.condition_first_boot= kernel command line argument, ignoring: %m");
2361
2362 if (r > 0)
2363 log_full(first_boot ? LOG_INFO : LOG_DEBUG,
2364 "Kernel command line argument says we are %s first boot.",
2365 first_boot ? "in" : "not in");
2366 else {
2367 /* Second, perform autodetection. We use /etc/machine-id as flag file for
2368 * this: If it is missing or contains the value "uninitialized", this is the
2369 * first boot. In other cases, it is not. This allows container managers and
2370 * installers to provision a couple of files in /etc but still permit the
2371 * first-boot initialization to occur. If the container manager wants to
2372 * provision the machine ID it should pass $container_uuid to PID 1. */
2373
2374 r = read_one_line_file("/etc/machine-id", &id_text);
2375 if (r < 0 || streq(id_text, "uninitialized")) {
2376 if (r < 0 && r != -ENOENT)
2377 log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m");
2378
2379 first_boot = true;
2380 log_info("Detected first boot.");
2381 } else
2382 log_debug("Detected initialized system, this is not the first boot.");
2383 }
2384 }
2385
2386 assert_se(uname(&uts) >= 0);
2387
2388 if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
2389 log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. "
2390 "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION);
2391 else
2392 log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION);
2393
2394 break;
2395 }
2396
2397 case RUNTIME_SCOPE_USER:
2398 if (DEBUG_LOGGING) {
2399 _cleanup_free_ char *t = NULL;
2400
2401 t = uid_to_name(getuid());
2402 log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
2403 arg_action == ACTION_TEST ? " test" : "",
2404 getuid(), strna(t), systemd_features);
2405 }
2406
2407 break;
2408
2409 default:
2410 assert_not_reached();
2411 }
2412
2413 *ret_first_boot = first_boot;
2414 }
2415
2416 static int initialize_runtime(
2417 bool skip_setup,
2418 bool first_boot,
2419 struct rlimit *saved_rlimit_nofile,
2420 struct rlimit *saved_rlimit_memlock,
2421 uint64_t *saved_ambient_set,
2422 const char **ret_error_message) {
2423
2424 int r;
2425
2426 assert(saved_ambient_set);
2427 assert(ret_error_message);
2428
2429 /* Sets up various runtime parameters. Many of these initializations are conditionalized:
2430 *
2431 * - Some only apply to --system instances
2432 * - Some only apply to --user instances
2433 * - Some only apply when we first start up, but not when we reexecute
2434 */
2435
2436 if (arg_action != ACTION_RUN)
2437 return 0;
2438
2439 update_cpu_affinity(skip_setup);
2440 update_numa_policy(skip_setup);
2441
2442 switch (arg_runtime_scope) {
2443
2444 case RUNTIME_SCOPE_SYSTEM:
2445 /* Make sure we leave a core dump without panicking the kernel. */
2446 install_crash_handler();
2447
2448 if (!skip_setup) {
2449 /* Check that /usr/ is either on the same file system as / or mounted already. */
2450 if (dir_is_empty("/usr", /* ignore_hidden_or_backup = */ true) > 0) {
2451 *ret_error_message = "Refusing to run in unsupported environment where /usr/ is not populated";
2452 return -ENOEXEC;
2453 }
2454
2455 /* Pull credentials from various sources into a common credential directory (we do
2456 * this here, before setting up the machine ID, so that we can use credential info
2457 * for setting up the machine ID) */
2458 (void) import_credentials();
2459
2460 (void) os_release_status();
2461 (void) machine_id_setup(/* root = */ NULL, arg_machine_id,
2462 (first_boot ? MACHINE_ID_SETUP_FORCE_TRANSIENT : 0) |
2463 (arg_machine_id_from_firmware ? MACHINE_ID_SETUP_FORCE_FIRMWARE : 0),
2464 /* ret = */ NULL);
2465 (void) hostname_setup(/* really = */ true);
2466 (void) loopback_setup();
2467
2468 bump_unix_max_dgram_qlen();
2469 bump_file_max_and_nr_open();
2470
2471 write_container_id();
2472
2473 (void) write_boot_or_shutdown_osc("boot");
2474
2475 /* Copy os-release to the propagate directory, so that we update it for services running
2476 * under RootDirectory=/RootImage= when we do a soft reboot. */
2477 r = setup_os_release(RUNTIME_SCOPE_SYSTEM);
2478 if (r < 0)
2479 log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
2480 }
2481
2482 r = watchdog_set_device(arg_watchdog_device);
2483 if (r < 0)
2484 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
2485
2486 if (!cap_test_all(arg_capability_bounding_set)) {
2487 r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
2488 if (r < 0) {
2489 *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
2490 return log_struct_errno(LOG_EMERG, r,
2491 LOG_MESSAGE("Failed to drop capability bounding set of usermode helpers: %m"),
2492 LOG_MESSAGE_ID(SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR));
2493 }
2494
2495 r = capability_bounding_set_drop(arg_capability_bounding_set, true);
2496 if (r < 0) {
2497 *ret_error_message = "Failed to drop capability bounding set";
2498 return log_struct_errno(LOG_EMERG, r,
2499 LOG_MESSAGE("Failed to drop capability bounding set: %m"),
2500 LOG_MESSAGE_ID(SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR));
2501 }
2502 }
2503
2504 if (arg_no_new_privs) {
2505 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2506 *ret_error_message = "Failed to disable new privileges";
2507 return log_struct_errno(LOG_EMERG, errno,
2508 LOG_MESSAGE("Failed to disable new privileges: %m"),
2509 LOG_MESSAGE_ID(SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR));
2510 }
2511 }
2512
2513 break;
2514
2515 case RUNTIME_SCOPE_USER: {
2516 _cleanup_free_ char *p = NULL;
2517
2518 /* Create the runtime directory and place the inaccessible device nodes there, if we run in
2519 * user mode. In system mode mount_setup() already did that. */
2520
2521 r = xdg_user_runtime_dir("/systemd", &p);
2522 if (r < 0) {
2523 *ret_error_message = "$XDG_RUNTIME_DIR is not set";
2524 return log_struct_errno(LOG_EMERG, r,
2525 LOG_MESSAGE("Failed to determine $XDG_RUNTIME_DIR path: %m"),
2526 LOG_MESSAGE_ID(SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR));
2527 }
2528
2529 if (!skip_setup) {
2530 (void) mkdir_p_label(p, 0755);
2531 (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
2532
2533 r = setup_os_release(RUNTIME_SCOPE_USER);
2534 if (r < 0)
2535 log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
2536 }
2537
2538 break;
2539 }
2540
2541 default:
2542 assert_not_reached();
2543 }
2544
2545 /* The two operations on the ambient set are meant for a user serssion manager. They do not affect
2546 * system manager operation, because by default it starts with an empty ambient set.
2547 *
2548 * Preserve the ambient set for later use with sd-executor processes. */
2549 r = capability_get_ambient(saved_ambient_set);
2550 if (r < 0)
2551 log_warning_errno(r, "Failed to save ambient capabilities, ignoring: %m");
2552
2553 /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does
2554 * not affect the permitted and effective sets which are important for the manager itself to
2555 * operate. */
2556 r = capability_ambient_set_apply(0, /* also_inherit= */ false);
2557 if (r < 0)
2558 log_warning_errno(r, "Failed to reset ambient capability set, ignoring: %m");
2559
2560 if (arg_timer_slack_nsec != NSEC_INFINITY)
2561 if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
2562 log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
2563
2564 if (arg_syscall_archs) {
2565 r = enforce_syscall_archs(arg_syscall_archs);
2566 if (r < 0) {
2567 *ret_error_message = "Failed to set syscall architectures";
2568 return r;
2569 }
2570 }
2571
2572 r = make_reaper_process(true);
2573 if (r < 0)
2574 log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");
2575
2576 /* Bump up RLIMIT_NOFILE for systemd itself */
2577 (void) bump_rlimit_nofile(saved_rlimit_nofile);
2578 (void) bump_rlimit_memlock(saved_rlimit_memlock);
2579
2580 return 0;
2581 }
2582
2583 static int do_queue_default_job(
2584 Manager *m,
2585 const char **ret_error_message) {
2586
2587 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2588 const char *unit;
2589 Job *job;
2590 Unit *target;
2591 int r;
2592
2593 if (arg_default_unit)
2594 unit = arg_default_unit;
2595 else if (in_initrd())
2596 unit = SPECIAL_INITRD_TARGET;
2597 else
2598 unit = SPECIAL_DEFAULT_TARGET;
2599
2600 log_debug("Activating default unit: %s", unit);
2601
2602 r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
2603 if (r < 0 && in_initrd() && !arg_default_unit) {
2604 /* Fall back to default.target, which we used to always use by default. Only do this if no
2605 * explicit configuration was given. */
2606
2607 log_info("Falling back to %s.", SPECIAL_DEFAULT_TARGET);
2608
2609 r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
2610 }
2611 if (r < 0) {
2612 log_info("Falling back to %s.", SPECIAL_RESCUE_TARGET);
2613
2614 r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
2615 if (r < 0) {
2616 *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
2617 : "Failed to load " SPECIAL_RESCUE_TARGET;
2618 return r;
2619 }
2620 }
2621
2622 assert(target->load_state == UNIT_LOADED);
2623
2624 r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, &error, &job);
2625 if (r == -EPERM) {
2626 log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
2627
2628 sd_bus_error_free(&error);
2629
2630 r = manager_add_job(m, JOB_START, target, JOB_REPLACE, &error, &job);
2631 if (r < 0) {
2632 *ret_error_message = "Failed to start default target";
2633 return log_struct_errno(LOG_EMERG, r,
2634 LOG_MESSAGE("Failed to start default target: %s", bus_error_message(&error, r)),
2635 LOG_MESSAGE_ID(SD_MESSAGE_CORE_START_TARGET_FAILED_STR));
2636 }
2637
2638 } else if (r < 0) {
2639 *ret_error_message = "Failed to isolate default target";
2640 return log_struct_errno(LOG_EMERG, r,
2641 LOG_MESSAGE("Failed to isolate default target: %s", bus_error_message(&error, r)),
2642 LOG_MESSAGE_ID(SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR));
2643 }
2644
2645 log_info("Queued %s job for default target %s.",
2646 job_type_to_string(job->type), unit_status_string(job->unit, NULL));
2647
2648 return 0;
2649 }
2650
/* Reads the current RLIMIT_NOFILE and RLIMIT_MEMLOCK limits of the process into the two supplied structures.
 * A failure to read a limit is logged as a warning and otherwise ignored. */
static void save_rlimits(struct rlimit *saved_rlimit_nofile,
                         struct rlimit *saved_rlimit_memlock) {

        int r;

        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);

        r = getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (r < 0)
                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");

        r = getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
        if (r < 0)
                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
}
2663
2664 static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
2665 struct rlimit *rl;
2666
2667 if (arg_defaults.rlimit[RLIMIT_NOFILE])
2668 return;
2669
2670 /* Make sure forked processes get limits based on the original kernel setting */
2671
2672 rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
2673 if (!rl) {
2674 log_oom();
2675 return;
2676 }
2677
2678 /* Bump the hard limit for system services to a substantially higher value. The default
2679 * hard limit current kernels set is pretty low (4K), mostly for historical
2680 * reasons. According to kernel developers, the fd handling in recent kernels has been
2681 * optimized substantially enough, so that we can bump the limit now, without paying too
2682 * high a price in memory or performance. Note however that we only bump the hard limit,
2683 * not the soft limit. That's because select() works the way it works, and chokes on fds
2684 * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
2685 * unexpecting programs that they get fds higher than what they can process using
2686 * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
2687 * this pitfall: programs that are written by folks aware of the select() problem in mind
2688 * (and thus use poll()/epoll instead of select(), the way everybody should) can
2689 * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
2690 * we pass. */
2691 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
2692 rl->rlim_max = MIN((rlim_t) read_nr_open(), MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
2693
2694 /* If for some reason we were invoked with a soft limit above 1024 (which should never
2695 * happen!, but who knows what we get passed in from pam_limit when invoked as --user
2696 * instance), then lower what we pass on to not confuse our children */
2697 rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
2698
2699 arg_defaults.rlimit[RLIMIT_NOFILE] = rl;
2700 }
2701
2702 static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
2703 struct rlimit *rl;
2704
2705 /* Pass the original value down to invoked processes */
2706
2707 if (arg_defaults.rlimit[RLIMIT_MEMLOCK])
2708 return;
2709
2710 rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
2711 if (!rl) {
2712 log_oom();
2713 return;
2714 }
2715
2716 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2717 /* Raise the default limit to 8M also on old kernels and in containers (8M is the kernel
2718 * default for this since kernel 5.16) */
2719 rl->rlim_max = MAX(rl->rlim_max, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
2720 rl->rlim_cur = MAX(rl->rlim_cur, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
2721 }
2722
2723 arg_defaults.rlimit[RLIMIT_MEMLOCK] = rl;
2724 }
2725
2726 static void setenv_manager_environment(void) {
2727 int r;
2728
2729 STRV_FOREACH(p, arg_manager_environment) {
2730 log_debug("Setting '%s' in our own environment.", *p);
2731
2732 r = putenv_dup(*p, true);
2733 if (r < 0)
2734 log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p);
2735 }
2736 }
2737
2738 static void reset_arguments(void) {
2739 /* Frees/resets arg_* variables, with a few exceptions commented below. */
2740
2741 arg_default_unit = mfree(arg_default_unit);
2742
2743 /* arg_runtime_scope — ignore */
2744
2745 arg_dump_core = true;
2746 arg_crash_chvt = -1;
2747 arg_crash_shell = false;
2748 arg_crash_action = CRASH_FREEZE;
2749 arg_confirm_spawn = mfree(arg_confirm_spawn);
2750 arg_show_status = _SHOW_STATUS_INVALID;
2751 arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
2752 arg_switched_root = false;
2753 arg_pager_flags = 0;
2754 arg_service_watchdogs = true;
2755
2756 unit_defaults_done(&arg_defaults);
2757 unit_defaults_init(&arg_defaults, arg_runtime_scope);
2758
2759 arg_runtime_watchdog = 0;
2760 arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
2761 arg_kexec_watchdog = 0;
2762 arg_pretimeout_watchdog = 0;
2763 arg_early_core_pattern = mfree(arg_early_core_pattern);
2764 arg_watchdog_device = mfree(arg_watchdog_device);
2765 arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
2766
2767 arg_default_environment = strv_free(arg_default_environment);
2768 arg_manager_environment = strv_free(arg_manager_environment);
2769
2770 arg_capability_bounding_set = CAP_MASK_ALL;
2771 arg_no_new_privs = false;
2772 arg_protect_system = -1;
2773 arg_timer_slack_nsec = NSEC_INFINITY;
2774
2775 arg_syscall_archs = set_free(arg_syscall_archs);
2776
2777 /* arg_serialization — ignore */
2778
2779 arg_machine_id = (sd_id128_t) {};
2780 arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
2781
2782 cpu_set_done(&arg_cpu_affinity);
2783 numa_policy_reset(&arg_numa_policy);
2784
2785 arg_random_seed = mfree(arg_random_seed);
2786 arg_random_seed_size = 0;
2787 arg_clock_usec = 0;
2788
2789 arg_reload_limit_interval_sec = 0;
2790 arg_reload_limit_burst = 0;
2791 }
2792
2793 static void determine_default_oom_score_adjust(void) {
2794 int r, a, b;
2795
2796 /* Run our services at slightly higher OOM score than ourselves. But let's be conservative here, and
2797 * do this only if we don't run as root (i.e. only if we are run in user mode, for an unprivileged
2798 * user). */
2799
2800 if (arg_defaults.oom_score_adjust_set)
2801 return;
2802
2803 if (getuid() == 0)
2804 return;
2805
2806 r = get_oom_score_adjust(&a);
2807 if (r < 0)
2808 return (void) log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m");
2809
2810 assert_cc(100 <= OOM_SCORE_ADJ_MAX);
2811 b = a >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : a + 100;
2812
2813 if (a == b)
2814 return;
2815
2816 arg_defaults.oom_score_adjust = b;
2817 arg_defaults.oom_score_adjust_set = true;
2818 }
2819
2820 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
2821 const struct rlimit *saved_rlimit_memlock) {
2822 int r;
2823
2824 assert(saved_rlimit_nofile);
2825 assert(saved_rlimit_memlock);
2826
2827 /* Assign configuration defaults */
2828 reset_arguments();
2829
2830 r = parse_config_file();
2831 if (r < 0)
2832 log_warning_errno(r, "Failed to parse config file, ignoring: %m");
2833
2834 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2835 r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
2836 if (r < 0)
2837 log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2838 }
2839
2840 /* Initialize the show status setting if it hasn't been explicitly set yet */
2841 if (arg_show_status == _SHOW_STATUS_INVALID)
2842 arg_show_status = SHOW_STATUS_YES;
2843
2844 /* Push variables into the manager environment block */
2845 setenv_manager_environment();
2846
2847 /* Parse log environment variables to take into account any new environment variables.
2848 * Note that this also parses bits from the kernel command line, including "debug". */
2849 log_parse_environment();
2850
2851 /* Initialize some default rlimits for services if they haven't been configured */
2852 fallback_rlimit_nofile(saved_rlimit_nofile);
2853 fallback_rlimit_memlock(saved_rlimit_memlock);
2854
2855 /* Slightly raise the OOM score for our services if we are running for unprivileged users. */
2856 determine_default_oom_score_adjust();
2857
2858 return 0;
2859 }
2860
2861 static int safety_checks(void) {
2862
2863 if (getpid_cached() == 1 &&
2864 arg_action != ACTION_RUN)
2865 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2866 "Unsupported execution mode while PID 1.");
2867
2868 if (getpid_cached() == 1 &&
2869 arg_runtime_scope == RUNTIME_SCOPE_USER)
2870 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2871 "Can't run --user mode as PID 1.");
2872
2873 if (arg_action == ACTION_RUN &&
2874 arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
2875 getpid_cached() != 1)
2876 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2877 "Can't run system mode unless PID 1.");
2878
2879 if (arg_action == ACTION_TEST &&
2880 geteuid() == 0)
2881 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2882 "Don't run test mode as root.");
2883
2884 switch (arg_runtime_scope) {
2885
2886 case RUNTIME_SCOPE_USER:
2887
2888 if (arg_action == ACTION_RUN &&
2889 sd_booted() <= 0)
2890 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2891 "Trying to run as user instance, but the system has not been booted with systemd.");
2892
2893 if (arg_action == ACTION_RUN &&
2894 !getenv("XDG_RUNTIME_DIR"))
2895 return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
2896 "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
2897
2898 break;
2899
2900 case RUNTIME_SCOPE_SYSTEM:
2901 if (arg_action == ACTION_RUN &&
2902 running_in_chroot() > 0)
2903 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2904 "Cannot be run in a chroot() environment.");
2905 break;
2906
2907 default:
2908 assert_not_reached();
2909 }
2910
2911 return 0;
2912 }
2913
2914 static int initialize_security(
2915 bool *loaded_policy,
2916 dual_timestamp *security_start_timestamp,
2917 dual_timestamp *security_finish_timestamp,
2918 const char **ret_error_message) {
2919
2920 int r;
2921
2922 assert(loaded_policy);
2923 assert(security_start_timestamp);
2924 assert(security_finish_timestamp);
2925 assert(ret_error_message);
2926
2927 dual_timestamp_now(security_start_timestamp);
2928
2929 r = mac_selinux_setup(loaded_policy);
2930 if (r < 0) {
2931 *ret_error_message = "Failed to load SELinux policy";
2932 return r;
2933 }
2934
2935 r = mac_smack_setup(loaded_policy);
2936 if (r < 0) {
2937 *ret_error_message = "Failed to load SMACK policy";
2938 return r;
2939 }
2940
2941 r = mac_apparmor_setup();
2942 if (r < 0) {
2943 *ret_error_message = "Failed to load AppArmor policy";
2944 return r;
2945 }
2946
2947 r = ima_setup();
2948 if (r < 0) {
2949 *ret_error_message = "Failed to load IMA policy";
2950 return r;
2951 }
2952
2953 r = ipe_setup();
2954 if (r < 0) {
2955 *ret_error_message = "Failed to load IPE policy";
2956 return r;
2957 }
2958
2959 dual_timestamp_now(security_finish_timestamp);
2960 return 0;
2961 }
2962
2963 static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
2964 int r;
2965
2966 assert(ret_fds);
2967 assert(ret_error_message);
2968
2969 /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
2970 * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies
2971 * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe
2972 * mechanism to distinguish passed in fds from our own.
2973 *
2974 * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our
2975 * process behind our back. We should not take possession of that (and then accidentally close
2976 * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */
2977 r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds);
2978 if (r < 0) {
2979 *ret_error_message = "Failed to allocate fd set";
2980 return log_struct_errno(LOG_EMERG, r,
2981 LOG_MESSAGE("Failed to allocate fd set: %m"),
2982 LOG_MESSAGE_ID(SD_MESSAGE_CORE_FD_SET_FAILED_STR));
2983 }
2984
2985 /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */
2986 assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
2987
2988 return 0;
2989 }
2990
2991 static void setup_console_terminal(bool skip_setup) {
2992
2993 if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM)
2994 return;
2995
2996 /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a
2997 * controlling tty. */
2998 terminal_detach_session();
2999
3000 /* Reset the console, but only if this is really init and we are freshly booted */
3001 if (!skip_setup)
3002 (void) console_setup();
3003 }
3004
/* Quick peek at the command line to tell a re-execution from a normal boot-up; the full command line
 * parsing happens much later. Returns true if the extensive setup should be skipped, i.e. if a
 * "--deserialize" or "--deserialize=..." argument is present and "--switched-root" is not. After
 * switching root all the special setup is done anyway, even though deserialization takes place in
 * that case too. */
static bool early_skip_setup_check(int argc, char *argv[]) {
        bool reexecuting = false;

        for (int idx = 1; idx < argc; idx++) {
                const char *arg = argv[idx];

                /* A root switch always wins, regardless of where it appears on the command line */
                if (streq(arg, "--switched-root"))
                        return false;

                if (streq(arg, "--deserialize") || startswith(arg, "--deserialize="))
                        reexecuting = true;
        }

        return reexecuting;
}
3020
3021 static int save_env(void) {
3022 char **l;
3023
3024 l = strv_copy(environ);
3025 if (!l)
3026 return log_oom();
3027
3028 strv_free_and_replace(saved_env, l);
3029 return 0;
3030 }
3031
/* Entry point of the manager binary.
 *
 * In order: saves the original command line and environment, sets up logging, performs the early
 * setup that is specific to running as PID 1 (or marks itself as a user instance otherwise), parses
 * the configuration and the command line, handles the informational actions (help, version, the
 * various dumps) and, for ACTION_RUN and ACTION_TEST, allocates a Manager, starts it up and, for
 * ACTION_RUN, enters the main loop.
 *
 * All exits go through the 'finish' label, which frees the manager, attempts re-execution or the
 * shutdown binary if the main loop result asks for it, and as PID 1 prints any pending error message
 * before calling freeze_or_exit_or_reboot().
 *
 * Returns retval, which starts out as EXIT_FAILURE and is set by the informational actions, by test
 * mode, or through the pointer handed to invoke_main_loop(). */
3032 int main(int argc, char *argv[]) {
3033 dual_timestamp
3034 initrd_timestamp = DUAL_TIMESTAMP_NULL,
3035 userspace_timestamp = DUAL_TIMESTAMP_NULL,
3036 kernel_timestamp = DUAL_TIMESTAMP_NULL,
3037 security_start_timestamp = DUAL_TIMESTAMP_NULL,
3038 security_finish_timestamp = DUAL_TIMESTAMP_NULL;
3039 struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
3040 saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
3041 * in. Note we use different values
3042 * for the two that indicate whether
3043 * these fields are initialized! */
3044 bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
3045 char *switch_root_dir = NULL, *switch_root_init = NULL;
3046 usec_t before_startup, after_startup;
3047 static char systemd[] = "systemd";
3048 const char *error_message = NULL;
3049 uint64_t saved_ambient_set = 0;
3050 int r, retval = EXIT_FAILURE;
3051 Manager *m = NULL;
3052 FDSet *fds = NULL;
3053
3054 assert_se(argc > 0 && !isempty(argv[0]));
3055
3056 /* Take timestamps early on */
3057 dual_timestamp_from_monotonic(&kernel_timestamp, 0);
3058 dual_timestamp_now(&userspace_timestamp);
3059
3060 /* Figure out whether we need to initialize the system, or if we already did that because we are
3061 * reexecuting. */
3062 skip_setup = early_skip_setup_check(argc, argv);
3063
3064 /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent
3065 * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd
3066 * right-away. */
3067 program_invocation_short_name = systemd;
3068 (void) prctl(PR_SET_NAME, systemd);
3069
3070 /* Save the original command line */
3071 save_argc_argv(argc, argv);
3072
3073 /* Save the original environment as we might need to restore it if we're requested to execute another
3074 * system manager later. */
3075 r = save_env();
3076 if (r < 0) {
3077 error_message = "Failed to copy environment block";
3078 goto finish;
3079 }
3080
3081 /* Make sure that if the user says "syslog" we actually log to the journal. */
3082 log_set_upgrade_syslog_to_journal(true);
3083
3084 if (getpid_cached() == 1) {
3085 /* When we run as PID 1 force system mode */
3086 arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
3087
3088 /* Disable the umask logic */
3089 umask(0);
3090
3091 /* Make sure that at least initially we do not ever log to journald/syslogd, because it might
3092 * not be activated yet (even though the log socket for it exists). */
3093 log_set_prohibit_ipc(true);
3094
3095 /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This
3096 * is important so that we never end up logging to any foreign stderr, for example if we have
3097 * to log in a child process right before execve()'ing the actual binary, at a point in time
3098 * where socket activation stderr/stdout are already set up. */
3099 log_set_always_reopen_console(true);
3100
3101 if (detect_container() <= 0) {
3102
3103 /* Running outside of a container as PID 1 */
3104 log_set_target_and_open(LOG_TARGET_KMSG);
3105
3106 if (in_initrd())
3107 initrd_timestamp = userspace_timestamp;
3108
3109 if (!skip_setup) {
3110 r = mount_setup_early();
3111 if (r < 0) {
3112 error_message = "Failed to mount early API filesystems";
3113 goto finish;
3114 }
3115 }
3116
3117 /* We might have just mounted /proc, so let's try to parse the kernel
3118 * command line log arguments immediately. */
3119 log_parse_environment();
3120
3121 /* Let's open the log backend a second time, in case the first time didn't
3122 * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
3123 * available, and it previously wasn't. */
3124 log_open();
3125
3126 if (!skip_setup) {
3127 disable_printk_ratelimit();
3128
3129 r = initialize_security(
3130 &loaded_policy,
3131 &security_start_timestamp,
3132 &security_finish_timestamp,
3133 &error_message);
3134 if (r < 0)
3135 goto finish;
3136 }
3137
3138 r = mac_init();
3139 if (r < 0) {
3140 error_message = "Failed to initialize MAC support";
3141 goto finish;
3142 }
3143
3144 if (!skip_setup)
3145 initialize_clock_timewarp();
3146
3147 clock_apply_epoch(/* allow_backwards= */ !skip_setup);
3148
3149 /* Set the default for later on, but don't actually open the logs like this for
3150 * now. Note that if we are transitioning from the initrd there might still be
3151 * journal fd open, and we shouldn't attempt opening that before we parsed
3152 * /proc/cmdline which might redirect output elsewhere. */
3153 log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
3154
3155 } else {
3156 /* Running inside a container, as PID 1 */
3157 log_set_target_and_open(LOG_TARGET_CONSOLE);
3158
3159 /* For later on, see above... */
3160 log_set_target(LOG_TARGET_JOURNAL);
3161
3162 /* clear the kernel timestamp, because we are in a container */
3163 kernel_timestamp = DUAL_TIMESTAMP_NULL;
3164 }
3165
3166 initialize_coredump(skip_setup);
3167
3168 r = fixup_environment();
3169 if (r < 0) {
3170 log_struct_errno(LOG_EMERG, r,
3171 LOG_MESSAGE("Failed to fix up PID 1 environment: %m"),
3172 LOG_MESSAGE_ID(SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR));
3173 error_message = "Failed to fix up PID1 environment";
3174 goto finish;
3175 }
3176
3177 /* Try to figure out if we can use colors with the console. No need to do that for user
3178 * instances since they never log into the console. */
3179 log_show_color(colors_enabled());
3180
3181 r = make_null_stdio();
3182 if (r < 0)
3183 log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
3184
3185 /* Load the kernel modules early. */
3186 if (!skip_setup)
3187 (void) kmod_setup();
3188
3189 /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
3190 r = mount_setup(loaded_policy, skip_setup);
3191 if (r < 0) {
3192 error_message = "Failed to mount API filesystems";
3193 goto finish;
3194 }
3195
3196 /* The efivarfs is now mounted, let's lock down the system token. */
3197 lock_down_efi_variables();
3198 } else {
3199 /* Running as user instance */
3200 arg_runtime_scope = RUNTIME_SCOPE_USER;
3201 log_set_always_reopen_console(true);
3202 log_set_target_and_open(LOG_TARGET_AUTO);
3203
3204 /* clear the kernel timestamp, because we are not PID 1 */
3205 kernel_timestamp = DUAL_TIMESTAMP_NULL;
3206
3207 r = mac_init();
3208 if (r < 0) {
3209 error_message = "Failed to initialize MAC support";
3210 goto finish;
3211 }
3212 }
3213
3214 /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
3215 * transitioning from the initrd to the main systemd or suchlike. */
3216 save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
3217
3218 /* Reset all signal handlers. */
3219 (void) reset_all_signal_handlers();
3220 (void) ignore_signals(SIGNALS_IGNORE);
3221
/* The result is deliberately ignored: parse_configuration() only warns about parse failures. */
3222 (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
3223
3224 r = parse_argv(argc, argv);
3225 if (r < 0) {
3226 error_message = "Failed to parse command line arguments";
3227 goto finish;
3228 }
3229
3230 r = safety_checks();
3231 if (r < 0)
3232 goto finish;
3233
3234 if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
3235 pager_open(arg_pager_flags);
3236
/* Anything other than an actual manager run never performs the system setup steps below. */
3237 if (arg_action != ACTION_RUN)
3238 skip_setup = true;
3239
3240 if (arg_action == ACTION_HELP) {
3241 retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
3242 goto finish;
3243 } else if (arg_action == ACTION_VERSION) {
3244 retval = version();
3245 goto finish;
3246 } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
3247 unit_dump_config_items(stdout);
3248 retval = EXIT_SUCCESS;
3249 goto finish;
3250 } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
3251 dump_bus_properties(stdout);
3252 retval = EXIT_SUCCESS;
3253 goto finish;
3254 } else if (arg_action == ACTION_BUS_INTROSPECT) {
3255 r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
3256 retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
3257 goto finish;
3258 }
3259
3260 assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
3261
3262 /* Move out of the way, so that we won't block unmounts */
3263 assert_se(chdir("/") == 0);
3264
3265 if (arg_action == ACTION_RUN) {
3266 if (!skip_setup) {
3267 /* Apply the systemd.clock_usec= kernel command line switch */
3268 apply_clock_update();
3269
3270 /* Apply random seed from kernel command line */
3271 cmdline_take_random_seed();
3272 }
3273
3274 /* A core pattern might have been specified via the cmdline. */
3275 initialize_core_pattern(skip_setup);
3276
3277 /* Make /usr/ read-only */
3278 apply_protect_system(skip_setup);
3279
3280 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
3281 log_close();
3282
3283 /* Remember open file descriptors for later deserialization */
3284 r = collect_fds(&fds, &error_message);
3285 if (r < 0)
3286 goto finish;
3287
3288 /* Give up any control of the console, but make sure it's initialized. */
3289 setup_console_terminal(skip_setup);
3290
3291 /* Open the logging devices, if possible and necessary */
3292 log_open();
3293 }
3294
3295 log_execution_mode(&first_boot);
3296
3297 r = cg_has_legacy();
3298 if (r < 0) {
3299 error_message = "Failed to check cgroup hierarchy";
3300 goto finish;
3301 }
3302 if (r > 0) {
3303 r = log_full_errno(LOG_EMERG, SYNTHETIC_ERRNO(EPROTO),
3304 "Detected cgroup v1 hierarchy at /sys/fs/cgroup/, which is no longer supported by current version of systemd.\n"
3305 "Please instruct your initrd to mount cgroup v2 (unified) hierarchy,\n"
3306 "possibly by removing any stale kernel command line options, such as:\n"
3307 " systemd.legacy_systemd_cgroup_controller=1\n"
3308 " systemd.unified_cgroup_hierarchy=0");
3309
3310 error_message = "Detected unsupported legacy cgroup hierarchy, refusing execution";
3311 goto finish;
3312 }
3313
3314 /* Building without libmount is allowed, but if it is compiled in, then we must be able to load it */
3315 r = dlopen_libmount();
3316 if (r < 0 && !ERRNO_IS_NEG_NOT_SUPPORTED(r)) {
3317 error_message = "Failed to load libmount.so";
3318 goto finish;
3319 }
3320
3321 r = initialize_runtime(skip_setup,
3322 first_boot,
3323 &saved_rlimit_nofile,
3324 &saved_rlimit_memlock,
3325 &saved_ambient_set,
3326 &error_message);
3327 if (r < 0)
3328 goto finish;
3329
3330 r = manager_new(arg_runtime_scope,
3331 arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
3332 &m);
3333 if (r < 0) {
3334 log_struct_errno(LOG_EMERG, r,
3335 LOG_MESSAGE("Failed to allocate manager object: %m"),
3336 LOG_MESSAGE_ID(SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR));
3337 error_message = "Failed to allocate manager object";
3338 goto finish;
3339 }
3340
/* Hand the timestamps collected during early boot over to the manager object. */
3341 m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
3342 m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
3343 m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
3344 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
3345 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
3346
3347 m->saved_ambient_set = saved_ambient_set;
3348
3349 set_manager_defaults(m);
3350 set_manager_settings(m);
3351 manager_set_first_boot(m, first_boot);
3352 manager_set_switching_root(m, arg_switched_root);
3353
3354 /* Remember whether we should queue the default job */
3355 queue_default_job = !arg_serialization || arg_switched_root;
3356
3357 before_startup = now(CLOCK_MONOTONIC);
3358
3359 r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
3360 if (r < 0) {
3361 error_message = "Failed to start up manager";
3362 goto finish;
3363 }
3364
3365 /* This will close all file descriptors that were opened, but not claimed by any unit. */
3366 fds = fdset_free(fds);
3367 arg_serialization = safe_fclose(arg_serialization);
3368
3369 if (queue_default_job) {
3370 r = do_queue_default_job(m, &error_message);
3371 if (r < 0)
3372 goto finish;
3373 }
3374
3375 after_startup = now(CLOCK_MONOTONIC);
3376
3377 log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
3378 "Loaded units and determined initial transaction in %s.",
3379 FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC));
3380
3381 if (arg_action == ACTION_TEST) {
3382 manager_test_summary(m);
3383 retval = EXIT_SUCCESS;
3384 goto finish;
3385 }
3386
3387 r = invoke_main_loop(m,
3388 &saved_rlimit_nofile,
3389 &saved_rlimit_memlock,
3390 &retval,
3391 &fds,
3392 &switch_root_dir,
3393 &switch_root_init,
3394 &error_message);
3395 /* MANAGER_OK and MANAGER_RELOAD are not expected here. */
3396 assert(r < 0 || IN_SET(r, MANAGER_REEXECUTE, MANAGER_EXIT) ||
3397 (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
3398 IN_SET(r, MANAGER_REBOOT,
3399 MANAGER_SOFT_REBOOT,
3400 MANAGER_POWEROFF,
3401 MANAGER_HALT,
3402 MANAGER_KEXEC,
3403 MANAGER_SWITCH_ROOT)));
3404
/* Common exit path: reached on every early failure, after the informational actions and test mode,
 * and once the main loop has returned. */
3405 finish:
3406 pager_close();
3407
3408 if (m) {
/* Copy the watchdog timeouts out of the manager into the arg_* variables before it is freed. */
3409 arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
3410 arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
3411 m = manager_free(m);
3412 }
3413
3414 mac_selinux_finish();
3415
3416 if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
3417 r = do_reexecute(r,
3418 argc, argv,
3419 &saved_rlimit_nofile,
3420 &saved_rlimit_memlock,
3421 fds,
3422 switch_root_dir,
3423 switch_root_init,
3424 saved_ambient_set,
3425 &error_message); /* This only returns if reexecution failed */
3426
3427 arg_serialization = safe_fclose(arg_serialization);
3428 fds = fdset_free(fds);
3429
3430 saved_env = strv_free(saved_env);
3431
3432 #if HAVE_VALGRIND_VALGRIND_H
3433 /* If we are PID 1 and running under valgrind, then let's exit
3434 * here explicitly. valgrind will only generate nice output on
3435 * exit(), not on exec(), hence let's do the former not the
3436 * latter here. */
3437 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
3438 /* Cleanup watchdog_device strings for valgrind. We need them
3439 * in become_shutdown() so normally we cannot free them yet. */
3440 watchdog_free_device();
3441 reset_arguments();
3442 return retval;
3443 }
3444 #endif
3445
3446 #if HAS_FEATURE_ADDRESS_SANITIZER
3447 /* At this stage we most likely don't have stdio/stderr open, so the following
3448 * LSan check would not print any actionable information and would just crash
3449 * PID 1. To make this a bit more helpful, let's try to open /dev/console,
3450 * and if we succeed redirect LSan's report there. */
3451 if (getpid_cached() == 1) {
3452 _cleanup_close_ int tty_fd = -EBADF;
3453
3454 tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
3455 if (tty_fd >= 0)
3456 __sanitizer_set_report_fd((void*) (intptr_t) tty_fd);
3457
3458 __lsan_do_leak_check();
3459 }
3460 #endif
3461
3462 if (r < 0)
3463 (void) sd_notifyf(/* unset_environment= */ false,
3464 "ERRNO=%i", -r);
3465
3466 /* Try to invoke the shutdown binary unless we already failed.
3467 * If we failed above, we want to freeze after finishing cleanup. */
3468 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
3469 IN_SET(r, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)) {
3470 r = become_shutdown(r, retval);
3471 log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
3472 error_message = "Failed to execute shutdown binary";
3473 }
3474
3475 /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with
3476 * a mechanism to pick up systemd's exit status in the VM. */
3477 (void) sd_notifyf(/* unset_environment= */ false,
3478 "EXIT_STATUS=%i", retval);
3479
3480 watchdog_free_device();
3481 arg_watchdog_device = mfree(arg_watchdog_device);
3482
3483 if (getpid_cached() == 1) {
3484 if (error_message)
3485 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
3486 ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
3487 "%s.", error_message);
3488 freeze_or_exit_or_reboot();
3489 }
3490
3491 reset_arguments();
3492 return retval;
3493 }