]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/core/main.c
man/systemd-sysext: list ephemeral/ephemeral-import in the list of options
[thirdparty/systemd.git] / src / core / main.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <fcntl.h>
4#include <getopt.h>
5#include <linux/oom.h>
6#include <stdlib.h>
7#include <sys/mount.h>
8#include <sys/prctl.h>
9#include <sys/utsname.h>
10#include <unistd.h>
11
12#if HAVE_VALGRIND_VALGRIND_H
13# include <valgrind/valgrind.h>
14#endif
15
16#include "sd-bus.h"
17#include "sd-daemon.h"
18#include "sd-messages.h"
19
20#include "alloc-util.h"
21#include "apparmor-setup.h"
22#include "architecture.h"
23#include "argv-util.h"
24#include "build.h"
25#include "bus-error.h"
26#include "capability-util.h"
27#include "cgroup-setup.h"
28#include "chase.h"
29#include "clock-util.h"
30#include "clock-warp.h"
31#include "conf-parser.h"
32#include "confidential-virt.h"
33#include "constants.h"
34#include "copy.h"
35#include "coredump-util.h"
36#include "cpu-set-util.h"
37#include "crash-handler.h"
38#include "dbus.h"
39#include "dbus-manager.h"
40#include "dev-setup.h"
41#include "efi-random.h"
42#include "emergency-action.h"
43#include "env-util.h"
44#include "escape.h"
45#include "fd-util.h"
46#include "fdset.h"
47#include "fileio.h"
48#include "format-util.h"
49#include "getopt-defs.h"
50#include "hexdecoct.h"
51#include "hostname-setup.h"
52#include "id128-util.h"
53#include "ima-setup.h"
54#include "import-creds.h"
55#include "initrd-util.h"
56#include "io-util.h"
57#include "ipe-setup.h"
58#include "killall.h"
59#include "kmod-setup.h"
60#include "label-util.h"
61#include "limits-util.h"
62#include "load-fragment.h"
63#include "log.h"
64#include "loopback-setup.h"
65#include "machine-id-setup.h"
66#include "main.h"
67#include "manager.h"
68#include "manager-dump.h"
69#include "manager-serialize.h"
70#include "mkdir-label.h"
71#include "mount-setup.h"
72#include "mount-util.h"
73#include "os-util.h"
74#include "osc-context.h"
75#include "pager.h"
76#include "parse-argument.h"
77#include "parse-util.h"
78#include "path-util.h"
79#include "pretty-print.h"
80#include "proc-cmdline.h"
81#include "process-util.h"
82#include "random-util.h"
83#include "rlimit-util.h"
84#include "rm-rf.h"
85#include "seccomp-util.h"
86#include "selinux-setup.h"
87#include "selinux-util.h"
88#include "serialize.h"
89#include "set.h"
90#include "signal-util.h"
91#include "smack-setup.h"
92#include "special.h"
93#include "stat-util.h"
94#include "stdio-util.h"
95#include "strv.h"
96#include "switch-root.h"
97#include "sysctl-util.h"
98#include "terminal-util.h"
99#include "time-util.h"
100#include "umask-util.h"
101#include "unit-name.h"
102#include "user-util.h"
103#include "version.h"
104#include "virt.h"
105#include "watchdog.h"
106
107#if HAS_FEATURE_ADDRESS_SANITIZER
108#include <sanitizer/lsan_interface.h>
109#endif
110
/* The top-level operation this process was asked to perform. It starts out as ACTION_RUN; the other
 * enumerators are presumably selected while the command line is parsed (TODO confirm against
 * parse_argv()). */
static enum {
        ACTION_RUN,
        ACTION_HELP,
        ACTION_VERSION,
        ACTION_TEST,
        ACTION_DUMP_CONFIGURATION_ITEMS,
        ACTION_DUMP_BUS_PROPERTIES,
        ACTION_BUS_INTROSPECT,
} arg_action = ACTION_RUN;
120
121static const char *arg_bus_introspect = NULL;
122
123/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
124 * defaults are assigned in reset_arguments() below. */
125static char *arg_default_unit;
126static RuntimeScope arg_runtime_scope;
127bool arg_dump_core;
128int arg_crash_chvt;
129bool arg_crash_shell;
130CrashAction arg_crash_action;
131static char *arg_confirm_spawn;
132static ShowStatus arg_show_status;
133static StatusUnitFormat arg_status_unit_format;
134static bool arg_switched_root;
135static PagerFlags arg_pager_flags;
136static bool arg_service_watchdogs;
137static UnitDefaults arg_defaults;
138static usec_t arg_runtime_watchdog;
139static usec_t arg_reboot_watchdog;
140static usec_t arg_kexec_watchdog;
141static usec_t arg_pretimeout_watchdog;
142static char *arg_early_core_pattern;
143static char *arg_watchdog_pretimeout_governor;
144static char *arg_watchdog_device;
145static char **arg_default_environment;
146static char **arg_manager_environment;
147static uint64_t arg_capability_bounding_set;
148static bool arg_no_new_privs;
149static int arg_protect_system;
150static nsec_t arg_timer_slack_nsec;
151static Set* arg_syscall_archs;
152static FILE* arg_serialization;
153static sd_id128_t arg_machine_id;
154static bool arg_machine_id_from_firmware = false;
155static EmergencyAction arg_cad_burst_action;
156static CPUSet arg_cpu_affinity;
157static NUMAPolicy arg_numa_policy;
158static usec_t arg_clock_usec;
159static void *arg_random_seed;
160static size_t arg_random_seed_size;
161static usec_t arg_reload_limit_interval_sec;
162static unsigned arg_reload_limit_burst;
163
164/* A copy of the original environment block */
165static char **saved_env = NULL;
166
167static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
168 const struct rlimit *saved_rlimit_memlock);
169
170static DEFINE_CONFIG_PARSE_ENUM_WITH_DEFAULT(config_parse_crash_action, crash_action, CrashAction, CRASH_FREEZE);
171
172static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
173 _cleanup_free_ char *base = NULL;
174 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
175 int r;
176
177 r = xdg_user_config_dir("/systemd", &base);
178 if (r < 0)
179 return r;
180
181 r = strv_extendf(&files, "%s/user.conf", base);
182 if (r < 0)
183 return r;
184
185 r = strv_extend(&files, PKGSYSCONFDIR "/user.conf");
186 if (r < 0)
187 return r;
188
189 r = strv_consume(&dirs, TAKE_PTR(base));
190 if (r < 0)
191 return r;
192
193 r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false);
194 if (r < 0)
195 return r;
196
197 *ret_files = TAKE_PTR(files);
198 *ret_dirs = TAKE_PTR(dirs);
199 return 0;
200}
201
/* Publishes the window size of the terminal referenced by tty_fd as $COLUMNS and $LINES in our own
 * environment block.
 *
 * Returns 1 if both variables were set. In all other cases (TIOCGWINSZ failed, the terminal reports
 * neither columns nor rows, or one of the variables could not be set) both variables are removed from
 * the environment and 0 is returned. All failures are logged at debug level only; no error is ever
 * propagated to the caller. */
static int save_console_winsize_in_environment(int tty_fd) {
        int r;

        assert(tty_fd >= 0);

        struct winsize ws = {};
        if (ioctl(tty_fd, TIOCGWINSZ, &ws) < 0) {
                /* Include %m, so that the reason for the failure actually shows up in the log */
                log_debug_errno(errno, "Failed to acquire console window size, ignoring: %m");
                goto unset;
        }

        /* Only bail out if both dimensions are unset */
        if (ws.ws_col <= 0 && ws.ws_row <= 0) {
                log_debug("No console window size set, ignoring.");
                goto unset;
        }

        r = setenvf("COLUMNS", /* overwrite= */ true, "%u", ws.ws_col);
        if (r < 0) {
                log_debug_errno(r, "Failed to set $COLUMNS, ignoring: %m");
                goto unset;
        }

        r = setenvf("LINES", /* overwrite= */ true, "%u", ws.ws_row);
        if (r < 0) {
                log_debug_errno(r, "Failed to set $LINES, ignoring: %m");
                goto unset;
        }

        log_debug("Recorded console dimensions in environment: $COLUMNS=%u $LINES=%u.", ws.ws_col, ws.ws_row);
        return 1;

unset:
        /* Drop both variables together, so that never just one of the two remains set */
        (void) unsetenv("COLUMNS");
        (void) unsetenv("LINES");
        return 0;
}
238
239static int console_setup(void) {
240
241 if (getpid_cached() != 1)
242 return 0;
243
244 _cleanup_close_ int tty_fd = -EBADF;
245
246 tty_fd = open_terminal("/dev/console", O_RDWR|O_NOCTTY|O_CLOEXEC);
247 if (tty_fd < 0)
248 return log_error_errno(tty_fd, "Failed to open %s: %m", "/dev/console");
249
250 /* We don't want to force text mode. Plymouth may be showing pictures already from initrd. */
251 reset_dev_console_fd(tty_fd, /* switch_to_text= */ false);
252
253 save_console_winsize_in_environment(tty_fd);
254
255 return 0;
256}
257
258static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
259 int r;
260
261 assert(key);
262
263 if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
264
265 if (proc_cmdline_value_missing(key, value))
266 return 0;
267
268 if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
269 log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
270 else if (in_initrd() == !!startswith(key, "rd."))
271 return free_and_strdup_warn(&arg_default_unit, value);
272
273 } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
274
275 r = value ? parse_boolean(value) : true;
276 if (r < 0)
277 log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
278 else
279 arg_dump_core = r;
280
281 } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
282
283 if (proc_cmdline_value_missing(key, value))
284 return 0;
285
286 if (path_is_absolute(value))
287 (void) parse_path_argument(value, false, &arg_early_core_pattern);
288 else
289 log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
290
291 } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
292
293 if (!value)
294 arg_crash_chvt = 0; /* turn on */
295 else {
296 r = parse_crash_chvt(value, &arg_crash_chvt);
297 if (r < 0)
298 log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
299 }
300
301 } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
302
303 r = value ? parse_boolean(value) : true;
304 if (r < 0)
305 log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
306 else
307 arg_crash_shell = r;
308
309 } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
310
311 r = value ? parse_boolean(value) : true;
312 if (r < 0)
313 log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
314 else
315 arg_crash_action = r ? CRASH_REBOOT : CRASH_FREEZE;
316
317 } else if (proc_cmdline_key_streq(key, "systemd.crash_action")) {
318
319 if (proc_cmdline_value_missing(key, value))
320 return 0;
321
322 r = crash_action_from_string(value);
323 if (r < 0)
324 log_warning_errno(r, "Failed to parse crash action switch %s, ignoring: %m", value);
325 else
326 arg_crash_action = r;
327
328 } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
329 char *s;
330
331 r = parse_confirm_spawn(value, &s);
332 if (r < 0)
333 log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
334 else
335 free_and_replace(arg_confirm_spawn, s);
336
337 } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
338
339 r = value ? parse_boolean(value) : true;
340 if (r < 0)
341 log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
342 else
343 arg_service_watchdogs = r;
344
345 } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
346
347 if (value) {
348 r = parse_show_status(value, &arg_show_status);
349 if (r < 0)
350 log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
351 } else
352 arg_show_status = SHOW_STATUS_YES;
353
354 } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
355
356 if (proc_cmdline_value_missing(key, value))
357 return 0;
358
359 r = status_unit_format_from_string(value);
360 if (r < 0)
361 log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
362 else
363 arg_status_unit_format = r;
364
365 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
366
367 if (proc_cmdline_value_missing(key, value))
368 return 0;
369
370 r = exec_output_from_string(value);
371 if (r < 0)
372 log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
373 else
374 arg_defaults.std_output = r;
375
376 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
377
378 if (proc_cmdline_value_missing(key, value))
379 return 0;
380
381 r = exec_output_from_string(value);
382 if (r < 0)
383 log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
384 else
385 arg_defaults.std_error = r;
386
387 } else if (streq(key, "systemd.setenv")) {
388
389 if (proc_cmdline_value_missing(key, value))
390 return 0;
391
392 if (!env_assignment_is_valid(value))
393 log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
394 else {
395 r = strv_env_replace_strdup(&arg_default_environment, value);
396 if (r < 0)
397 return log_oom();
398 }
399
400 } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
401
402 if (proc_cmdline_value_missing(key, value))
403 return 0;
404
405 if (streq(value, "firmware"))
406 arg_machine_id_from_firmware = true;
407 else {
408 r = id128_from_string_nonzero(value, &arg_machine_id);
409 if (r < 0)
410 log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
411 else
412 arg_machine_id_from_firmware = false;
413 }
414 } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
415
416 if (proc_cmdline_value_missing(key, value))
417 return 0;
418
419 r = parse_sec(value, &arg_defaults.timeout_start_usec);
420 if (r < 0)
421 log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
422
423 if (arg_defaults.timeout_start_usec <= 0)
424 arg_defaults.timeout_start_usec = USEC_INFINITY;
425
426 } else if (proc_cmdline_key_streq(key, "systemd.default_device_timeout_sec")) {
427
428 if (proc_cmdline_value_missing(key, value))
429 return 0;
430
431 r = parse_sec(value, &arg_defaults.device_timeout_usec);
432 if (r < 0)
433 log_warning_errno(r, "Failed to parse default device timeout '%s', ignoring: %m", value);
434
435 if (arg_defaults.device_timeout_usec <= 0)
436 arg_defaults.device_timeout_usec = USEC_INFINITY;
437
438 } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
439
440 if (proc_cmdline_value_missing(key, value))
441 return 0;
442
443 r = parse_cpu_set(value, &arg_cpu_affinity);
444 if (r < 0)
445 log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
446
447 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
448
449 if (proc_cmdline_value_missing(key, value))
450 return 0;
451
452 (void) parse_path_argument(value, false, &arg_watchdog_device);
453
454 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_sec")) {
455
456 if (proc_cmdline_value_missing(key, value))
457 return 0;
458
459 if (streq(value, "default"))
460 arg_runtime_watchdog = USEC_INFINITY;
461 else if (streq(value, "off"))
462 arg_runtime_watchdog = 0;
463 else {
464 r = parse_sec(value, &arg_runtime_watchdog);
465 if (r < 0) {
466 log_warning_errno(r, "Failed to parse systemd.watchdog_sec= argument '%s', ignoring: %m", value);
467 return 0;
468 }
469 }
470
471 arg_kexec_watchdog = arg_reboot_watchdog = arg_runtime_watchdog;
472
473 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pre_sec")) {
474
475 if (proc_cmdline_value_missing(key, value))
476 return 0;
477
478 if (streq(value, "default"))
479 arg_pretimeout_watchdog = USEC_INFINITY;
480 else if (streq(value, "off"))
481 arg_pretimeout_watchdog = 0;
482 else {
483 r = parse_sec(value, &arg_pretimeout_watchdog);
484 if (r < 0) {
485 log_warning_errno(r, "Failed to parse systemd.watchdog_pre_sec= argument '%s', ignoring: %m", value);
486 return 0;
487 }
488 }
489
490 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_pretimeout_governor")) {
491
492 if (proc_cmdline_value_missing(key, value) || isempty(value)) {
493 arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
494 return 0;
495 }
496
497 if (!string_is_safe(value)) {
498 log_warning("Watchdog pretimeout governor '%s' is not valid, ignoring.", value);
499 return 0;
500 }
501
502 return free_and_strdup_warn(&arg_watchdog_pretimeout_governor, value);
503
504 } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
505
506 if (proc_cmdline_value_missing(key, value))
507 return 0;
508
509 r = safe_atou64(value, &arg_clock_usec);
510 if (r < 0)
511 log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
512
513 } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
514 void *p;
515 size_t sz;
516
517 if (proc_cmdline_value_missing(key, value))
518 return 0;
519
520 r = unbase64mem(value, &p, &sz);
521 if (r < 0)
522 log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
523
524 free(arg_random_seed);
525 arg_random_seed = sz > 0 ? p : mfree(p);
526 arg_random_seed_size = sz;
527
528 } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_interval_sec")) {
529
530 if (proc_cmdline_value_missing(key, value))
531 return 0;
532
533 r = parse_sec(value, &arg_reload_limit_interval_sec);
534 if (r < 0) {
535 log_warning_errno(r, "Failed to parse systemd.reload_limit_interval_sec= argument '%s', ignoring: %m", value);
536 return 0;
537 }
538
539 } else if (proc_cmdline_key_streq(key, "systemd.reload_limit_burst")) {
540
541 if (proc_cmdline_value_missing(key, value))
542 return 0;
543
544 r = safe_atou(value, &arg_reload_limit_burst);
545 if (r < 0) {
546 log_warning_errno(r, "Failed to parse systemd.reload_limit_burst= argument '%s', ignoring: %m", value);
547 return 0;
548 }
549
550 } else if (streq(key, "quiet") && !value) {
551
552 if (arg_show_status == _SHOW_STATUS_INVALID)
553 arg_show_status = SHOW_STATUS_ERROR;
554
555 } else if (streq(key, "debug") && !value) {
556
557 /* Note that log_parse_environment() handles 'debug'
558 * too, and sets the log level to LOG_DEBUG. */
559
560 if (detect_container() > 0)
561 log_set_target(LOG_TARGET_CONSOLE);
562
563 } else if (!value) {
564 const char *target;
565
566 /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
567 target = runlevel_to_target(key);
568 if (target)
569 return free_and_strdup_warn(&arg_default_unit, target);
570 }
571
572 return 0;
573}
574
575#define DEFINE_SETTER(name, func, descr) \
576 static int name(const char *unit, \
577 const char *filename, \
578 unsigned line, \
579 const char *section, \
580 unsigned section_line, \
581 const char *lvalue, \
582 int ltype, \
583 const char *rvalue, \
584 void *data, \
585 void *userdata) { \
586 \
587 int r; \
588 \
589 assert(filename); \
590 assert(lvalue); \
591 assert(rvalue); \
592 \
593 r = func(rvalue); \
594 if (r < 0) \
595 log_syntax(unit, LOG_ERR, filename, line, r, \
596 "Invalid " descr "'%s': %m", \
597 rvalue); \
598 \
599 return 0; \
600 }
601
602DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
603DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
604DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
605DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
606DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
607
608static int config_parse_default_timeout_abort(
609 const char *unit,
610 const char *filename,
611 unsigned line,
612 const char *section,
613 unsigned section_line,
614 const char *lvalue,
615 int ltype,
616 const char *rvalue,
617 void *data,
618 void *userdata) {
619 int r;
620
621 r = config_parse_timeout_abort(
622 unit,
623 filename,
624 line,
625 section,
626 section_line,
627 lvalue,
628 ltype,
629 rvalue,
630 &arg_defaults.timeout_abort_usec,
631 userdata);
632 if (r >= 0)
633 arg_defaults.timeout_abort_set = r;
634 return 0;
635}
636
637static int config_parse_oom_score_adjust(
638 const char *unit,
639 const char *filename,
640 unsigned line,
641 const char *section,
642 unsigned section_line,
643 const char *lvalue,
644 int ltype,
645 const char *rvalue,
646 void *data,
647 void *userdata) {
648
649 int oa, r;
650
651 if (isempty(rvalue)) {
652 arg_defaults.oom_score_adjust_set = false;
653 return 0;
654 }
655
656 r = parse_oom_score_adjust(rvalue, &oa);
657 if (r < 0)
658 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
659
660 arg_defaults.oom_score_adjust = oa;
661 arg_defaults.oom_score_adjust_set = true;
662
663 return 0;
664}
665
/* Config parser for the manager's own ProtectSystem= setting. 'data' must point to an int, which is set
 * to -1 for an empty value or "auto", and to the parsed boolean (0 or 1) otherwise. Returns 0 on
 * success, and whatever log_syntax_parse_error() returns if the value is not a valid boolean. */
static int config_parse_protect_system_pid1(
                const char *unit,
                const char *filename,
                unsigned line,
                const char *section,
                unsigned section_line,
                const char *lvalue,
                int ltype,
                const char *rvalue,
                void *data,
                void *userdata) {

        int *protect = ASSERT_PTR(data);
        int b;

        /* This follows the per-service ProtectSystem= setting, but is more restricted in one way and
         * more automatic in another: only yes/no are currently understood (no "strict" or "full"), and
         * unless configured otherwise this gets enabled automatically for the initrd.
         *
         * Matching the per-service setting more closely might come later, but is not trivial because
         * of ordering constraints: at the time this logic is enabled not much besides /usr/ is mounted
         * yet. */

        bool automatic = isempty(rvalue) || streq(rvalue, "auto");
        if (automatic) {
                *protect = -1;
                return 0;
        }

        b = parse_boolean(rvalue);
        if (b < 0)
                return log_syntax_parse_error(unit, filename, line, b, lvalue, rvalue);

        *protect = b;
        return 0;
}
700
701static int config_parse_crash_reboot(
702 const char *unit,
703 const char *filename,
704 unsigned line,
705 const char *section,
706 unsigned section_line,
707 const char *lvalue,
708 int ltype,
709 const char *rvalue,
710 void *data,
711 void *userdata) {
712
713 CrashAction *v = ASSERT_PTR(data);
714 int r;
715
716 if (isempty(rvalue)) {
717 *v = CRASH_REBOOT;
718 return 0;
719 }
720
721 r = parse_boolean(rvalue);
722 if (r < 0)
723 return log_syntax_parse_error(unit, filename, line, r, lvalue, rvalue);
724
725 *v = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
726 return 0;
727}
728
729static int parse_config_file(void) {
730 const ConfigTableItem items[] = {
731 { "Manager", "LogLevel", config_parse_level2, 0, NULL },
732 { "Manager", "LogTarget", config_parse_target, 0, NULL },
733 { "Manager", "LogColor", config_parse_color, 0, NULL },
734 { "Manager", "LogLocation", config_parse_location, 0, NULL },
735 { "Manager", "LogTime", config_parse_time, 0, NULL },
736 { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
737 { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
738 { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
739 { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
740 { "Manager", "CrashReboot", config_parse_crash_reboot, 0, &arg_crash_action },
741 { "Manager", "CrashAction", config_parse_crash_action, 0, &arg_crash_action },
742 { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
743 { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
744 { "Manager", "CPUAffinity", config_parse_cpu_set, 0, &arg_cpu_affinity },
745 { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
746 { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy.nodes },
747 { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_LEGACY, NULL },
748 { "Manager", "RuntimeWatchdogSec", config_parse_watchdog_sec, 0, &arg_runtime_watchdog },
749 { "Manager", "RuntimeWatchdogPreSec", config_parse_watchdog_sec, 0, &arg_pretimeout_watchdog },
750 { "Manager", "RebootWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog },
751 { "Manager", "ShutdownWatchdogSec", config_parse_watchdog_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
752 { "Manager", "KExecWatchdogSec", config_parse_watchdog_sec, 0, &arg_kexec_watchdog },
753 { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
754 { "Manager", "RuntimeWatchdogPreGovernor", config_parse_string, CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
755 { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
756 { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
757 { "Manager", "ProtectSystem", config_parse_protect_system_pid1, 0, &arg_protect_system },
758#if HAVE_SECCOMP
759 { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
760#else
761 { "Manager", "SystemCallArchitectures", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
762
763#endif
764 { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
765 { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_defaults.timer_accuracy_usec },
766 { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_defaults.std_output },
767 { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_defaults.std_error },
768 { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_defaults.timeout_start_usec },
769 { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_defaults.timeout_stop_usec },
770 { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
771 { "Manager", "DefaultDeviceTimeoutSec", config_parse_sec, 0, &arg_defaults.device_timeout_usec },
772 { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_defaults.restart_usec },
773 { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_defaults.start_limit.interval}, /* obsolete alias */
774 { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_defaults.start_limit.interval},
775 { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_defaults.start_limit.burst },
776 { "Manager", "DefaultEnvironment", config_parse_environ, arg_runtime_scope, &arg_default_environment },
777 { "Manager", "ManagerEnvironment", config_parse_environ, arg_runtime_scope, &arg_manager_environment },
778 { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_defaults.rlimit },
779 { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_defaults.rlimit },
780 { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_defaults.rlimit },
781 { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_defaults.rlimit },
782 { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_defaults.rlimit },
783 { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_defaults.rlimit },
784 { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_defaults.rlimit },
785 { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_defaults.rlimit },
786 { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_defaults.rlimit },
787 { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_defaults.rlimit },
788 { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_defaults.rlimit },
789 { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_defaults.rlimit },
790 { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_defaults.rlimit },
791 { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_defaults.rlimit },
792 { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_defaults.rlimit },
793 { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_defaults.rlimit },
794 { "Manager", "DefaultCPUAccounting", config_parse_warn_compat, DISABLED_LEGACY, NULL },
795 { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_defaults.io_accounting },
796 { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_defaults.ip_accounting },
797 { "Manager", "DefaultBlockIOAccounting", config_parse_warn_compat, DISABLED_LEGACY, NULL },
798 { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_defaults.memory_accounting },
799 { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_defaults.tasks_accounting },
800 { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_defaults.tasks_max },
801 { "Manager", "DefaultMemoryPressureThresholdSec", config_parse_sec, 0, &arg_defaults.memory_pressure_threshold_usec },
802 { "Manager", "DefaultMemoryPressureWatch", config_parse_memory_pressure_watch, 0, &arg_defaults.memory_pressure_watch },
803 { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, arg_runtime_scope, &arg_cad_burst_action },
804 { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_defaults.oom_policy },
805 { "Manager", "DefaultOOMScoreAdjust", config_parse_oom_score_adjust, 0, NULL },
806 { "Manager", "ReloadLimitIntervalSec", config_parse_sec, 0, &arg_reload_limit_interval_sec },
807 { "Manager", "ReloadLimitBurst", config_parse_unsigned, 0, &arg_reload_limit_burst },
808#if ENABLE_SMACK
809 { "Manager", "DefaultSmackProcessLabel", config_parse_string, 0, &arg_defaults.smack_process_label },
810#else
811 { "Manager", "DefaultSmackProcessLabel", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
812#endif
813 {}
814 };
815
816 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM)
817 (void) config_parse_standard_file_with_dropins(
818 "systemd/system.conf",
819 "Manager\0",
820 config_item_table_lookup, items,
821 CONFIG_PARSE_WARN,
822 /* userdata= */ NULL);
823 else {
824 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
825 int r;
826
827 assert(arg_runtime_scope == RUNTIME_SCOPE_USER);
828
829 r = manager_find_user_config_paths(&files, &dirs);
830 if (r < 0)
831 return log_error_errno(r, "Failed to determine config file paths: %m");
832
833 (void) config_parse_many(
834 (const char* const*) files,
835 (const char* const*) dirs,
836 "user.conf.d",
837 /* root = */ NULL,
838 "Manager\0",
839 config_item_table_lookup, items,
840 CONFIG_PARSE_WARN,
841 NULL, NULL, NULL);
842 }
843
844 /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
845 * USEC_INFINITY like everywhere else. */
846 if (arg_defaults.timeout_start_usec <= 0)
847 arg_defaults.timeout_start_usec = USEC_INFINITY;
848 if (arg_defaults.timeout_stop_usec <= 0)
849 arg_defaults.timeout_stop_usec = USEC_INFINITY;
850
851 return 0;
852}
853
854static void set_manager_defaults(Manager *m) {
855 int r;
856
857 assert(m);
858
859 /* Propagates the various default unit property settings into the manager object, i.e. properties
860 * that do not affect the manager itself, but are just what newly allocated units will have set if
861 * they haven't set anything else. (Also see set_manager_settings() for the settings that affect the
862 * manager's own behaviour) */
863
864 r = manager_set_unit_defaults(m, &arg_defaults);
865 if (r < 0)
866 log_warning_errno(r, "Failed to set manager defaults, ignoring: %m");
867
868 r = manager_default_environment(m);
869 if (r < 0)
870 log_warning_errno(r, "Failed to set manager default environment, ignoring: %m");
871
872 r = manager_transient_environment_add(m, arg_default_environment);
873 if (r < 0)
874 log_warning_errno(r, "Failed to add to transient environment, ignoring: %m");
875}
876
877static void set_manager_settings(Manager *m) {
878 int r;
879
880 assert(m);
881
882 /* Propagates the various manager settings into the manager object, i.e. properties that
883 * affect the manager itself (as opposed to just being inherited into newly allocated
884 * units, see set_manager_defaults() above). */
885
886 m->confirm_spawn = arg_confirm_spawn;
887 m->service_watchdogs = arg_service_watchdogs;
888 m->cad_burst_action = arg_cad_burst_action;
889 /* Note that we don't do structured initialization here, otherwise it will reset the rate limit
890 * counter on every daemon-reload. */
891 m->reload_reexec_ratelimit.interval = arg_reload_limit_interval_sec;
892 m->reload_reexec_ratelimit.burst = arg_reload_limit_burst;
893
894 manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
895 manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
896 manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
897 manager_set_watchdog(m, WATCHDOG_PRETIMEOUT, arg_pretimeout_watchdog);
898 r = manager_set_watchdog_pretimeout_governor(m, arg_watchdog_pretimeout_governor);
899 if (r < 0)
900 log_warning_errno(r, "Failed to set watchdog pretimeout governor to '%s', ignoring: %m", arg_watchdog_pretimeout_governor);
901
902 manager_set_show_status(m, arg_show_status, "command line");
903 m->status_unit_format = arg_status_unit_format;
904}
905
906static int parse_argv(int argc, char *argv[]) {
907 enum {
908 COMMON_GETOPT_ARGS,
909 SYSTEMD_GETOPT_ARGS,
910 };
911
912 static const struct option options[] = {
913 COMMON_GETOPT_OPTIONS,
914 SYSTEMD_GETOPT_OPTIONS,
915 {}
916 };
917
918 int c, r;
919 bool user_arg_seen = false;
920
921 assert(argc >= 1);
922 assert(argv);
923
924 if (getpid_cached() == 1)
925 opterr = 0;
926
927 while ((c = getopt_long(argc, argv, SYSTEMD_GETOPT_SHORT_OPTIONS, options, NULL)) >= 0)
928
929 switch (c) {
930
931 case ARG_LOG_LEVEL:
932 r = log_set_max_level_from_string(optarg);
933 if (r < 0)
934 return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
935
936 break;
937
938 case ARG_LOG_TARGET:
939 r = log_set_target_from_string(optarg);
940 if (r < 0)
941 return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
942
943 break;
944
945 case ARG_LOG_COLOR:
946
947 if (optarg) {
948 r = log_show_color_from_string(optarg);
949 if (r < 0)
950 return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
951 optarg);
952 } else
953 log_show_color(true);
954
955 break;
956
957 case ARG_LOG_LOCATION:
958 if (optarg) {
959 r = log_show_location_from_string(optarg);
960 if (r < 0)
961 return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
962 optarg);
963 } else
964 log_show_location(true);
965
966 break;
967
968 case ARG_LOG_TIME:
969
970 if (optarg) {
971 r = log_show_time_from_string(optarg);
972 if (r < 0)
973 return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
974 optarg);
975 } else
976 log_show_time(true);
977
978 break;
979
980 case ARG_DEFAULT_STD_OUTPUT:
981 r = exec_output_from_string(optarg);
982 if (r < 0)
983 return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
984 optarg);
985 arg_defaults.std_output = r;
986 break;
987
988 case ARG_DEFAULT_STD_ERROR:
989 r = exec_output_from_string(optarg);
990 if (r < 0)
991 return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
992 optarg);
993 arg_defaults.std_error = r;
994 break;
995
996 case ARG_UNIT:
997 r = free_and_strdup(&arg_default_unit, optarg);
998 if (r < 0)
999 return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
1000
1001 break;
1002
1003 case ARG_SYSTEM:
1004 arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
1005 break;
1006
1007 case ARG_USER:
1008 arg_runtime_scope = RUNTIME_SCOPE_USER;
1009 user_arg_seen = true;
1010 break;
1011
1012 case ARG_TEST:
1013 arg_action = ACTION_TEST;
1014 break;
1015
1016 case ARG_NO_PAGER:
1017 arg_pager_flags |= PAGER_DISABLE;
1018 break;
1019
1020 case ARG_VERSION:
1021 arg_action = ACTION_VERSION;
1022 break;
1023
1024 case ARG_DUMP_CONFIGURATION_ITEMS:
1025 arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
1026 break;
1027
1028 case ARG_DUMP_BUS_PROPERTIES:
1029 arg_action = ACTION_DUMP_BUS_PROPERTIES;
1030 break;
1031
1032 case ARG_BUS_INTROSPECT:
1033 arg_bus_introspect = optarg;
1034 arg_action = ACTION_BUS_INTROSPECT;
1035 break;
1036
1037 case ARG_DUMP_CORE:
1038 r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
1039 if (r < 0)
1040 return r;
1041 break;
1042
1043 case ARG_CRASH_CHVT:
1044 r = parse_crash_chvt(optarg, &arg_crash_chvt);
1045 if (r < 0)
1046 return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
1047 optarg);
1048 break;
1049
1050 case ARG_CRASH_SHELL:
1051 r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
1052 if (r < 0)
1053 return r;
1054 break;
1055
1056 case ARG_CRASH_REBOOT:
1057 r = parse_boolean_argument("--crash-reboot", optarg, NULL);
1058 if (r < 0)
1059 return r;
1060 arg_crash_action = r > 0 ? CRASH_REBOOT : CRASH_FREEZE;
1061 break;
1062
1063 case ARG_CRASH_ACTION:
1064 r = crash_action_from_string(optarg);
1065 if (r < 0)
1066 return log_error_errno(r, "Failed to parse crash action \"%s\": %m", optarg);
1067 arg_crash_action = r;
1068 break;
1069
1070 case ARG_CONFIRM_SPAWN:
1071 arg_confirm_spawn = mfree(arg_confirm_spawn);
1072
1073 r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
1074 if (r < 0)
1075 return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
1076 optarg);
1077 break;
1078
1079 case ARG_SERVICE_WATCHDOGS:
1080 r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
1081 if (r < 0)
1082 return r;
1083 break;
1084
1085 case ARG_SHOW_STATUS:
1086 if (optarg) {
1087 r = parse_show_status(optarg, &arg_show_status);
1088 if (r < 0)
1089 return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
1090 optarg);
1091 } else
1092 arg_show_status = SHOW_STATUS_YES;
1093 break;
1094
1095 case ARG_DESERIALIZE: {
1096 int fd;
1097 FILE *f;
1098
1099 fd = parse_fd(optarg);
1100 if (fd < 0)
1101 return log_error_errno(fd, "Failed to parse serialization fd \"%s\": %m", optarg);
1102
1103 (void) fd_cloexec(fd, true);
1104
1105 f = fdopen(fd, "r");
1106 if (!f)
1107 return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
1108
1109 safe_fclose(arg_serialization);
1110 arg_serialization = f;
1111
1112 break;
1113 }
1114
1115 case ARG_SWITCHED_ROOT:
1116 arg_switched_root = true;
1117 break;
1118
1119 case ARG_MACHINE_ID:
1120 r = id128_from_string_nonzero(optarg, &arg_machine_id);
1121 if (r < 0)
1122 return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
1123 break;
1124
1125 case 'h':
1126 arg_action = ACTION_HELP;
1127 break;
1128
1129 case 'D':
1130 log_set_max_level(LOG_DEBUG);
1131 break;
1132
1133 case 'b':
1134 case 's':
1135 case 'z':
1136 /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
1137 * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
1138 */
1139 case '?':
1140 if (getpid_cached() != 1)
1141 return -EINVAL;
1142 else
1143 return 0;
1144
1145 default:
1146 assert_not_reached();
1147 }
1148
1149 if (optind < argc && getpid_cached() != 1)
1150 /* Hmm, when we aren't run as init system let's complain about excess arguments */
1151 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
1152
1153 if (arg_action == ACTION_RUN && arg_runtime_scope == RUNTIME_SCOPE_USER && !user_arg_seen)
1154 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1155 "Explicit --user argument required to run as user manager.");
1156
1157 return 0;
1158}
1159
1160static int help(void) {
1161 _cleanup_free_ char *link = NULL;
1162 int r;
1163
1164 r = terminal_urlify_man("systemd", "1", &link);
1165 if (r < 0)
1166 return log_oom();
1167
1168 printf("%s [OPTIONS...]\n\n"
1169 "%sStarts and monitors system and user services.%s\n\n"
1170 "This program takes no positional arguments.\n\n"
1171 "%sOptions%s:\n"
1172 " -h --help Show this help\n"
1173 " --version Show version\n"
1174 " --test Determine initial transaction, dump it and exit\n"
1175 " --system Combined with --test: operate in system mode\n"
1176 " --user Combined with --test: operate in user mode\n"
1177 " --dump-configuration-items Dump understood unit configuration items\n"
1178 " --dump-bus-properties Dump exposed bus properties\n"
1179 " --bus-introspect=PATH Write XML introspection data\n"
1180 " --unit=UNIT Set default unit\n"
1181 " --dump-core[=BOOL] Dump core on crash\n"
1182 " --crash-vt=NR Change to specified VT on crash\n"
1183 " --crash-action=ACTION Specify what to do on crash\n"
1184 " --crash-shell[=BOOL] Run shell on crash\n"
1185 " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
1186 " --show-status[=BOOL] Show status updates on the console during boot\n"
1187 " --log-target=TARGET Set log target (console, journal, kmsg,\n"
1188 " journal-or-kmsg, null)\n"
1189 " --log-level=LEVEL Set log level (debug, info, notice, warning,\n"
1190 " err, crit, alert, emerg)\n"
1191 " --log-color[=BOOL] Highlight important log messages\n"
1192 " --log-location[=BOOL] Include code location in log messages\n"
1193 " --log-time[=BOOL] Prefix log messages with current time\n"
1194 " --default-standard-output= Set default standard output for services\n"
1195 " --default-standard-error= Set default standard error output for services\n"
1196 " --no-pager Do not pipe output into a pager\n"
1197 "\nSee the %s for details.\n",
1198 program_invocation_short_name,
1199 ansi_highlight(),
1200 ansi_normal(),
1201 ansi_underline(),
1202 ansi_normal(),
1203 link);
1204
1205 return 0;
1206}
1207
1208static int prepare_reexecute(
1209 Manager *m,
1210 FILE **ret_f,
1211 FDSet **ret_fds,
1212 bool switching_root) {
1213
1214 _cleanup_fdset_free_ FDSet *fds = NULL;
1215 _cleanup_fclose_ FILE *f = NULL;
1216 int r;
1217
1218 assert(m);
1219 assert(ret_f);
1220 assert(ret_fds);
1221
1222 /* Make sure nothing is really destructed when we shut down */
1223 m->n_reloading++;
1224 bus_manager_send_reloading(m, true);
1225
1226 r = manager_open_serialization(m, &f);
1227 if (r < 0)
1228 return log_error_errno(r, "Failed to create serialization file: %m");
1229
1230 fds = fdset_new();
1231 if (!fds)
1232 return log_oom();
1233
1234 r = manager_serialize(m, f, fds, switching_root);
1235 if (r < 0)
1236 return r;
1237
1238 r = finish_serialization_file(f);
1239 if (r < 0)
1240 return log_error_errno(r, "Failed to finish serialization file: %m");
1241
1242 r = fd_cloexec(fileno(f), false);
1243 if (r < 0)
1244 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
1245
1246 r = fdset_cloexec(fds, false);
1247 if (r < 0)
1248 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
1249
1250 *ret_f = TAKE_PTR(f);
1251 *ret_fds = TAKE_PTR(fds);
1252
1253 return 0;
1254}
1255
static void bump_file_max_and_nr_open(void) {

        /* Raises fs.file-max and fs.nr_open as far as possible. On current kernels large numbers of file
         * descriptors are no longer a performance problem and their memory is properly tracked by memcg,
         * so counting and limiting them in two more layers is unnecessary and only complicates things.
         * This turns off 2 of the 4 levels of limits on file descriptors, so that RLIMIT_NOFILE (soft +
         * hard) are the only ones that really matter. */

#if BUMP_PROC_SYS_FS_FILE_MAX
        /* Since kernel 5.2 the maximum accepted here is LONG_MAX, hence use that. (Previously things were
         * different, but the operation would fail silently.) */
        int r = sysctl_write("fs/file-max", LONG_MAX_STR);
        if (r < 0)
                log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING,
                               r, "Failed to bump fs.file-max, ignoring: %m");
#endif

#if BUMP_PROC_SYS_FS_NR_OPEN
        /* The kernel enforces minimum and maximum values on fs.nr_open, but we don't really know what
         * they are. The expression that determines the maximum depends on the architecture and on
         * implementation details of the kernel, so it is not something we want to copy to userspace.
         * Since the kernel doesn't expose the maximum to us, we can only try and hope: start with INT_MAX
         * and keep halving the value until we find one that works. Ugly? Yes, absolutely, but kernel APIs
         * are kernel APIs, so what can we do... */

        for (int candidate = INT_MAX;; candidate /= 2) {
                int current, q;

                /* Round down to next multiple of the pointer size */
                candidate &= ~(__SIZEOF_POINTER__ - 1);
                if (candidate < 1024) {
                        log_warning("Can't bump fs.nr_open, value too small.");
                        break;
                }

                current = read_nr_open();
                if (current < 0) {
                        log_error_errno(current, "Failed to read fs.nr_open: %m");
                        break;
                }
                if (current >= candidate) { /* Already larger */
                        log_debug("Skipping bump, value is already larger.");
                        break;
                }

                q = sysctl_writef("fs/nr_open", "%i", candidate);
                if (q == -EINVAL) {
                        /* Rejected as out of range: retry with half the value (see loop header). */
                        log_debug("Couldn't write fs.nr_open as %i, halving it.", candidate);
                        continue;
                }

                if (q < 0)
                        log_full_errno(IN_SET(q, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, q, "Failed to bump fs.nr_open, ignoring: %m");
                else
                        log_debug("Successfully bumped fs.nr_open to %i", candidate);

                break;
        }
#endif
}
1322
static int bump_rlimit_nofile(const struct rlimit *saved_rlimit) {
        struct rlimit new_rlimit;
        int r, nr;

        /* Raises RLIMIT_NOFILE (soft and hard) for ourselves up to fs.nr_open, without ever lowering the
         * limits passed in saved_rlimit. Returns 0 on success or if there is nothing to do, and a
         * negative errno-style value (after logging a warning) on failure. */

        /* Get the underlying absolute limit the kernel enforces */
        nr = read_nr_open();
        if (nr < 0)
                /* Never feed an error code into the unsigned rlim_t arithmetic below, it would turn
                 * into a huge bogus limit. */
                return log_warning_errno(nr, "Failed to read fs.nr_open, not bumping RLIMIT_NOFILE, ignoring: %m");

        /* Calculate the new limits to use for us. Never lower from what we inherited. */
        new_rlimit = (struct rlimit) {
                .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur),
                .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max),
        };

        /* Shortcut if nothing changes. */
        if (saved_rlimit->rlim_max >= new_rlimit.rlim_max &&
            saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) {
                log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
                return 0;
        }

        /* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel
         * allows, for both hard and soft. */
        r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit);
        if (r < 0)
                return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");

        return 0;
}
1351
1352static int bump_rlimit_memlock(const struct rlimit *saved_rlimit) {
1353 struct rlimit new_rlimit;
1354 uint64_t mm;
1355 int r;
1356
1357 /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK
1358 * which should normally disable such checks. We need them to implement IPAddressAllow= and
1359 * IPAddressDeny=, hence let's bump the value high enough for our user. */
1360
1361 /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
1362 * must be unsigned, hence this is a given, but let's make this clear here. */
1363 assert_cc(RLIM_INFINITY > 0);
1364
1365 mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of
1366 * physical RAM. We allow an eighth to be locked by us, just to
1367 * pick a value. */
1368
1369 new_rlimit = (struct rlimit) {
1370 .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
1371 .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
1372 };
1373
1374 if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
1375 saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
1376 log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
1377 return 0;
1378 }
1379
1380 r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
1381 if (r < 0)
1382 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
1383
1384 return 0;
1385}
1386
1387static int enforce_syscall_archs(Set *archs) {
1388#if HAVE_SECCOMP
1389 int r;
1390
1391 if (!is_seccomp_available())
1392 return 0;
1393
1394 r = seccomp_restrict_archs(arg_syscall_archs);
1395 if (r < 0)
1396 return log_error_errno(r, "Failed to enforce system call architecture restriction: %m");
1397#endif
1398 return 0;
1399}
1400
1401static int os_release_status(void) {
1402 _cleanup_free_ char *pretty_name = NULL, *name = NULL, *version = NULL,
1403 *ansi_color = NULL, *support_end = NULL;
1404 int r;
1405
1406 r = parse_os_release(NULL,
1407 "PRETTY_NAME", &pretty_name,
1408 "NAME", &name,
1409 "VERSION", &version,
1410 "ANSI_COLOR", &ansi_color,
1411 "SUPPORT_END", &support_end);
1412 if (r < 0)
1413 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1414 "Failed to read os-release file, ignoring: %m");
1415
1416 const char *label = os_release_pretty_name(pretty_name, name);
1417 const char *color = empty_to_null(ansi_color) ?: "1";
1418
1419 if (show_status_on(arg_show_status)) {
1420 if (in_initrd()) {
1421 if (log_get_show_color())
1422 status_printf(NULL, 0,
1423 ANSI_HIGHLIGHT "Booting initrd of " ANSI_NORMAL "\x1B[%sm%s" ANSI_NORMAL ANSI_HIGHLIGHT "." ANSI_NORMAL,
1424 color, label);
1425 else
1426 status_printf(NULL, 0,
1427 "Booting initrd of %s...", label);
1428 } else {
1429 if (log_get_show_color())
1430 status_printf(NULL, 0,
1431 "\n" ANSI_HIGHLIGHT "Welcome to " ANSI_NORMAL "\x1B[%sm%s" ANSI_NORMAL ANSI_HIGHLIGHT "!" ANSI_NORMAL "\n",
1432 color, label);
1433 else
1434 status_printf(NULL, 0,
1435 "\nWelcome to %s!\n",
1436 label);
1437 }
1438 }
1439
1440 if (support_end && os_release_support_ended(support_end, /* quiet */ false, NULL) > 0)
1441 /* pretty_name may include the version already, so we'll print the version only if we
1442 * have it and we're not using pretty_name. */
1443 status_printf(ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL, 0,
1444 "This OS version (%s%s%s) is past its end-of-support date (%s)",
1445 label,
1446 (pretty_name || !version) ? "" : " version ",
1447 (pretty_name || !version) ? "" : version,
1448 support_end);
1449
1450 return 0;
1451}
1452
1453static int setup_os_release(RuntimeScope scope) {
1454 char os_release_dst[STRLEN("/run/user//systemd/propagate/.os-release-stage/os-release") + DECIMAL_STR_MAX(uid_t)] =
1455 "/run/systemd/propagate/.os-release-stage/os-release";
1456 const char *os_release_src = "/etc/os-release";
1457 int r;
1458
1459 assert(IN_SET(scope, RUNTIME_SCOPE_SYSTEM, RUNTIME_SCOPE_USER));
1460
1461 if (access("/etc/os-release", F_OK) < 0) {
1462 if (errno != ENOENT)
1463 log_debug_errno(errno, "Failed to check if /etc/os-release exists, ignoring: %m");
1464
1465 os_release_src = "/usr/lib/os-release";
1466 }
1467
1468 if (scope == RUNTIME_SCOPE_USER)
1469 xsprintf(os_release_dst, "/run/user/" UID_FMT "/systemd/propagate/.os-release-stage/os-release", geteuid());
1470
1471 r = mkdir_parents_label(os_release_dst, 0755);
1472 if (r < 0)
1473 return log_debug_errno(r, "Failed to create parent directory of '%s', ignoring: %m", os_release_dst);
1474
1475 r = copy_file_atomic(os_release_src, os_release_dst, 0644, COPY_MAC_CREATE|COPY_REPLACE);
1476 if (r < 0)
1477 return log_debug_errno(r, "Failed to copy '%s' to '%s', ignoring: %m",
1478 os_release_src, os_release_dst);
1479
1480 return 0;
1481}
1482
1483static int write_container_id(void) {
1484 const char *c;
1485 int r = 0; /* avoid false maybe-uninitialized warning */
1486
1487 c = getenv("container");
1488 if (isempty(c))
1489 return 0;
1490
1491 WITH_UMASK(0022)
1492 r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
1493 if (r < 0)
1494 return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
1495
1496 return 1;
1497}
1498
1499static int write_boot_or_shutdown_osc(const char *type) {
1500 int r;
1501
1502 assert(STRPTR_IN_SET(type, "boot", "shutdown"));
1503
1504 if (getenv_terminal_is_dumb())
1505 return 0;
1506
1507 _cleanup_close_ int fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
1508 if (fd < 0)
1509 return log_debug_errno(fd, "Failed to open /dev/console to print %s OSC, ignoring: %m", type);
1510
1511 _cleanup_free_ char *seq = NULL;
1512 if (streq(type, "boot"))
1513 r = osc_context_open_boot(&seq);
1514 else
1515 r = osc_context_close(SD_ID128_ALLF, &seq);
1516 if (r < 0)
1517 return log_debug_errno(r, "Failed to acquire %s OSC sequence, ignoring: %m", type);
1518
1519 r = loop_write(fd, seq, SIZE_MAX);
1520 if (r < 0)
1521 return log_debug_errno(r, "Failed to write %s OSC sequence, ignoring: %m", type);
1522
1523 if (DEBUG_LOGGING) {
1524 _cleanup_free_ char *h = cescape(seq);
1525 log_debug("OSC sequence for %s successfully written: %s", type, strna(h));
1526 }
1527
1528 return 0;
1529}
1530
1531static int bump_unix_max_dgram_qlen(void) {
1532 _cleanup_free_ char *qlen = NULL;
1533 unsigned long v;
1534 int r;
1535
1536 /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set
1537 * the value really really early during boot, so that it is actually applied to all our sockets,
1538 * including the $NOTIFY_SOCKET one. */
1539
1540 r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
1541 if (r < 0)
1542 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1543 "Failed to read AF_UNIX datagram queue length, ignoring: %m");
1544
1545 r = safe_atolu(qlen, &v);
1546 if (r < 0)
1547 return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
1548
1549 if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
1550 return 0;
1551
1552 r = sysctl_write("net/unix/max_dgram_qlen", STRINGIFY(DEFAULT_UNIX_MAX_DGRAM_QLEN));
1553 if (r < 0)
1554 return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1555 "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
1556
1557 return 1;
1558}
1559
1560static int fixup_environment(void) {
1561 int r;
1562
1563 /* Only fix up the environment when we are started as PID 1 */
1564 if (getpid_cached() != 1)
1565 return 0;
1566
1567 /* We expect the environment to be set correctly if run inside a container. */
1568 if (detect_container() > 0)
1569 return 0;
1570
1571 /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the
1572 * backend device used by the console. We try to make a better guess here since some consoles might
1573 * not have support for color mode for example.
1574 *
1575 * However if TERM was configured through the kernel command line then leave it alone. */
1576 _cleanup_free_ char *term = NULL;
1577 r = proc_cmdline_get_key("TERM", 0, &term);
1578 if (r < 0)
1579 return r;
1580 if (r > 0) {
1581 /* If we pick up $TERM, then also pick up $COLORTERM, $NO_COLOR */
1582 FOREACH_STRING(v, "COLORTERM", "NO_COLOR") {
1583 _cleanup_free_ char *vv = NULL;
1584 r = proc_cmdline_get_key(v, 0, &vv);
1585 if (r < 0)
1586 return r;
1587 if (r > 0 && setenv(v, vv, /* overwrite= */ true) < 0)
1588 return -errno;
1589 }
1590 } else {
1591 /* If no $TERM is set then look for the per-tty variable instead */
1592 r = proc_cmdline_get_key("systemd.tty.term.console", 0, &term);
1593 if (r < 0)
1594 return r;
1595 }
1596
1597 if (!term)
1598 (void) query_term_for_tty("/dev/console", &term);
1599
1600 if (setenv("TERM", term ?: FALLBACK_TERM, /* overwrite= */ true) < 0)
1601 return -errno;
1602
1603 /* The kernels sets HOME=/ for init. Let's undo this. */
1604 if (path_equal(getenv("HOME"), "/"))
1605 assert_se(unsetenv("HOME") == 0);
1606
1607 return 0;
1608}
1609
static void redirect_telinit(int argc, char *argv[]) {

        /* SysV compatibility: calling init as a user is identical to calling telinit. Hence, if we are
         * not PID 1 but were invoked as "init", execute the systemctl binary with our arguments. This
         * only returns if no redirection takes place; if the execution fails the process exits with
         * EXIT_FAILURE. */

#if HAVE_SYSV_COMPAT
        if (getpid_cached() == 1 || !invoked_as(argv, "init"))
                return;

        execv(SYSTEMCTL_BINARY_PATH, argv);

        /* execv() only returns on failure. */
        log_error_errno(errno, "Failed to execute %s: %m", SYSTEMCTL_BINARY_PATH);
        exit(EXIT_FAILURE);
#endif
}
1626
1627static int become_shutdown(int objective, int retval) {
1628 static const char* const table[_MANAGER_OBJECTIVE_MAX] = {
1629 [MANAGER_EXIT] = "exit",
1630 [MANAGER_REBOOT] = "reboot",
1631 [MANAGER_POWEROFF] = "poweroff",
1632 [MANAGER_HALT] = "halt",
1633 [MANAGER_KEXEC] = "kexec",
1634 };
1635
1636 char timeout[STRLEN("--timeout=") + DECIMAL_STR_MAX(usec_t) + STRLEN("us")],
1637 exit_code[STRLEN("--exit-code=") + DECIMAL_STR_MAX(uint8_t)];
1638
1639 _cleanup_strv_free_ char **env_block = NULL;
1640 _cleanup_free_ char *max_log_levels = NULL;
1641 usec_t watchdog_timer = 0;
1642 int r;
1643
1644 assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
1645 assert(table[objective]);
1646
1647 xsprintf(timeout, "--timeout=%" PRI_USEC "us", arg_defaults.timeout_stop_usec);
1648
1649 const char* command_line[11] = {
1650 SYSTEMD_SHUTDOWN_BINARY_PATH,
1651 table[objective],
1652 timeout,
1653 /* Note that the last position is a terminator and must contain NULL. */
1654 };
1655 size_t pos = 3;
1656
1657 assert(command_line[pos-1]);
1658 assert(!command_line[pos]);
1659
1660 (void) log_max_levels_to_string(log_get_max_level(), &max_log_levels);
1661
1662 if (max_log_levels) {
1663 command_line[pos++] = "--log-level";
1664 command_line[pos++] = max_log_levels;
1665 }
1666
1667 switch (log_get_target()) {
1668
1669 case LOG_TARGET_KMSG:
1670 case LOG_TARGET_JOURNAL_OR_KMSG:
1671 case LOG_TARGET_SYSLOG_OR_KMSG:
1672 command_line[pos++] = "--log-target=kmsg";
1673 break;
1674
1675 case LOG_TARGET_NULL:
1676 command_line[pos++] = "--log-target=null";
1677 break;
1678
1679 case LOG_TARGET_CONSOLE:
1680 default:
1681 command_line[pos++] = "--log-target=console";
1682 };
1683
1684 if (log_get_show_color())
1685 command_line[pos++] = "--log-color";
1686
1687 if (log_get_show_location())
1688 command_line[pos++] = "--log-location";
1689
1690 if (log_get_show_time())
1691 command_line[pos++] = "--log-time";
1692
1693 xsprintf(exit_code, "--exit-code=%d", retval);
1694 command_line[pos++] = exit_code;
1695
1696 assert(pos < ELEMENTSOF(command_line));
1697
1698 /* The watchdog: */
1699
1700 if (objective == MANAGER_REBOOT)
1701 watchdog_timer = arg_reboot_watchdog;
1702 else if (objective == MANAGER_KEXEC)
1703 watchdog_timer = arg_kexec_watchdog;
1704
1705 /* If we reboot or kexec let's set the shutdown watchdog and tell the
1706 * shutdown binary to repeatedly ping it.
1707 * Disable the pretimeout watchdog, as we do not support it from the shutdown binary. */
1708 (void) watchdog_setup_pretimeout(0);
1709 (void) watchdog_setup_pretimeout_governor(NULL);
1710 r = watchdog_setup(watchdog_timer);
1711 watchdog_close(/* disarm= */ r < 0);
1712
1713 /* The environment block: */
1714
1715 env_block = strv_copy(environ);
1716
1717 /* Tell the binary how often to ping, ignore failure */
1718 (void) strv_extendf(&env_block, "WATCHDOG_USEC="USEC_FMT, watchdog_timer);
1719
1720 /* Make sure that tools that look for $WATCHDOG_USEC (and might get started by the exitrd) don't get
1721 * confused by the variable, because the sd_watchdog_enabled() protocol uses the same variable for
1722 * the same purposes. */
1723 (void) strv_extendf(&env_block, "WATCHDOG_PID=" PID_FMT, getpid_cached());
1724
1725 if (arg_watchdog_device)
1726 (void) strv_extendf(&env_block, "WATCHDOG_DEVICE=%s", arg_watchdog_device);
1727
1728 (void) write_boot_or_shutdown_osc("shutdown");
1729
1730 execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
1731 return -errno;
1732}
1733
static void initialize_clock_timewarp(void) {
        int r, delta_minutes;

        /* This is called very early on, before we parse the kernel command line or otherwise figure out
         * why we are running, but only once. */

        if (clock_is_localtime(NULL) <= 0) {
                /* RTC is not in local time: do a dummy very first call to seal the kernel's time warp
                 * magic.
                 *
                 * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with
                 * LOCAL, but the real system could be set up that way. In such case, we need to delay
                 * the time-warp or the sealing until we reach the real system.
                 *
                 * Do not set the kernel's timezone. The concept of local time cannot be supported
                 * reliably, the time will jump or be incorrect at every daylight saving time change. All
                 * kernel local time concepts will be treated as UTC that way. */
                if (!in_initrd())
                        (void) clock_reset_timewarp();

                return;
        }

        /* The very first call of settimeofday() also does a time warp in the kernel.
         *
         * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to
         * take care of maintaining the RTC and do all adjustments. This matches the behavior of Windows,
         * which leaves the RTC alone if the registry tells that the RTC runs in UTC. */
        r = clock_set_timezone(&delta_minutes);
        if (r < 0) {
                log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
                return;
        }

        log_info("RTC configured in localtime, applying delta of %i minutes to system time.", delta_minutes);
}
1769
1770static void apply_clock_update(void) {
1771 /* This is called later than clock_apply_epoch(), i.e. after we have parsed
1772 * configuration files/kernel command line and such. */
1773
1774 if (arg_clock_usec == 0)
1775 return;
1776
1777 if (getpid_cached() != 1)
1778 return;
1779
1780 if (clock_settime(CLOCK_REALTIME, TIMESPEC_STORE(arg_clock_usec)) < 0)
1781 log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
1782 else
1783 log_info("Set system clock to %s, as specified on the kernel command line.",
1784 FORMAT_TIMESTAMP(arg_clock_usec));
1785}
1786
1787static void cmdline_take_random_seed(void) {
1788 size_t suggested;
1789 int r;
1790
1791 if (arg_random_seed_size == 0)
1792 return;
1793
1794 if (getpid_cached() != 1)
1795 return;
1796
1797 assert(arg_random_seed);
1798 suggested = random_pool_size();
1799
1800 if (arg_random_seed_size < suggested)
1801 log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
1802 arg_random_seed_size, suggested);
1803
1804 r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
1805 if (r < 0) {
1806 log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
1807 return;
1808 }
1809
1810 log_notice("Successfully credited entropy passed on kernel command line.\n"
1811 "Note that the seed provided this way is accessible to unprivileged programs. "
1812 "This functionality should not be used outside of testing environments.");
1813}
1814
1815static void initialize_coredump(bool skip_setup) {
1816 if (getpid_cached() != 1)
1817 return;
1818
1819 /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which honour
1820 * the limit) will process core dumps for system services by default. */
1821 if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
1822 log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");
1823
1824 /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are stored
1825 * until the systemd-coredump tool is enabled via sysctl. However it can be changed via the kernel
1826 * command line later so core dumps can still be generated during early startup and in initrd. */
1827 if (!skip_setup)
1828 disable_coredumps();
1829}
1830
1831static void initialize_core_pattern(bool skip_setup) {
1832 int r;
1833
1834 if (skip_setup || !arg_early_core_pattern)
1835 return;
1836
1837 if (getpid_cached() != 1)
1838 return;
1839
1840 r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
1841 if (r < 0)
1842 log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m",
1843 arg_early_core_pattern);
1844}
1845
1846static void apply_protect_system(bool skip_setup) {
1847 int r;
1848
1849 if (skip_setup || getpid_cached() != 1 || arg_protect_system == 0)
1850 return;
1851
1852 if (arg_protect_system < 0 && !in_initrd()) {
1853 log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping.");
1854 return;
1855 }
1856
1857 r = make_mount_point("/usr");
1858 if (r < 0) {
1859 log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m");
1860 return;
1861 }
1862
1863 if (mount_nofollow_verbose(
1864 LOG_WARNING,
1865 /* what= */ NULL,
1866 "/usr",
1867 /* fstype= */ NULL,
1868 MS_BIND|MS_REMOUNT|MS_RDONLY,
1869 /* options= */ NULL) < 0)
1870 return;
1871
1872 log_info("Successfully made /usr/ read-only.");
1873}
1874
1875static void update_cpu_affinity(bool skip_setup) {
1876 _cleanup_free_ char *mask = NULL;
1877
1878 if (skip_setup || !arg_cpu_affinity.set)
1879 return;
1880
1881 assert(arg_cpu_affinity.allocated > 0);
1882
1883 mask = cpu_set_to_range_string(&arg_cpu_affinity);
1884 log_debug("Setting CPU affinity to {%s}.", strnull(mask));
1885
1886 if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
1887 log_warning_errno(errno, "Failed to set CPU affinity, ignoring: %m");
1888}
1889
1890static void update_numa_policy(bool skip_setup) {
1891 int r;
1892 _cleanup_free_ char *nodes = NULL;
1893 const char * policy = NULL;
1894
1895 if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
1896 return;
1897
1898 if (DEBUG_LOGGING) {
1899 policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
1900 nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
1901 log_debug("Setting NUMA policy to %s, with nodes {%s}.", strnull(policy), strnull(nodes));
1902 }
1903
1904 r = apply_numa_policy(&arg_numa_policy);
1905 if (r == -EOPNOTSUPP)
1906 log_debug_errno(r, "NUMA support not available, ignoring.");
1907 else if (r < 0)
1908 log_warning_errno(r, "Failed to set NUMA memory policy, ignoring: %m");
1909}
1910
/* Appends a filtered copy of src[1..argc-1] to dst[], starting at index *dst_index, and advances
 * *dst_index past the last element written. src[0] is never copied.
 *
 * Dropped are: "--switched-root", "--system", "--user", as well as "--deserialize" and "--unit" in
 * both their "--opt=value" and "--opt value" spellings. Only the pointers are copied, not the
 * strings. */
static void filter_args(
                const char* dst[],
                size_t *dst_index,
                char **src,
                int argc) {

        assert(dst);
        assert(dst_index);

        for (int k = 1; k < argc; k++) {
                const char *a = src[k];

                if (STR_IN_SET(a,
                               "--switched-root",
                               "--system",
                               "--user"))
                        continue;

                /* Single-element form ("--opt=value"): drop just this element. Target unit
                 * designators are dropped because that information was already acted upon and
                 * the jobs are queued; this should not be redone after reexecution. */
                if (startswith(a, "--deserialize=") || startswith(a, "--unit="))
                        continue;

                /* Two-element form ("--opt value"): drop the option and its argument. */
                if (streq(a, "--deserialize") || streq(a, "--unit")) {
                        k++;
                        continue;
                }

                /* Anything else is passed on to the new instance unchanged. */
                dst[(*dst_index)++] = a;
        }
}
1948
1949static void finish_remaining_processes(ManagerObjective objective) {
1950 assert(objective >= 0 && objective < _MANAGER_OBJECTIVE_MAX);
1951
1952 /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
1953 * SIGCHLD for them after deserializing. */
1954 if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
1955 broadcast_signal(SIGTERM, /* wait_for_exit= */ false, /* send_sighup= */ true, arg_defaults.timeout_stop_usec);
1956
1957 /* On soft reboot really make sure nothing is left. Note that this will skip cgroups
1958 * of units that were configured with SurviveFinalKillSignal=yes. */
1959 if (objective == MANAGER_SOFT_REBOOT)
1960 broadcast_signal(SIGKILL, /* wait_for_exit= */ false, /* send_sighup= */ false, arg_defaults.timeout_stop_usec);
1961}
1962
1963static int do_reexecute(
1964 ManagerObjective objective,
1965 int argc,
1966 char* argv[],
1967 const struct rlimit *saved_rlimit_nofile,
1968 const struct rlimit *saved_rlimit_memlock,
1969 FDSet *fds,
1970 const char *switch_root_dir,
1971 const char *switch_root_init,
1972 uint64_t saved_capability_ambient_set,
1973 const char **ret_error_message) {
1974
1975 size_t i, args_size;
1976 const char **args;
1977 int r;
1978
1979 assert(IN_SET(objective, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT));
1980 assert(argc >= 0);
1981 assert(saved_rlimit_nofile);
1982 assert(saved_rlimit_memlock);
1983 assert(ret_error_message);
1984
1985 /* Close and disarm the watchdog, so that the new instance can reinitialize it, but the machine
1986 * doesn't get rebooted while we do that. */
1987 watchdog_close(/* disarm= */ true);
1988
1989 if (!switch_root_dir && objective == MANAGER_SOFT_REBOOT) {
1990 /* If no switch root dir is specified, then check if /run/nextroot/ qualifies and use that */
1991 r = path_is_os_tree("/run/nextroot");
1992 if (r < 0 && r != -ENOENT)
1993 log_debug_errno(r, "Failed to determine if /run/nextroot/ is a valid OS tree, ignoring: %m");
1994 else if (r > 0)
1995 switch_root_dir = "/run/nextroot";
1996 }
1997
1998 if (switch_root_dir) {
1999 /* If we're supposed to switch root, preemptively check the existence of a usable init.
2000 * Otherwise the system might end up in a completely undebuggable state afterwards. */
2001 if (switch_root_init) {
2002 r = chase_and_access(switch_root_init, switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
2003 if (r < 0)
2004 log_warning_errno(r, "Failed to chase configured init %s/%s: %m",
2005 switch_root_dir, switch_root_init);
2006 } else {
2007 r = chase_and_access(SYSTEMD_BINARY_PATH, switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
2008 if (r < 0)
2009 log_debug_errno(r, "Failed to chase our own binary %s/%s: %m",
2010 switch_root_dir, SYSTEMD_BINARY_PATH);
2011 }
2012
2013 if (r < 0) {
2014 r = chase_and_access("/sbin/init", switch_root_dir, CHASE_PREFIX_ROOT, X_OK, /* ret_path = */ NULL);
2015 if (r < 0) {
2016 *ret_error_message = "Switch root target contains no usable init";
2017 return log_error_errno(r, "Failed to chase %s/sbin/init", switch_root_dir);
2018 }
2019 }
2020 }
2021
2022 /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
2023 * the kernel default to its child processes */
2024 if (saved_rlimit_nofile->rlim_cur != 0)
2025 (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
2026 if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
2027 (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
2028
2029 finish_remaining_processes(objective);
2030
2031 if (switch_root_dir) {
2032 r = switch_root(/* new_root= */ switch_root_dir,
2033 /* old_root_after= */ NULL,
2034 /* flags= */ (objective == MANAGER_SWITCH_ROOT ? SWITCH_ROOT_DESTROY_OLD_ROOT : 0) |
2035 (objective == MANAGER_SOFT_REBOOT ? 0 : SWITCH_ROOT_RECURSIVE_RUN));
2036 if (r < 0)
2037 log_error_errno(r, "Failed to switch root, trying to continue: %m");
2038 }
2039
2040 r = capability_ambient_set_apply(saved_capability_ambient_set, /* also_inherit= */ false);
2041 if (r < 0)
2042 log_warning_errno(r, "Failed to apply the starting ambient set, ignoring: %m");
2043
2044 args_size = argc + 5;
2045 args = newa(const char*, args_size);
2046
2047 if (!switch_root_init) {
2048 char sfd[STRLEN("--deserialize=") + DECIMAL_STR_MAX(int)];
2049
2050 /* First try to spawn ourselves with the right path, and with full serialization. We do this
2051 * only if the user didn't specify an explicit init to spawn. */
2052
2053 assert(arg_serialization);
2054 assert(fds);
2055
2056 xsprintf(sfd, "--deserialize=%i", fileno(arg_serialization));
2057
2058 i = 1; /* Leave args[0] empty for now. */
2059
2060 /* Put our stuff first to make sure it always gets parsed in case
2061 * we get weird stuff from the kernel cmdline (like --) */
2062 if (IN_SET(objective, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
2063 args[i++] = "--switched-root";
2064 args[i++] = runtime_scope_cmdline_option_to_string(arg_runtime_scope);
2065 args[i++] = sfd;
2066
2067 filter_args(args, &i, argv, argc);
2068
2069 args[i++] = NULL;
2070
2071 assert(i <= args_size);
2072
2073 /*
2074 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do
2075 * this is on its own on exec(), but it will do it on exit(). Hence, to ensure we get a
2076 * summary here, fork() off a child, let it exit() cleanly, so that it prints the summary,
2077 * and wait() for it in the parent, before proceeding into the exec().
2078 */
2079 valgrind_summary_hack();
2080
2081 args[0] = SYSTEMD_BINARY_PATH;
2082 (void) execv(args[0], (char* const*) args);
2083
2084 if (objective == MANAGER_REEXECUTE) {
2085 *ret_error_message = "Failed to execute our own binary";
2086 return log_error_errno(errno, "Failed to execute our own binary %s: %m", args[0]);
2087 }
2088
2089 log_debug_errno(errno, "Failed to execute our own binary %s, trying fallback: %m", args[0]);
2090 }
2091
2092 /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and
2093 * envp[]. (Well, modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[],
2094 * but let's hope that doesn't matter.) */
2095
2096 arg_serialization = safe_fclose(arg_serialization);
2097 fds = fdset_free(fds);
2098
2099 /* Drop /run/systemd directory. Some of its content can be used as a flag indicating that systemd is
2100 * the init system but we might be replacing it with something different. If systemd is used again it
2101 * will recreate the directory and its content anyway. */
2102 r = rm_rf("/run/systemd.pre-switch-root", REMOVE_ROOT|REMOVE_MISSING_OK);
2103 if (r < 0)
2104 log_warning_errno(r, "Failed to prepare /run/systemd.pre-switch-root/, ignoring: %m");
2105
2106 r = RET_NERRNO(rename("/run/systemd", "/run/systemd.pre-switch-root"));
2107 if (r < 0)
2108 log_warning_errno(r, "Failed to move /run/systemd/ to /run/systemd.pre-switch-root/, ignoring: %m");
2109
2110 /* Reopen the console */
2111 (void) make_console_stdio();
2112
2113 i = 1; /* Leave args[0] empty for now. */
2114 for (int j = 1; j <= argc; j++)
2115 args[i++] = argv[j];
2116 assert(i <= args_size);
2117
2118 /* Re-enable any blocked signals, especially important if we switch from initrd to init=... */
2119 (void) reset_all_signal_handlers();
2120 (void) reset_signal_mask();
2121 (void) rlimit_nofile_safe();
2122
2123 if (switch_root_init) {
2124 args[0] = switch_root_init;
2125 (void) execve(args[0], (char* const*) args, saved_env);
2126 log_warning_errno(errno, "Failed to execute configured init %s, trying fallback: %m", args[0]);
2127 }
2128
2129 args[0] = "/sbin/init";
2130 (void) execv(args[0], (char* const*) args);
2131 r = -errno;
2132 *ret_error_message = "Failed to execute /sbin/init";
2133
2134 if (r == -ENOENT) {
2135 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
2136 ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
2137 "%s", *ret_error_message);
2138
2139 log_warning_errno(r, "No /sbin/init, trying fallback shell");
2140
2141 args[0] = "/bin/sh";
2142 args[1] = NULL;
2143 (void) execve(args[0], (char* const*) args, saved_env);
2144 r = -errno;
2145 *ret_error_message = "Failed to execute fallback shell";
2146 }
2147
2148 return log_error_errno(r, "%s, giving up: %m", *ret_error_message);
2149}
2150
2151static int invoke_main_loop(
2152 Manager *m,
2153 const struct rlimit *saved_rlimit_nofile,
2154 const struct rlimit *saved_rlimit_memlock,
2155 int *ret_retval, /* Return parameters relevant for shutting down */
2156 FDSet **ret_fds, /* Return parameters for reexecuting */
2157 char **ret_switch_root_dir, /* … */
2158 char **ret_switch_root_init, /* … */
2159 const char **ret_error_message) {
2160
2161 int r;
2162
2163 assert(m);
2164 assert(saved_rlimit_nofile);
2165 assert(saved_rlimit_memlock);
2166 assert(ret_retval);
2167 assert(ret_fds);
2168 assert(ret_switch_root_dir);
2169 assert(ret_switch_root_init);
2170 assert(ret_error_message);
2171
2172 for (;;) {
2173 int objective = manager_loop(m);
2174 if (objective < 0) {
2175 *ret_error_message = "Failed to run main loop";
2176 return log_struct_errno(LOG_EMERG, objective,
2177 LOG_MESSAGE("Failed to run main loop: %m"),
2178 LOG_MESSAGE_ID(SD_MESSAGE_CORE_MAINLOOP_FAILED_STR));
2179 }
2180
2181 /* Ensure shutdown timestamp is taken even when bypassing the job engine */
2182 if (IN_SET(objective,
2183 MANAGER_SOFT_REBOOT,
2184 MANAGER_REBOOT,
2185 MANAGER_KEXEC,
2186 MANAGER_HALT,
2187 MANAGER_POWEROFF) &&
2188 !dual_timestamp_is_set(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START))
2189 dual_timestamp_now(m->timestamps + MANAGER_TIMESTAMP_SHUTDOWN_START);
2190
2191 switch (objective) {
2192
2193 case MANAGER_RELOAD: {
2194 LogTarget saved_log_target;
2195 int saved_log_level;
2196
2197 manager_send_reloading(m);
2198
2199 log_info("Reloading...");
2200
2201 /* First, save any overridden log level/target, then parse the configuration file,
2202 * which might change the log level to new settings. */
2203
2204 saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
2205 saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
2206
2207 (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
2208
2209 set_manager_defaults(m);
2210 set_manager_settings(m);
2211
2212 update_cpu_affinity(false);
2213 update_numa_policy(false);
2214
2215 if (saved_log_level >= 0)
2216 manager_override_log_level(m, saved_log_level);
2217 if (saved_log_target >= 0)
2218 manager_override_log_target(m, saved_log_target);
2219
2220 if (manager_reload(m) < 0)
2221 /* Reloading failed before the point of no return.
2222 * Let's continue running as if nothing happened. */
2223 m->objective = MANAGER_OK;
2224 else
2225 log_info("Reloading finished in " USEC_FMT " ms.",
2226 usec_sub_unsigned(now(CLOCK_MONOTONIC), m->timestamps[MANAGER_TIMESTAMP_UNITS_LOAD].monotonic) / USEC_PER_MSEC);
2227
2228 continue;
2229 }
2230
2231 case MANAGER_REEXECUTE:
2232
2233 manager_send_reloading(m); /* From the perspective of the manager calling us this is
2234 * pretty much the same as a reload */
2235
2236 r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
2237 if (r < 0) {
2238 *ret_error_message = "Failed to prepare for reexecution";
2239 return r;
2240 }
2241
2242 log_notice("Reexecuting.");
2243
2244 *ret_retval = EXIT_FAILURE;
2245 *ret_switch_root_dir = *ret_switch_root_init = NULL;
2246
2247 return objective;
2248
2249 case MANAGER_SWITCH_ROOT:
2250
2251 manager_send_reloading(m); /* From the perspective of the manager calling us this is
2252 * pretty much the same as a reload */
2253
2254 manager_set_switching_root(m, true);
2255
2256 if (!m->switch_root_init) {
2257 r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
2258 if (r < 0) {
2259 *ret_error_message = "Failed to prepare for reexecution";
2260 return r;
2261 }
2262 } else
2263 *ret_fds = NULL;
2264
2265 log_notice("Switching root.");
2266
2267 *ret_retval = EXIT_FAILURE;
2268
2269 /* Steal the switch root parameters */
2270 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
2271 *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
2272
2273 return objective;
2274
2275 case MANAGER_SOFT_REBOOT:
2276 manager_send_reloading(m);
2277 manager_set_switching_root(m, true);
2278
2279 r = prepare_reexecute(m, &arg_serialization, ret_fds, /* switching_root= */ true);
2280 if (r < 0) {
2281 *ret_error_message = "Failed to prepare for reexecution";
2282 return r;
2283 }
2284
2285 log_notice("Soft-rebooting.");
2286
2287 *ret_retval = EXIT_FAILURE;
2288 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
2289 *ret_switch_root_init = NULL;
2290
2291 return objective;
2292
2293 case MANAGER_EXIT:
2294 if (MANAGER_IS_USER(m)) {
2295 log_debug("Exit.");
2296
2297 *ret_retval = m->return_value;
2298 *ret_fds = NULL;
2299 *ret_switch_root_dir = *ret_switch_root_init = NULL;
2300
2301 return objective;
2302 }
2303
2304 _fallthrough_;
2305 case MANAGER_REBOOT:
2306 case MANAGER_POWEROFF:
2307 case MANAGER_HALT:
2308 case MANAGER_KEXEC: {
2309 log_notice("Shutting down.");
2310
2311 *ret_retval = m->return_value;
2312 *ret_fds = NULL;
2313 *ret_switch_root_dir = *ret_switch_root_init = NULL;
2314
2315 return objective;
2316 }
2317
2318 default:
2319 assert_not_reached();
2320 }
2321 }
2322}
2323
2324static void log_execution_mode(bool *ret_first_boot) {
2325 bool first_boot = false;
2326 int r;
2327
2328 assert(ret_first_boot);
2329
2330 switch (arg_runtime_scope) {
2331
2332 case RUNTIME_SCOPE_SYSTEM: {
2333 struct utsname uts;
2334 int v;
2335
2336 log_info("systemd " GIT_VERSION " running in %ssystem mode (%s)",
2337 arg_action == ACTION_TEST ? "test " : "",
2338 systemd_features);
2339
2340 v = detect_virtualization();
2341 if (v > 0)
2342 log_info("Detected virtualization %s.", virtualization_to_string(v));
2343
2344 v = detect_confidential_virtualization();
2345 if (v > 0)
2346 log_info("Detected confidential virtualization %s.", confidential_virtualization_to_string(v));
2347
2348 log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
2349
2350 if (in_initrd())
2351 log_info("Running in initrd.");
2352 else {
2353 _cleanup_free_ char *id_text = NULL;
2354
2355 /* Let's check whether we are in first boot. First, check if an override was
2356 * specified on the kernel command line. If yes, we honour that. */
2357
2358 r = proc_cmdline_get_bool("systemd.condition_first_boot", /* flags = */ 0, &first_boot);
2359 if (r < 0)
2360 log_debug_errno(r, "Failed to parse systemd.condition_first_boot= kernel command line argument, ignoring: %m");
2361
2362 if (r > 0)
2363 log_full(first_boot ? LOG_INFO : LOG_DEBUG,
2364 "Kernel command line argument says we are %s first boot.",
2365 first_boot ? "in" : "not in");
2366 else {
2367 /* Second, perform autodetection. We use /etc/machine-id as flag file for
2368 * this: If it is missing or contains the value "uninitialized", this is the
2369 * first boot. In other cases, it is not. This allows container managers and
2370 * installers to provision a couple of files in /etc but still permit the
2371 * first-boot initialization to occur. If the container manager wants to
2372 * provision the machine ID it should pass $container_uuid to PID 1. */
2373
2374 r = read_one_line_file("/etc/machine-id", &id_text);
2375 if (r < 0 || streq(id_text, "uninitialized")) {
2376 if (r < 0 && r != -ENOENT)
2377 log_warning_errno(r, "Unexpected error while reading /etc/machine-id, assuming first boot: %m");
2378
2379 first_boot = true;
2380 log_info("Detected first boot.");
2381 } else
2382 log_debug("Detected initialized system, this is not the first boot.");
2383 }
2384 }
2385
2386 assert_se(uname(&uts) >= 0);
2387
2388 if (strverscmp_improved(uts.release, KERNEL_BASELINE_VERSION) < 0)
2389 log_warning("Warning! Reported kernel version %s is older than systemd's required baseline kernel version %s. "
2390 "Your mileage may vary.", uts.release, KERNEL_BASELINE_VERSION);
2391 else
2392 log_debug("Kernel version %s, our baseline is %s", uts.release, KERNEL_BASELINE_VERSION);
2393
2394 break;
2395 }
2396
2397 case RUNTIME_SCOPE_USER:
2398 if (DEBUG_LOGGING) {
2399 _cleanup_free_ char *t = NULL;
2400
2401 t = uid_to_name(getuid());
2402 log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
2403 arg_action == ACTION_TEST ? " test" : "",
2404 getuid(), strna(t), systemd_features);
2405 }
2406
2407 break;
2408
2409 default:
2410 assert_not_reached();
2411 }
2412
2413 *ret_first_boot = first_boot;
2414}
2415
2416static int initialize_runtime(
2417 bool skip_setup,
2418 bool first_boot,
2419 struct rlimit *saved_rlimit_nofile,
2420 struct rlimit *saved_rlimit_memlock,
2421 uint64_t *saved_ambient_set,
2422 const char **ret_error_message) {
2423
2424 int r;
2425
2426 assert(saved_ambient_set);
2427 assert(ret_error_message);
2428
2429 /* Sets up various runtime parameters. Many of these initializations are conditionalized:
2430 *
2431 * - Some only apply to --system instances
2432 * - Some only apply to --user instances
2433 * - Some only apply when we first start up, but not when we reexecute
2434 */
2435
2436 if (arg_action != ACTION_RUN)
2437 return 0;
2438
2439 update_cpu_affinity(skip_setup);
2440 update_numa_policy(skip_setup);
2441
2442 switch (arg_runtime_scope) {
2443
2444 case RUNTIME_SCOPE_SYSTEM:
2445 /* Make sure we leave a core dump without panicking the kernel. */
2446 install_crash_handler();
2447
2448 if (!skip_setup) {
2449 /* Check that /usr/ is either on the same file system as / or mounted already. */
2450 if (dir_is_empty("/usr", /* ignore_hidden_or_backup = */ true) > 0) {
2451 *ret_error_message = "Refusing to run in unsupported environment where /usr/ is not populated";
2452 return -ENOEXEC;
2453 }
2454
2455 /* Pull credentials from various sources into a common credential directory (we do
2456 * this here, before setting up the machine ID, so that we can use credential info
2457 * for setting up the machine ID) */
2458 (void) import_credentials();
2459
2460 (void) os_release_status();
2461 (void) machine_id_setup(/* root = */ NULL, arg_machine_id,
2462 (first_boot ? MACHINE_ID_SETUP_FORCE_TRANSIENT : 0) |
2463 (arg_machine_id_from_firmware ? MACHINE_ID_SETUP_FORCE_FIRMWARE : 0),
2464 /* ret = */ NULL);
2465 (void) hostname_setup(/* really = */ true);
2466 (void) loopback_setup();
2467
2468 bump_unix_max_dgram_qlen();
2469 bump_file_max_and_nr_open();
2470
2471 write_container_id();
2472
2473 (void) write_boot_or_shutdown_osc("boot");
2474
2475 /* Copy os-release to the propagate directory, so that we update it for services running
2476 * under RootDirectory=/RootImage= when we do a soft reboot. */
2477 r = setup_os_release(RUNTIME_SCOPE_SYSTEM);
2478 if (r < 0)
2479 log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
2480 }
2481
2482 r = watchdog_set_device(arg_watchdog_device);
2483 if (r < 0)
2484 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
2485
2486 if (!cap_test_all(arg_capability_bounding_set)) {
2487 r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
2488 if (r < 0) {
2489 *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
2490 return log_struct_errno(LOG_EMERG, r,
2491 LOG_MESSAGE("Failed to drop capability bounding set of usermode helpers: %m"),
2492 LOG_MESSAGE_ID(SD_MESSAGE_CORE_CAPABILITY_BOUNDING_USER_STR));
2493 }
2494
2495 r = capability_bounding_set_drop(arg_capability_bounding_set, true);
2496 if (r < 0) {
2497 *ret_error_message = "Failed to drop capability bounding set";
2498 return log_struct_errno(LOG_EMERG, r,
2499 LOG_MESSAGE("Failed to drop capability bounding set: %m"),
2500 LOG_MESSAGE_ID(SD_MESSAGE_CORE_CAPABILITY_BOUNDING_STR));
2501 }
2502 }
2503
2504 if (arg_no_new_privs) {
2505 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2506 *ret_error_message = "Failed to disable new privileges";
2507 return log_struct_errno(LOG_EMERG, errno,
2508 LOG_MESSAGE("Failed to disable new privileges: %m"),
2509 LOG_MESSAGE_ID(SD_MESSAGE_CORE_DISABLE_PRIVILEGES_STR));
2510 }
2511 }
2512
2513 break;
2514
2515 case RUNTIME_SCOPE_USER: {
2516 _cleanup_free_ char *p = NULL;
2517
2518 /* Create the runtime directory and place the inaccessible device nodes there, if we run in
2519 * user mode. In system mode mount_setup() already did that. */
2520
2521 r = xdg_user_runtime_dir("/systemd", &p);
2522 if (r < 0) {
2523 *ret_error_message = "$XDG_RUNTIME_DIR is not set";
2524 return log_struct_errno(LOG_EMERG, r,
2525 LOG_MESSAGE("Failed to determine $XDG_RUNTIME_DIR path: %m"),
2526 LOG_MESSAGE_ID(SD_MESSAGE_CORE_NO_XDGDIR_PATH_STR));
2527 }
2528
2529 if (!skip_setup) {
2530 (void) mkdir_p_label(p, 0755);
2531 (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
2532
2533 r = setup_os_release(RUNTIME_SCOPE_USER);
2534 if (r < 0)
2535 log_warning_errno(r, "Failed to copy os-release for propagation, ignoring: %m");
2536 }
2537
2538 break;
2539 }
2540
2541 default:
2542 assert_not_reached();
2543 }
2544
2545 /* The two operations on the ambient set are meant for a user serssion manager. They do not affect
2546 * system manager operation, because by default it starts with an empty ambient set.
2547 *
2548 * Preserve the ambient set for later use with sd-executor processes. */
2549 r = capability_get_ambient(saved_ambient_set);
2550 if (r < 0)
2551 log_warning_errno(r, "Failed to save ambient capabilities, ignoring: %m");
2552
2553 /* Clear ambient capabilities, so services do not inherit them implicitly. Dropping them does
2554 * not affect the permitted and effective sets which are important for the manager itself to
2555 * operate. */
2556 r = capability_ambient_set_apply(0, /* also_inherit= */ false);
2557 if (r < 0)
2558 log_warning_errno(r, "Failed to reset ambient capability set, ignoring: %m");
2559
2560 if (arg_timer_slack_nsec != NSEC_INFINITY)
2561 if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
2562 log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
2563
2564 if (arg_syscall_archs) {
2565 r = enforce_syscall_archs(arg_syscall_archs);
2566 if (r < 0) {
2567 *ret_error_message = "Failed to set syscall architectures";
2568 return r;
2569 }
2570 }
2571
2572 r = make_reaper_process(true);
2573 if (r < 0)
2574 log_warning_errno(r, "Failed to make us a subreaper, ignoring: %m");
2575
2576 /* Bump up RLIMIT_NOFILE for systemd itself */
2577 (void) bump_rlimit_nofile(saved_rlimit_nofile);
2578 (void) bump_rlimit_memlock(saved_rlimit_memlock);
2579
2580 return 0;
2581}
2582
2583static int do_queue_default_job(
2584 Manager *m,
2585 const char **ret_error_message) {
2586
2587 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2588 const char *unit;
2589 Job *job;
2590 Unit *target;
2591 int r;
2592
2593 if (arg_default_unit)
2594 unit = arg_default_unit;
2595 else if (in_initrd())
2596 unit = SPECIAL_INITRD_TARGET;
2597 else
2598 unit = SPECIAL_DEFAULT_TARGET;
2599
2600 log_debug("Activating default unit: %s", unit);
2601
2602 r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
2603 if (r < 0 && in_initrd() && !arg_default_unit) {
2604 /* Fall back to default.target, which we used to always use by default. Only do this if no
2605 * explicit configuration was given. */
2606
2607 log_info("Falling back to %s.", SPECIAL_DEFAULT_TARGET);
2608
2609 r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
2610 }
2611 if (r < 0) {
2612 log_info("Falling back to %s.", SPECIAL_RESCUE_TARGET);
2613
2614 r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
2615 if (r < 0) {
2616 *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
2617 : "Failed to load " SPECIAL_RESCUE_TARGET;
2618 return r;
2619 }
2620 }
2621
2622 assert(target->load_state == UNIT_LOADED);
2623
2624 r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, &error, &job);
2625 if (r == -EPERM) {
2626 log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
2627
2628 sd_bus_error_free(&error);
2629
2630 r = manager_add_job(m, JOB_START, target, JOB_REPLACE, &error, &job);
2631 if (r < 0) {
2632 *ret_error_message = "Failed to start default target";
2633 return log_struct_errno(LOG_EMERG, r,
2634 LOG_MESSAGE("Failed to start default target: %s", bus_error_message(&error, r)),
2635 LOG_MESSAGE_ID(SD_MESSAGE_CORE_START_TARGET_FAILED_STR));
2636 }
2637
2638 } else if (r < 0) {
2639 *ret_error_message = "Failed to isolate default target";
2640 return log_struct_errno(LOG_EMERG, r,
2641 LOG_MESSAGE("Failed to isolate default target: %s", bus_error_message(&error, r)),
2642 LOG_MESSAGE_ID(SD_MESSAGE_CORE_ISOLATE_TARGET_FAILED_STR));
2643 } else
2644 log_info("Queued %s job for default target %s.",
2645 job_type_to_string(job->type),
2646 unit_status_string(job->unit, NULL));
2647
2648 m->default_unit_job_id = job->id;
2649
2650 return 0;
2651}
2652
/* Reads the current RLIMIT_NOFILE and RLIMIT_MEMLOCK of the process into the two output
 * structures. If a getrlimit() call fails a warning is logged and the failure is otherwise
 * ignored; nothing is written to the corresponding structure by this function in that case. */
static void save_rlimits(struct rlimit *saved_rlimit_nofile,
                         struct rlimit *saved_rlimit_memlock) {

        int k;

        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);

        k = getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");

        k = getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
}
2665
2666static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
2667 struct rlimit *rl;
2668
2669 if (arg_defaults.rlimit[RLIMIT_NOFILE])
2670 return;
2671
2672 /* Make sure forked processes get limits based on the original kernel setting */
2673
2674 rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
2675 if (!rl) {
2676 log_oom();
2677 return;
2678 }
2679
2680 /* Bump the hard limit for system services to a substantially higher value. The default
2681 * hard limit current kernels set is pretty low (4K), mostly for historical
2682 * reasons. According to kernel developers, the fd handling in recent kernels has been
2683 * optimized substantially enough, so that we can bump the limit now, without paying too
2684 * high a price in memory or performance. Note however that we only bump the hard limit,
2685 * not the soft limit. That's because select() works the way it works, and chokes on fds
2686 * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
2687 * unexpecting programs that they get fds higher than what they can process using
2688 * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
2689 * this pitfall: programs that are written by folks aware of the select() problem in mind
2690 * (and thus use poll()/epoll instead of select(), the way everybody should) can
2691 * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
2692 * we pass. */
2693 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2694 int nr;
2695
2696 /* Get the underlying absolute limit the kernel enforces */
2697 nr = read_nr_open();
2698
2699 rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
2700 }
2701
2702 /* If for some reason we were invoked with a soft limit above 1024 (which should never
2703 * happen!, but who knows what we get passed in from pam_limit when invoked as --user
2704 * instance), then lower what we pass on to not confuse our children */
2705 rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
2706
2707 arg_defaults.rlimit[RLIMIT_NOFILE] = rl;
2708}
2709
2710static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
2711 struct rlimit *rl;
2712
2713 /* Pass the original value down to invoked processes */
2714
2715 if (arg_defaults.rlimit[RLIMIT_MEMLOCK])
2716 return;
2717
2718 rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
2719 if (!rl) {
2720 log_oom();
2721 return;
2722 }
2723
2724 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2725 /* Raise the default limit to 8M also on old kernels and in containers (8M is the kernel
2726 * default for this since kernel 5.16) */
2727 rl->rlim_max = MAX(rl->rlim_max, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
2728 rl->rlim_cur = MAX(rl->rlim_cur, (rlim_t) DEFAULT_RLIMIT_MEMLOCK);
2729 }
2730
2731 arg_defaults.rlimit[RLIMIT_MEMLOCK] = rl;
2732}
2733
2734static void setenv_manager_environment(void) {
2735 int r;
2736
2737 STRV_FOREACH(p, arg_manager_environment) {
2738 log_debug("Setting '%s' in our own environment.", *p);
2739
2740 r = putenv_dup(*p, true);
2741 if (r < 0)
2742 log_warning_errno(r, "Failed to setenv \"%s\", ignoring: %m", *p);
2743 }
2744}
2745
2746static void reset_arguments(void) {
2747 /* Frees/resets arg_* variables, with a few exceptions commented below. */
2748
2749 arg_default_unit = mfree(arg_default_unit);
2750
2751 /* arg_runtime_scope — ignore */
2752
2753 arg_dump_core = true;
2754 arg_crash_chvt = -1;
2755 arg_crash_shell = false;
2756 arg_crash_action = CRASH_FREEZE;
2757 arg_confirm_spawn = mfree(arg_confirm_spawn);
2758 arg_show_status = _SHOW_STATUS_INVALID;
2759 arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
2760 arg_switched_root = false;
2761 arg_pager_flags = 0;
2762 arg_service_watchdogs = true;
2763
2764 unit_defaults_done(&arg_defaults);
2765 unit_defaults_init(&arg_defaults, arg_runtime_scope);
2766
2767 arg_runtime_watchdog = 0;
2768 arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
2769 arg_kexec_watchdog = 0;
2770 arg_pretimeout_watchdog = 0;
2771 arg_early_core_pattern = mfree(arg_early_core_pattern);
2772 arg_watchdog_device = mfree(arg_watchdog_device);
2773 arg_watchdog_pretimeout_governor = mfree(arg_watchdog_pretimeout_governor);
2774
2775 arg_default_environment = strv_free(arg_default_environment);
2776 arg_manager_environment = strv_free(arg_manager_environment);
2777
2778 arg_capability_bounding_set = CAP_MASK_UNSET;
2779 arg_no_new_privs = false;
2780 arg_protect_system = -1;
2781 arg_timer_slack_nsec = NSEC_INFINITY;
2782
2783 arg_syscall_archs = set_free(arg_syscall_archs);
2784
2785 /* arg_serialization — ignore */
2786
2787 arg_machine_id = (sd_id128_t) {};
2788 arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
2789
2790 cpu_set_done(&arg_cpu_affinity);
2791 numa_policy_reset(&arg_numa_policy);
2792
2793 arg_random_seed = mfree(arg_random_seed);
2794 arg_random_seed_size = 0;
2795 arg_clock_usec = 0;
2796
2797 arg_reload_limit_interval_sec = 0;
2798 arg_reload_limit_burst = 0;
2799}
2800
2801static void determine_default_oom_score_adjust(void) {
2802 int r, a, b;
2803
2804 /* Run our services at slightly higher OOM score than ourselves. But let's be conservative here, and
2805 * do this only if we don't run as root (i.e. only if we are run in user mode, for an unprivileged
2806 * user). */
2807
2808 if (arg_defaults.oom_score_adjust_set)
2809 return;
2810
2811 if (getuid() == 0)
2812 return;
2813
2814 r = get_oom_score_adjust(&a);
2815 if (r < 0)
2816 return (void) log_warning_errno(r, "Failed to determine current OOM score adjustment value, ignoring: %m");
2817
2818 assert_cc(100 <= OOM_SCORE_ADJ_MAX);
2819 b = a >= OOM_SCORE_ADJ_MAX - 100 ? OOM_SCORE_ADJ_MAX : a + 100;
2820
2821 if (a == b)
2822 return;
2823
2824 arg_defaults.oom_score_adjust = b;
2825 arg_defaults.oom_score_adjust_set = true;
2826}
2827
2828static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
2829 const struct rlimit *saved_rlimit_memlock) {
2830 int r;
2831
2832 assert(saved_rlimit_nofile);
2833 assert(saved_rlimit_memlock);
2834
2835 /* Assign configuration defaults */
2836 reset_arguments();
2837
2838 r = parse_config_file();
2839 if (r < 0)
2840 log_warning_errno(r, "Failed to parse config file, ignoring: %m");
2841
2842 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM) {
2843 r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
2844 if (r < 0)
2845 log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2846 }
2847
2848 /* Initialize some default rlimits for services if they haven't been configured */
2849 fallback_rlimit_nofile(saved_rlimit_nofile);
2850 fallback_rlimit_memlock(saved_rlimit_memlock);
2851
2852 /* Note that this also parses bits from the kernel command line, including "debug". */
2853 log_parse_environment();
2854
2855 /* Initialize the show status setting if it hasn't been set explicitly yet */
2856 if (arg_show_status == _SHOW_STATUS_INVALID)
2857 arg_show_status = SHOW_STATUS_YES;
2858
2859 /* Slightly raise the OOM score for our services if we are running for unprivileged users. */
2860 determine_default_oom_score_adjust();
2861
2862 /* Push variables into the manager environment block */
2863 setenv_manager_environment();
2864
2865 /* Parse log environment variables again to take into account any new environment variables. */
2866 log_parse_environment();
2867
2868 return 0;
2869}
2870
2871static int safety_checks(void) {
2872
2873 if (getpid_cached() == 1 &&
2874 arg_action != ACTION_RUN)
2875 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2876 "Unsupported execution mode while PID 1.");
2877
2878 if (getpid_cached() == 1 &&
2879 arg_runtime_scope == RUNTIME_SCOPE_USER)
2880 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2881 "Can't run --user mode as PID 1.");
2882
2883 if (arg_action == ACTION_RUN &&
2884 arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
2885 getpid_cached() != 1)
2886 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2887 "Can't run system mode unless PID 1.");
2888
2889 if (arg_action == ACTION_TEST &&
2890 geteuid() == 0)
2891 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2892 "Don't run test mode as root.");
2893
2894 switch (arg_runtime_scope) {
2895
2896 case RUNTIME_SCOPE_USER:
2897
2898 if (arg_action == ACTION_RUN &&
2899 sd_booted() <= 0)
2900 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2901 "Trying to run as user instance, but the system has not been booted with systemd.");
2902
2903 if (arg_action == ACTION_RUN &&
2904 !getenv("XDG_RUNTIME_DIR"))
2905 return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
2906 "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
2907
2908 break;
2909
2910 case RUNTIME_SCOPE_SYSTEM:
2911 if (arg_action == ACTION_RUN &&
2912 running_in_chroot() > 0)
2913 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2914 "Cannot be run in a chroot() environment.");
2915 break;
2916
2917 default:
2918 assert_not_reached();
2919 }
2920
2921 return 0;
2922}
2923
2924static int initialize_security(
2925 bool *loaded_policy,
2926 dual_timestamp *security_start_timestamp,
2927 dual_timestamp *security_finish_timestamp,
2928 const char **ret_error_message) {
2929
2930 int r;
2931
2932 assert(loaded_policy);
2933 assert(security_start_timestamp);
2934 assert(security_finish_timestamp);
2935 assert(ret_error_message);
2936
2937 dual_timestamp_now(security_start_timestamp);
2938
2939 r = mac_selinux_setup(loaded_policy);
2940 if (r < 0) {
2941 *ret_error_message = "Failed to load SELinux policy";
2942 return r;
2943 }
2944
2945 r = mac_smack_setup(loaded_policy);
2946 if (r < 0) {
2947 *ret_error_message = "Failed to load SMACK policy";
2948 return r;
2949 }
2950
2951 r = mac_apparmor_setup();
2952 if (r < 0) {
2953 *ret_error_message = "Failed to load AppArmor policy";
2954 return r;
2955 }
2956
2957 r = ima_setup();
2958 if (r < 0) {
2959 *ret_error_message = "Failed to load IMA policy";
2960 return r;
2961 }
2962
2963 r = ipe_setup();
2964 if (r < 0) {
2965 *ret_error_message = "Failed to load IPE policy";
2966 return r;
2967 }
2968
2969 dual_timestamp_now(security_finish_timestamp);
2970 return 0;
2971}
2972
2973static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
2974 int r;
2975
2976 assert(ret_fds);
2977 assert(ret_error_message);
2978
2979 /* Pick up all fds passed to us. We apply a filter here: we only take the fds that have O_CLOEXEC
2980 * off. All fds passed via execve() to us must have O_CLOEXEC off, and our own code and dependencies
2981 * should be clean enough to set O_CLOEXEC universally. Thus checking the bit should be a safe
2982 * mechanism to distinguish passed in fds from our own.
2983 *
2984 * Why bother? Some subsystems we initialize early, specifically selinux might keep fds open in our
2985 * process behind our back. We should not take possession of that (and then accidentally close
2986 * it). SELinux thankfully sets O_CLOEXEC on its fds, so this test should work. */
2987 r = fdset_new_fill(/* filter_cloexec= */ 0, ret_fds);
2988 if (r < 0) {
2989 *ret_error_message = "Failed to allocate fd set";
2990 return log_struct_errno(LOG_EMERG, r,
2991 LOG_MESSAGE("Failed to allocate fd set: %m"),
2992 LOG_MESSAGE_ID(SD_MESSAGE_CORE_FD_SET_FAILED_STR));
2993 }
2994
2995 /* The serialization fd should have O_CLOEXEC turned on already, let's verify that we didn't pick it up here */
2996 assert_se(!arg_serialization || !fdset_contains(*ret_fds, fileno(arg_serialization)));
2997
2998 return 0;
2999}
3000
3001static void setup_console_terminal(bool skip_setup) {
3002
3003 if (arg_runtime_scope != RUNTIME_SCOPE_SYSTEM)
3004 return;
3005
3006 /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a
3007 * controlling tty. */
3008 terminal_detach_session();
3009
3010 /* Reset the console, but only if this is really init and we are freshly booted */
3011 if (!skip_setup)
3012 (void) console_setup();
3013}
3014
/* Quick look at the command line, long before the real option parsing, to tell a re-execution
 * apart from a regular start-up.
 *
 * Returns true (setup may be skipped) if a "--deserialize" or "--deserialize=…" argument is present
 * and "--switched-root" is not. Returns false otherwise: after switching root the full setup is
 * wanted even though deserialization takes place as well. */
static bool early_skip_setup_check(int argc, char *argv[]) {
        bool reexecuting = false;

        for (int n = 1; n < argc; n++) {
                const char *arg = argv[n];

                /* A root switch always gets the full setup, whatever else is on the command line. */
                if (streq(arg, "--switched-root"))
                        return false;

                if (!reexecuting)
                        reexecuting = streq(arg, "--deserialize") || startswith(arg, "--deserialize=");
        }

        /* Deserializing means we are re-executing, so the extensive setup can be avoided. */
        return reexecuting;
}
3030
3031static int save_env(void) {
3032 char **l;
3033
3034 l = strv_copy(environ);
3035 if (!l)
3036 return log_oom();
3037
3038 strv_free_and_replace(saved_env, l);
3039 return 0;
3040}
3041
/* Entry point of the manager binary. In order it: takes early timestamps, performs the PID 1 or
 * user-instance specific early setup, parses configuration and command line, handles the one-shot
 * actions (help, version, dumps, introspection), creates and starts the Manager, runs the main loop,
 * and finally cleans up on the common "finish" path, where it may re-execute or hand over to the
 * shutdown binary.
 *
 * Returns retval, which starts out as EXIT_FAILURE and is updated by the individual actions and by
 * invoke_main_loop(). */
3042int main(int argc, char *argv[]) {
3043 dual_timestamp
3044 initrd_timestamp = DUAL_TIMESTAMP_NULL,
3045 userspace_timestamp = DUAL_TIMESTAMP_NULL,
3046 kernel_timestamp = DUAL_TIMESTAMP_NULL,
3047 security_start_timestamp = DUAL_TIMESTAMP_NULL,
3048 security_finish_timestamp = DUAL_TIMESTAMP_NULL;
3049 struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
3050 saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
3051 * in. Note we use different values
3052 * for the two that indicate whether
3053 * these fields are initialized! */
3054 bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false;
3055 char *switch_root_dir = NULL, *switch_root_init = NULL;
3056 usec_t before_startup, after_startup;
3057 static char systemd[] = "systemd";
3058 const char *error_message = NULL;
3059 uint64_t saved_ambient_set = 0;
3060 int r, retval = EXIT_FAILURE;
3061 Manager *m = NULL;
3062 FDSet *fds = NULL;
3063
3064 assert_se(argc > 0 && !isempty(argv[0]));
3065
3066 /* SysV compatibility: redirect init → telinit */
3067 redirect_telinit(argc, argv);
3068
3069 /* Take timestamps early on */
3070 dual_timestamp_from_monotonic(&kernel_timestamp, 0);
3071 dual_timestamp_now(&userspace_timestamp);
3072
3073 /* Figure out whether we need to initialize the system, or if we already did that because we are
3074 * reexecuting. */
3075 skip_setup = early_skip_setup_check(argc, argv);
3076
3077 /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent
3078 * reexecution we are then called 'systemd'. That is confusing, hence let's call us systemd
3079 * right-away. */
3080 program_invocation_short_name = systemd;
3081 (void) prctl(PR_SET_NAME, systemd);
3082
3083 /* Save the original command line */
3084 save_argc_argv(argc, argv);
3085
3086 /* Save the original environment as we might need to restore it if we're requested to execute another
3087 * system manager later. */
3088 r = save_env();
3089 if (r < 0) {
3090 error_message = "Failed to copy environment block";
3091 goto finish;
3092 }
3093
3094 /* Make sure that if the user says "syslog" we actually log to the journal. */
3095 log_set_upgrade_syslog_to_journal(true);
3096
3097 if (getpid_cached() == 1) {
3098 /* When we run as PID 1 force system mode */
3099 arg_runtime_scope = RUNTIME_SCOPE_SYSTEM;
3100
3101 /* Disable the umask logic */
3102 umask(0);
3103
3104 /* Make sure that at least initially we do not ever log to journald/syslogd, because it might
3105 * not be activated yet (even though the log socket for it exists). */
3106 log_set_prohibit_ipc(true);
3107
3108 /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This
3109 * is important so that we never end up logging to any foreign stderr, for example if we have
3110 * to log in a child process right before execve()'ing the actual binary, at a point in time
3111 * where socket activation stderr/stdout are already set up. */
3112 log_set_always_reopen_console(true);
3113
3114 if (detect_container() <= 0) {
3115
3116 /* Running outside of a container as PID 1 */
3117 log_set_target_and_open(LOG_TARGET_KMSG);
3118
3119 if (in_initrd())
3120 initrd_timestamp = userspace_timestamp;
3121
3122 if (!skip_setup) {
3123 r = mount_setup_early();
3124 if (r < 0) {
3125 error_message = "Failed to mount early API filesystems";
3126 goto finish;
3127 }
3128 }
3129
3130 /* We might have just mounted /proc, so let's try to parse the kernel
3131 * command line log arguments immediately. */
3132 log_parse_environment();
3133
3134 /* Let's open the log backend a second time, in case the first time didn't
3135 * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
3136 * available, and it previously wasn't. */
3137 log_open();
3138
3139 if (!skip_setup) {
3140 disable_printk_ratelimit();
3141
3142 r = initialize_security(
3143 &loaded_policy,
3144 &security_start_timestamp,
3145 &security_finish_timestamp,
3146 &error_message);
3147 if (r < 0)
3148 goto finish;
3149 }
3150
3151 r = mac_init();
3152 if (r < 0) {
3153 error_message = "Failed to initialize MAC support";
3154 goto finish;
3155 }
3156
3157 if (!skip_setup)
3158 initialize_clock_timewarp();
3159
3160 clock_apply_epoch(/* allow_backwards= */ !skip_setup);
3161
3162 /* Set the default for later on, but don't actually open the logs like this for
3163 * now. Note that if we are transitioning from the initrd there might still be
3164 * journal fd open, and we shouldn't attempt opening that before we parsed
3165 * /proc/cmdline which might redirect output elsewhere. */
3166 log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
3167
3168 } else {
3169 /* Running inside a container, as PID 1 */
3170 log_set_target_and_open(LOG_TARGET_CONSOLE);
3171
3172 /* For later on, see above... */
3173 log_set_target(LOG_TARGET_JOURNAL);
3174
3175 /* clear the kernel timestamp, because we are in a container */
3176 kernel_timestamp = DUAL_TIMESTAMP_NULL;
3177 }
3178
3179 initialize_coredump(skip_setup);
3180
3181 r = fixup_environment();
3182 if (r < 0) {
3183 log_struct_errno(LOG_EMERG, r,
3184 LOG_MESSAGE("Failed to fix up PID 1 environment: %m"),
3185 LOG_MESSAGE_ID(SD_MESSAGE_CORE_PID1_ENVIRONMENT_STR));
3186 error_message = "Failed to fix up PID1 environment";
3187 goto finish;
3188 }
3189
3190 /* Try to figure out if we can use colors with the console. No need to do that for user
3191 * instances since they never log into the console. */
3192 log_show_color(colors_enabled());
3193
3194 r = make_null_stdio();
3195 if (r < 0)
3196 log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
3197
3198 /* Load the kernel modules early. */
3199 if (!skip_setup)
3200 (void) kmod_setup();
3201
3202 /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
3203 r = mount_setup(loaded_policy, skip_setup);
3204 if (r < 0) {
3205 error_message = "Failed to mount API filesystems";
3206 goto finish;
3207 }
3208
3209 /* The efivarfs is now mounted, let's lock down the system token. */
3210 lock_down_efi_variables();
3211 } else {
3212 /* Running as user instance */
3213 arg_runtime_scope = RUNTIME_SCOPE_USER;
3214 log_set_always_reopen_console(true);
3215 log_set_target_and_open(LOG_TARGET_AUTO);
3216
3217 /* clear the kernel timestamp, because we are not PID 1 */
3218 kernel_timestamp = DUAL_TIMESTAMP_NULL;
3219
3220 r = mac_init();
3221 if (r < 0) {
3222 error_message = "Failed to initialize MAC support";
3223 goto finish;
3224 }
3225 }
3226
3227 /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
3228 * transitioning from the initrd to the main systemd or suchlike. */
3229 save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
3230
3231 /* Reset all signal handlers. */
3232 (void) reset_all_signal_handlers();
3233 (void) ignore_signals(SIGNALS_IGNORE);
3234
3235 (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
3236
3237 r = parse_argv(argc, argv);
3238 if (r < 0) {
3239 error_message = "Failed to parse command line arguments";
3240 goto finish;
3241 }
3242
3243 r = safety_checks();
3244 if (r < 0)
3245 goto finish;
3246
3247 if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
3248 pager_open(arg_pager_flags);
3249
/* For every action other than ACTION_RUN, behave from here on as if setup was to be skipped. */
3250 if (arg_action != ACTION_RUN)
3251 skip_setup = true;
3252
3253 if (arg_action == ACTION_HELP) {
3254 retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
3255 goto finish;
3256 } else if (arg_action == ACTION_VERSION) {
3257 retval = version();
3258 goto finish;
3259 } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
3260 unit_dump_config_items(stdout);
3261 retval = EXIT_SUCCESS;
3262 goto finish;
3263 } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
3264 dump_bus_properties(stdout);
3265 retval = EXIT_SUCCESS;
3266 goto finish;
3267 } else if (arg_action == ACTION_BUS_INTROSPECT) {
3268 r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
3269 retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
3270 goto finish;
3271 }
3272
3273 assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
3274
3275 /* Move out of the way, so that we won't block unmounts */
3276 assert_se(chdir("/") == 0);
3277
3278 if (arg_action == ACTION_RUN) {
3279 if (!skip_setup) {
3280 /* Apply the systemd.clock_usec= kernel command line switch */
3281 apply_clock_update();
3282
3283 /* Apply random seed from kernel command line */
3284 cmdline_take_random_seed();
3285 }
3286
3287 /* A core pattern might have been specified via the cmdline. */
3288 initialize_core_pattern(skip_setup);
3289
3290 /* Make /usr/ read-only */
3291 apply_protect_system(skip_setup);
3292
3293 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
3294 log_close();
3295
3296 /* Remember open file descriptors for later deserialization */
3297 r = collect_fds(&fds, &error_message);
3298 if (r < 0)
3299 goto finish;
3300
3301 /* Give up any control of the console, but make sure it's initialized. */
3302 setup_console_terminal(skip_setup);
3303
3304 /* Open the logging devices, if possible and necessary */
3305 log_open();
3306 }
3307
3308 log_execution_mode(&first_boot);
3309
3310 r = cg_has_legacy();
3311 if (r < 0) {
3312 error_message = "Failed to check cgroup hierarchy";
3313 goto finish;
3314 }
3315 if (r > 0) {
3316 r = log_full_errno(LOG_EMERG, SYNTHETIC_ERRNO(EPROTO),
3317 "Detected cgroup v1 hierarchy at /sys/fs/cgroup/, which is no longer supported by current version of systemd.\n"
3318 "Please instruct your initrd to mount cgroup v2 (unified) hierarchy,\n"
3319 "possibly by removing any stale kernel command line options, such as:\n"
3320 " systemd.legacy_systemd_cgroup_controller=1\n"
3321 " systemd.unified_cgroup_hierarchy=0");
3322
3323 error_message = "Detected unsupported legacy cgroup hierarchy, refusing execution";
3324 goto finish;
3325 }
3326
3327 r = initialize_runtime(skip_setup,
3328 first_boot,
3329 &saved_rlimit_nofile,
3330 &saved_rlimit_memlock,
3331 &saved_ambient_set,
3332 &error_message);
3333 if (r < 0)
3334 goto finish;
3335
3336 r = manager_new(arg_runtime_scope,
3337 arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
3338 &m);
3339 if (r < 0) {
3340 log_struct_errno(LOG_EMERG, r,
3341 LOG_MESSAGE("Failed to allocate manager object: %m"),
3342 LOG_MESSAGE_ID(SD_MESSAGE_CORE_MANAGER_ALLOCATE_STR));
3343 error_message = "Failed to allocate manager object";
3344 goto finish;
3345 }
3346
/* Hand the timestamps collected earlier in this function over to the manager object. */
3347 m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
3348 m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
3349 m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
3350 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
3351 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
3352
3353 m->saved_ambient_set = saved_ambient_set;
3354
3355 set_manager_defaults(m);
3356 set_manager_settings(m);
3357 manager_set_first_boot(m, first_boot);
3358 manager_set_switching_root(m, arg_switched_root);
3359
3360 /* Remember whether we should queue the default job */
3361 queue_default_job = !arg_serialization || arg_switched_root;
3362
3363 before_startup = now(CLOCK_MONOTONIC);
3364
3365 r = manager_startup(m, arg_serialization, fds, /* root= */ NULL);
3366 if (r < 0) {
3367 error_message = "Failed to start up manager";
3368 goto finish;
3369 }
3370
3371 /* This will close all file descriptors that were opened, but not claimed by any unit. */
3372 fds = fdset_free(fds);
3373 arg_serialization = safe_fclose(arg_serialization);
3374
3375 if (queue_default_job) {
3376 r = do_queue_default_job(m, &error_message);
3377 if (r < 0)
3378 goto finish;
3379 }
3380
3381 after_startup = now(CLOCK_MONOTONIC);
3382
3383 log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
3384 "Loaded units and determined initial transaction in %s.",
3385 FORMAT_TIMESPAN(after_startup - before_startup, 100 * USEC_PER_MSEC));
3386
3387 if (arg_action == ACTION_TEST) {
3388 manager_test_summary(m);
3389 retval = EXIT_SUCCESS;
3390 goto finish;
3391 }
3392
3393 r = invoke_main_loop(m,
3394 &saved_rlimit_nofile,
3395 &saved_rlimit_memlock,
3396 &retval,
3397 &fds,
3398 &switch_root_dir,
3399 &switch_root_init,
3400 &error_message);
3401 /* MANAGER_OK and MANAGER_RELOAD are not expected here. */
3402 assert(r < 0 || IN_SET(r, MANAGER_REEXECUTE, MANAGER_EXIT) ||
3403 (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
3404 IN_SET(r, MANAGER_REBOOT,
3405 MANAGER_SOFT_REBOOT,
3406 MANAGER_POWEROFF,
3407 MANAGER_HALT,
3408 MANAGER_KEXEC,
3409 MANAGER_SWITCH_ROOT)));
3410
/* Common cleanup path: reached on errors, after the one-shot actions above, and after the main loop
 * returned. r holds the last result and decides between reexecution, shutdown and plain exit. */
3411finish:
3412 pager_close();
3413
3414 if (m) {
3415 arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
3416 arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
3417 m = manager_free(m);
3418 }
3419
3420 mac_selinux_finish();
3421
3422 if (IN_SET(r, MANAGER_REEXECUTE, MANAGER_SWITCH_ROOT, MANAGER_SOFT_REBOOT))
3423 r = do_reexecute(r,
3424 argc, argv,
3425 &saved_rlimit_nofile,
3426 &saved_rlimit_memlock,
3427 fds,
3428 switch_root_dir,
3429 switch_root_init,
3430 saved_ambient_set,
3431 &error_message); /* This only returns if reexecution failed */
3432
3433 arg_serialization = safe_fclose(arg_serialization);
3434 fds = fdset_free(fds);
3435
3436 saved_env = strv_free(saved_env);
3437
3438#if HAVE_VALGRIND_VALGRIND_H
3439 /* If we are PID 1 and running under valgrind, then let's exit
3440 * here explicitly. valgrind will only generate nice output on
3441 * exit(), not on exec(), hence let's do the former not the
3442 * latter here. */
3443 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
3444 /* Cleanup watchdog_device strings for valgrind. We need them
3445 * in become_shutdown() so normally we cannot free them yet. */
3446 watchdog_free_device();
3447 reset_arguments();
3448 return retval;
3449 }
3450#endif
3451
3452#if HAS_FEATURE_ADDRESS_SANITIZER
3453 /* At this stage we most likely don't have stdio/stderr open, so the following
3454 * LSan check would not print any actionable information and would just crash
3455 * PID 1. To make this a bit more helpful, let's try to open /dev/console,
3456 * and if we succeed redirect LSan's report there. */
3457 if (getpid_cached() == 1) {
3458 _cleanup_close_ int tty_fd = -EBADF;
3459
3460 tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
3461 if (tty_fd >= 0)
3462 __sanitizer_set_report_fd((void*) (intptr_t) tty_fd);
3463
3464 __lsan_do_leak_check();
3465 }
3466#endif
3467
3468 if (r < 0)
3469 (void) sd_notifyf(/* unset_environment= */ false,
3470 "ERRNO=%i", -r);
3471
3472 /* Try to invoke the shutdown binary unless we already failed.
3473 * If we failed above, we want to freeze after finishing cleanup. */
3474 if (arg_runtime_scope == RUNTIME_SCOPE_SYSTEM &&
3475 IN_SET(r, MANAGER_EXIT, MANAGER_REBOOT, MANAGER_POWEROFF, MANAGER_HALT, MANAGER_KEXEC)) {
3476 r = become_shutdown(r, retval);
3477 log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
3478 error_message = "Failed to execute shutdown binary";
3479 }
3480
3481 /* This is primarily useful when running systemd in a VM, as it provides the user running the VM with
3482 * a mechanism to pick up systemd's exit status in the VM. */
3483 (void) sd_notifyf(/* unset_environment= */ false,
3484 "EXIT_STATUS=%i", retval);
3485
3486 watchdog_free_device();
3487 arg_watchdog_device = mfree(arg_watchdog_device);
3488
/* As PID 1, show the error message (if there is one) as an emergency status line and then call
 * freeze_or_exit_or_reboot(). */
3489 if (getpid_cached() == 1) {
3490 if (error_message)
3491 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
3492 ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
3493 "%s.", error_message);
3494 freeze_or_exit_or_reboot();
3495 }
3496
3497 reset_arguments();
3498 return retval;
3499}