]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/main.c
Merge pull request #18601 from keszybz/env-assign-cleanup
[thirdparty/systemd.git] / src / core / main.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <getopt.h>
6 #include <sys/mount.h>
7 #include <sys/prctl.h>
8 #include <sys/reboot.h>
9 #include <unistd.h>
10 #if HAVE_SECCOMP
11 #include <seccomp.h>
12 #endif
13 #if HAVE_VALGRIND_VALGRIND_H
14 #include <valgrind/valgrind.h>
15 #endif
16
17 #include "sd-bus.h"
18 #include "sd-daemon.h"
19 #include "sd-messages.h"
20
21 #include "alloc-util.h"
22 #include "apparmor-setup.h"
23 #include "architecture.h"
24 #include "build.h"
25 #include "bus-error.h"
26 #include "bus-util.h"
27 #include "capability-util.h"
28 #include "cgroup-util.h"
29 #include "clock-util.h"
30 #include "conf-parser.h"
31 #include "cpu-set-util.h"
32 #include "dbus-manager.h"
33 #include "dbus.h"
34 #include "def.h"
35 #include "dev-setup.h"
36 #include "efi-random.h"
37 #include "efivars.h"
38 #include "emergency-action.h"
39 #include "env-util.h"
40 #include "exit-status.h"
41 #include "fd-util.h"
42 #include "fdset.h"
43 #include "fileio.h"
44 #include "format-util.h"
45 #include "fs-util.h"
46 #include "hexdecoct.h"
47 #include "hostname-setup.h"
48 #include "ima-setup.h"
49 #include "killall.h"
50 #include "kmod-setup.h"
51 #include "limits-util.h"
52 #include "load-fragment.h"
53 #include "log.h"
54 #include "loopback-setup.h"
55 #include "machine-id-setup.h"
56 #include "manager.h"
57 #include "mkdir.h"
58 #include "mount-setup.h"
59 #include "os-util.h"
60 #include "pager.h"
61 #include "parse-argument.h"
62 #include "parse-util.h"
63 #include "path-util.h"
64 #include "pretty-print.h"
65 #include "proc-cmdline.h"
66 #include "process-util.h"
67 #include "random-util.h"
68 #include "raw-clone.h"
69 #include "rlimit-util.h"
70 #if HAVE_SECCOMP
71 #include "seccomp-util.h"
72 #endif
73 #include "selinux-setup.h"
74 #include "selinux-util.h"
75 #include "signal-util.h"
76 #include "smack-setup.h"
77 #include "special.h"
78 #include "stat-util.h"
79 #include "stdio-util.h"
80 #include "strv.h"
81 #include "switch-root.h"
82 #include "sysctl-util.h"
83 #include "terminal-util.h"
84 #include "umask-util.h"
85 #include "user-util.h"
86 #include "util.h"
87 #include "virt.h"
88 #include "watchdog.h"
89
90 #if HAS_FEATURE_ADDRESS_SANITIZER
91 #include <sanitizer/lsan_interface.h>
92 #endif
93
94 #define DEFAULT_TASKS_MAX ((TasksMax) { 15U, 100U }) /* 15% */
95
96 static enum {
97 ACTION_RUN,
98 ACTION_HELP,
99 ACTION_VERSION,
100 ACTION_TEST,
101 ACTION_DUMP_CONFIGURATION_ITEMS,
102 ACTION_DUMP_BUS_PROPERTIES,
103 ACTION_BUS_INTROSPECT,
104 } arg_action = ACTION_RUN;
105
106 static const char *arg_bus_introspect = NULL;
107
108 /* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
109 * defaults are assigned in reset_arguments() below. */
110 static char *arg_default_unit;
111 static bool arg_system;
112 static bool arg_dump_core;
113 static int arg_crash_chvt;
114 static bool arg_crash_shell;
115 static bool arg_crash_reboot;
116 static char *arg_confirm_spawn;
117 static ShowStatus arg_show_status;
118 static StatusUnitFormat arg_status_unit_format;
119 static bool arg_switched_root;
120 static PagerFlags arg_pager_flags;
121 static bool arg_service_watchdogs;
122 static ExecOutput arg_default_std_output;
123 static ExecOutput arg_default_std_error;
124 static usec_t arg_default_restart_usec;
125 static usec_t arg_default_timeout_start_usec;
126 static usec_t arg_default_timeout_stop_usec;
127 static usec_t arg_default_timeout_abort_usec;
128 static bool arg_default_timeout_abort_set;
129 static usec_t arg_default_start_limit_interval;
130 static unsigned arg_default_start_limit_burst;
131 static usec_t arg_runtime_watchdog;
132 static usec_t arg_reboot_watchdog;
133 static usec_t arg_kexec_watchdog;
134 static char *arg_early_core_pattern;
135 static char *arg_watchdog_device;
136 static char **arg_default_environment;
137 static struct rlimit *arg_default_rlimit[_RLIMIT_MAX];
138 static uint64_t arg_capability_bounding_set;
139 static bool arg_no_new_privs;
140 static nsec_t arg_timer_slack_nsec;
141 static usec_t arg_default_timer_accuracy_usec;
142 static Set* arg_syscall_archs;
143 static FILE* arg_serialization;
144 static int arg_default_cpu_accounting;
145 static bool arg_default_io_accounting;
146 static bool arg_default_ip_accounting;
147 static bool arg_default_blockio_accounting;
148 static bool arg_default_memory_accounting;
149 static bool arg_default_tasks_accounting;
150 static TasksMax arg_default_tasks_max;
151 static sd_id128_t arg_machine_id;
152 static EmergencyAction arg_cad_burst_action;
153 static OOMPolicy arg_default_oom_policy;
154 static CPUSet arg_cpu_affinity;
155 static NUMAPolicy arg_numa_policy;
156 static usec_t arg_clock_usec;
157 static void *arg_random_seed;
158 static size_t arg_random_seed_size;
159
160 /* A copy of the original environment block */
161 static char **saved_env = NULL;
162
163 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
164 const struct rlimit *saved_rlimit_memlock);
165
166 _noreturn_ static void freeze_or_exit_or_reboot(void) {
167
168 /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
169 * the container manager, and thus inform it that something went wrong. */
170 if (detect_container() > 0) {
171 log_emergency("Exiting PID 1...");
172 _exit(EXIT_EXCEPTION);
173 }
174
175 if (arg_crash_reboot) {
176 log_notice("Rebooting in 10s...");
177 (void) sleep(10);
178
179 log_notice("Rebooting now...");
180 (void) reboot(RB_AUTOBOOT);
181 log_emergency_errno(errno, "Failed to reboot: %m");
182 }
183
184 log_emergency("Freezing execution.");
185 freeze();
186 }
187
188 _noreturn_ static void crash(int sig) {
189 struct sigaction sa;
190 pid_t pid;
191
192 if (getpid_cached() != 1)
193 /* Pass this on immediately, if this is not PID 1 */
194 (void) raise(sig);
195 else if (!arg_dump_core)
196 log_emergency("Caught <%s>, not dumping core.", signal_to_string(sig));
197 else {
198 sa = (struct sigaction) {
199 .sa_handler = nop_signal_handler,
200 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
201 };
202
203 /* We want to wait for the core process, hence let's enable SIGCHLD */
204 (void) sigaction(SIGCHLD, &sa, NULL);
205
206 pid = raw_clone(SIGCHLD);
207 if (pid < 0)
208 log_emergency_errno(errno, "Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig));
209 else if (pid == 0) {
210 /* Enable default signal handler for core dump */
211
212 sa = (struct sigaction) {
213 .sa_handler = SIG_DFL,
214 };
215 (void) sigaction(sig, &sa, NULL);
216
217 /* Don't limit the coredump size */
218 (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));
219
220 /* Just to be sure... */
221 (void) chdir("/");
222
223 /* Raise the signal again */
224 pid = raw_getpid();
225 (void) kill(pid, sig); /* raise() would kill the parent */
226
227 assert_not_reached("We shouldn't be here...");
228 _exit(EXIT_EXCEPTION);
229 } else {
230 siginfo_t status;
231 int r;
232
233 /* Order things nicely. */
234 r = wait_for_terminate(pid, &status);
235 if (r < 0)
236 log_emergency_errno(r, "Caught <%s>, waitpid() failed: %m", signal_to_string(sig));
237 else if (status.si_code != CLD_DUMPED) {
238 const char *s = status.si_code == CLD_EXITED
239 ? exit_status_to_string(status.si_status, EXIT_STATUS_LIBC)
240 : signal_to_string(status.si_status);
241
242 log_emergency("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).",
243 signal_to_string(sig),
244 pid,
245 sigchld_code_to_string(status.si_code),
246 status.si_status, strna(s));
247 } else
248 log_emergency("Caught <%s>, dumped core as pid "PID_FMT".",
249 signal_to_string(sig), pid);
250 }
251 }
252
253 if (arg_crash_chvt >= 0)
254 (void) chvt(arg_crash_chvt);
255
256 sa = (struct sigaction) {
257 .sa_handler = SIG_IGN,
258 .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART,
259 };
260
261 /* Let the kernel reap children for us */
262 (void) sigaction(SIGCHLD, &sa, NULL);
263
264 if (arg_crash_shell) {
265 log_notice("Executing crash shell in 10s...");
266 (void) sleep(10);
267
268 pid = raw_clone(SIGCHLD);
269 if (pid < 0)
270 log_emergency_errno(errno, "Failed to fork off crash shell: %m");
271 else if (pid == 0) {
272 (void) setsid();
273 (void) make_console_stdio();
274 (void) rlimit_nofile_safe();
275 (void) execle("/bin/sh", "/bin/sh", NULL, environ);
276
277 log_emergency_errno(errno, "execle() failed: %m");
278 _exit(EXIT_EXCEPTION);
279 } else {
280 log_info("Spawned crash shell as PID "PID_FMT".", pid);
281 (void) wait_for_terminate(pid, NULL);
282 }
283 }
284
285 freeze_or_exit_or_reboot();
286 }
287
288 static void install_crash_handler(void) {
289 static const struct sigaction sa = {
290 .sa_handler = crash,
291 .sa_flags = SA_NODEFER, /* So that we can raise the signal again from the signal handler */
292 };
293 int r;
294
295 /* We ignore the return value here, since, we don't mind if we
296 * cannot set up a crash handler */
297 r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER, -1);
298 if (r < 0)
299 log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m");
300 }
301
302 static int console_setup(void) {
303 _cleanup_close_ int tty_fd = -1;
304 int r;
305
306 tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
307 if (tty_fd < 0)
308 return log_error_errno(tty_fd, "Failed to open /dev/console: %m");
309
310 /* We don't want to force text mode. plymouth may be showing
311 * pictures already from initrd. */
312 r = reset_terminal_fd(tty_fd, false);
313 if (r < 0)
314 return log_error_errno(r, "Failed to reset /dev/console: %m");
315
316 return 0;
317 }
318
319 static int set_machine_id(const char *m) {
320 sd_id128_t t;
321 assert(m);
322
323 if (sd_id128_from_string(m, &t) < 0)
324 return -EINVAL;
325
326 if (sd_id128_is_null(t))
327 return -EINVAL;
328
329 arg_machine_id = t;
330 return 0;
331 }
332
333 static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
334 int r;
335
336 assert(key);
337
338 if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
339
340 if (proc_cmdline_value_missing(key, value))
341 return 0;
342
343 if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
344 log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
345 else if (in_initrd() == !!startswith(key, "rd."))
346 return free_and_strdup_warn(&arg_default_unit, value);
347
348 } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
349
350 r = value ? parse_boolean(value) : true;
351 if (r < 0)
352 log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
353 else
354 arg_dump_core = r;
355
356 } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
357
358 if (proc_cmdline_value_missing(key, value))
359 return 0;
360
361 if (path_is_absolute(value))
362 (void) parse_path_argument(value, false, &arg_early_core_pattern);
363 else
364 log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
365
366 } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
367
368 if (!value)
369 arg_crash_chvt = 0; /* turn on */
370 else {
371 r = parse_crash_chvt(value, &arg_crash_chvt);
372 if (r < 0)
373 log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
374 }
375
376 } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
377
378 r = value ? parse_boolean(value) : true;
379 if (r < 0)
380 log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
381 else
382 arg_crash_shell = r;
383
384 } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
385
386 r = value ? parse_boolean(value) : true;
387 if (r < 0)
388 log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
389 else
390 arg_crash_reboot = r;
391
392 } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
393 char *s;
394
395 r = parse_confirm_spawn(value, &s);
396 if (r < 0)
397 log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
398 else
399 free_and_replace(arg_confirm_spawn, s);
400
401 } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
402
403 r = value ? parse_boolean(value) : true;
404 if (r < 0)
405 log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
406 else
407 arg_service_watchdogs = r;
408
409 } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
410
411 if (value) {
412 r = parse_show_status(value, &arg_show_status);
413 if (r < 0)
414 log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
415 } else
416 arg_show_status = SHOW_STATUS_YES;
417
418 } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
419
420 if (proc_cmdline_value_missing(key, value))
421 return 0;
422
423 r = status_unit_format_from_string(value);
424 if (r < 0)
425 log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
426 else
427 arg_status_unit_format = r;
428
429 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
430
431 if (proc_cmdline_value_missing(key, value))
432 return 0;
433
434 r = exec_output_from_string(value);
435 if (r < 0)
436 log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
437 else
438 arg_default_std_output = r;
439
440 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
441
442 if (proc_cmdline_value_missing(key, value))
443 return 0;
444
445 r = exec_output_from_string(value);
446 if (r < 0)
447 log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
448 else
449 arg_default_std_error = r;
450
451 } else if (streq(key, "systemd.setenv")) {
452
453 if (proc_cmdline_value_missing(key, value))
454 return 0;
455
456 if (!env_assignment_is_valid(value))
457 log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
458 else {
459 r = strv_env_replace_strdup(&arg_default_environment, value);
460 if (r < 0)
461 return log_oom();
462 }
463
464 } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
465
466 if (proc_cmdline_value_missing(key, value))
467 return 0;
468
469 r = set_machine_id(value);
470 if (r < 0)
471 log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
472
473 } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
474
475 if (proc_cmdline_value_missing(key, value))
476 return 0;
477
478 r = parse_sec(value, &arg_default_timeout_start_usec);
479 if (r < 0)
480 log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
481
482 if (arg_default_timeout_start_usec <= 0)
483 arg_default_timeout_start_usec = USEC_INFINITY;
484
485 } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
486
487 if (proc_cmdline_value_missing(key, value))
488 return 0;
489
490 r = parse_cpu_set(value, &arg_cpu_affinity);
491 if (r < 0)
492 log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
493
494 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
495
496 if (proc_cmdline_value_missing(key, value))
497 return 0;
498
499 (void) parse_path_argument(value, false, &arg_watchdog_device);
500
501 } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
502
503 if (proc_cmdline_value_missing(key, value))
504 return 0;
505
506 r = safe_atou64(value, &arg_clock_usec);
507 if (r < 0)
508 log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
509
510 } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
511 void *p;
512 size_t sz;
513
514 if (proc_cmdline_value_missing(key, value))
515 return 0;
516
517 r = unbase64mem(value, (size_t) -1, &p, &sz);
518 if (r < 0)
519 log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
520
521 free(arg_random_seed);
522 arg_random_seed = sz > 0 ? p : mfree(p);
523 arg_random_seed_size = sz;
524
525 } else if (streq(key, "quiet") && !value) {
526
527 if (arg_show_status == _SHOW_STATUS_INVALID)
528 arg_show_status = SHOW_STATUS_ERROR;
529
530 } else if (streq(key, "debug") && !value) {
531
532 /* Note that log_parse_environment() handles 'debug'
533 * too, and sets the log level to LOG_DEBUG. */
534
535 if (detect_container() > 0)
536 log_set_target(LOG_TARGET_CONSOLE);
537
538 } else if (!value) {
539 const char *target;
540
541 /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
542 target = runlevel_to_target(key);
543 if (target)
544 return free_and_strdup_warn(&arg_default_unit, target);
545 }
546
547 return 0;
548 }
549
/* Defines a configuration file parser callback called 'name', which simply passes the right-hand side
 * of the assignment to 'func'. If 'func' returns a negative value, a syntax error mentioning 'descr' and
 * the offending value is logged. The generated callback returns 0 either way, i.e. a bad value never
 * makes the parser fail. The 'data' and 'userdata' parameters are not used. */
#define DEFINE_SETTER(name, func, descr)                              \
        static int name(const char *unit,                             \
                        const char *filename,                         \
                        unsigned line,                                \
                        const char *section,                          \
                        unsigned section_line,                        \
                        const char *lvalue,                           \
                        int ltype,                                    \
                        const char *rvalue,                           \
                        void *data,                                   \
                        void *userdata) {                             \
                                                                      \
                int r;                                                \
                                                                      \
                assert(filename);                                     \
                assert(lvalue);                                       \
                assert(rvalue);                                       \
                                                                      \
                r = func(rvalue);                                     \
                if (r < 0)                                            \
                        log_syntax(unit, LOG_ERR, filename, line, r,  \
                                   "Invalid " descr " '%s': %m",      \
                                   rvalue);                           \
                                                                      \
                return 0;                                             \
        }

/* Callbacks for the logging related settings */
DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
582
583 static int config_parse_default_timeout_abort(
584 const char *unit,
585 const char *filename,
586 unsigned line,
587 const char *section,
588 unsigned section_line,
589 const char *lvalue,
590 int ltype,
591 const char *rvalue,
592 void *data,
593 void *userdata) {
594 int r;
595
596 r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue,
597 &arg_default_timeout_abort_usec, userdata);
598 if (r >= 0)
599 arg_default_timeout_abort_set = r;
600 return 0;
601 }
602
603 static int parse_config_file(void) {
604 const ConfigTableItem items[] = {
605 { "Manager", "LogLevel", config_parse_level2, 0, NULL },
606 { "Manager", "LogTarget", config_parse_target, 0, NULL },
607 { "Manager", "LogColor", config_parse_color, 0, NULL },
608 { "Manager", "LogLocation", config_parse_location, 0, NULL },
609 { "Manager", "LogTime", config_parse_time, 0, NULL },
610 { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
611 { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
612 { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
613 { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
614 { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
615 { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
616 { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
617 { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
618 { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
619 { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy },
620 { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
621 { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog },
622 { "Manager", "RebootWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog },
623 { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
624 { "Manager", "KExecWatchdogSec", config_parse_sec, 0, &arg_kexec_watchdog },
625 { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
626 { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
627 { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
628 #if HAVE_SECCOMP
629 { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
630 #endif
631 { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
632 { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_default_timer_accuracy_usec },
633 { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_default_std_output },
634 { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_default_std_error },
635 { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec },
636 { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec },
637 { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
638 { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec },
639 { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */
640 { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_default_start_limit_interval },
641 { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_default_start_limit_burst },
642 { "Manager", "DefaultEnvironment", config_parse_environ, 0, &arg_default_environment },
643 { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_default_rlimit },
644 { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_default_rlimit },
645 { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_default_rlimit },
646 { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_default_rlimit },
647 { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_default_rlimit },
648 { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_default_rlimit },
649 { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_default_rlimit },
650 { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_default_rlimit },
651 { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_default_rlimit },
652 { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_default_rlimit },
653 { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_default_rlimit },
654 { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_default_rlimit },
655 { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_default_rlimit },
656 { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit },
657 { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit },
658 { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit },
659 { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting },
660 { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting },
661 { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting },
662 { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting },
663 { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting },
664 { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
665 { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
666 { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action },
667 { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
668 {}
669 };
670
671 const char *fn, *conf_dirs_nulstr;
672
673 fn = arg_system ?
674 PKGSYSCONFDIR "/system.conf" :
675 PKGSYSCONFDIR "/user.conf";
676
677 conf_dirs_nulstr = arg_system ?
678 CONF_PATHS_NULSTR("systemd/system.conf.d") :
679 CONF_PATHS_NULSTR("systemd/user.conf.d");
680
681 (void) config_parse_many_nulstr(
682 fn, conf_dirs_nulstr,
683 "Manager\0",
684 config_item_table_lookup, items,
685 CONFIG_PARSE_WARN,
686 NULL,
687 NULL);
688
689 /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we used USEC_INFINITY
690 * like everywhere else. */
691 if (arg_default_timeout_start_usec <= 0)
692 arg_default_timeout_start_usec = USEC_INFINITY;
693 if (arg_default_timeout_stop_usec <= 0)
694 arg_default_timeout_stop_usec = USEC_INFINITY;
695
696 return 0;
697 }
698
699 static void set_manager_defaults(Manager *m) {
700
701 assert(m);
702
703 /* Propagates the various default unit property settings into the manager object, i.e. properties that do not
704 * affect the manager itself, but are just what newly allocated units will have set if they haven't set
705 * anything else. (Also see set_manager_settings() for the settings that affect the manager's own behaviour) */
706
707 m->default_timer_accuracy_usec = arg_default_timer_accuracy_usec;
708 m->default_std_output = arg_default_std_output;
709 m->default_std_error = arg_default_std_error;
710 m->default_timeout_start_usec = arg_default_timeout_start_usec;
711 m->default_timeout_stop_usec = arg_default_timeout_stop_usec;
712 m->default_timeout_abort_usec = arg_default_timeout_abort_usec;
713 m->default_timeout_abort_set = arg_default_timeout_abort_set;
714 m->default_restart_usec = arg_default_restart_usec;
715 m->default_start_limit_interval = arg_default_start_limit_interval;
716 m->default_start_limit_burst = arg_default_start_limit_burst;
717
718 /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU
719 * controller to be enabled, so the default is to enable it unless we got told otherwise. */
720 if (arg_default_cpu_accounting >= 0)
721 m->default_cpu_accounting = arg_default_cpu_accounting;
722 else
723 m->default_cpu_accounting = cpu_accounting_is_cheap();
724
725 m->default_io_accounting = arg_default_io_accounting;
726 m->default_ip_accounting = arg_default_ip_accounting;
727 m->default_blockio_accounting = arg_default_blockio_accounting;
728 m->default_memory_accounting = arg_default_memory_accounting;
729 m->default_tasks_accounting = arg_default_tasks_accounting;
730 m->default_tasks_max = arg_default_tasks_max;
731 m->default_oom_policy = arg_default_oom_policy;
732
733 (void) manager_set_default_rlimits(m, arg_default_rlimit);
734
735 (void) manager_default_environment(m);
736 (void) manager_transient_environment_add(m, arg_default_environment);
737 }
738
739 static void set_manager_settings(Manager *m) {
740
741 assert(m);
742
743 /* Propagates the various manager settings into the manager object, i.e. properties that
744 * effect the manager itself (as opposed to just being inherited into newly allocated
745 * units, see set_manager_defaults() above). */
746
747 m->confirm_spawn = arg_confirm_spawn;
748 m->service_watchdogs = arg_service_watchdogs;
749 m->cad_burst_action = arg_cad_burst_action;
750
751 manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
752 manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
753 manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
754
755 manager_set_show_status(m, arg_show_status, "commandline");
756 m->status_unit_format = arg_status_unit_format;
757 }
758
759 static int parse_argv(int argc, char *argv[]) {
760 enum {
761 ARG_LOG_LEVEL = 0x100,
762 ARG_LOG_TARGET,
763 ARG_LOG_COLOR,
764 ARG_LOG_LOCATION,
765 ARG_LOG_TIME,
766 ARG_UNIT,
767 ARG_SYSTEM,
768 ARG_USER,
769 ARG_TEST,
770 ARG_NO_PAGER,
771 ARG_VERSION,
772 ARG_DUMP_CONFIGURATION_ITEMS,
773 ARG_DUMP_BUS_PROPERTIES,
774 ARG_BUS_INTROSPECT,
775 ARG_DUMP_CORE,
776 ARG_CRASH_CHVT,
777 ARG_CRASH_SHELL,
778 ARG_CRASH_REBOOT,
779 ARG_CONFIRM_SPAWN,
780 ARG_SHOW_STATUS,
781 ARG_DESERIALIZE,
782 ARG_SWITCHED_ROOT,
783 ARG_DEFAULT_STD_OUTPUT,
784 ARG_DEFAULT_STD_ERROR,
785 ARG_MACHINE_ID,
786 ARG_SERVICE_WATCHDOGS,
787 };
788
789 static const struct option options[] = {
790 { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
791 { "log-target", required_argument, NULL, ARG_LOG_TARGET },
792 { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
793 { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
794 { "log-time", optional_argument, NULL, ARG_LOG_TIME },
795 { "unit", required_argument, NULL, ARG_UNIT },
796 { "system", no_argument, NULL, ARG_SYSTEM },
797 { "user", no_argument, NULL, ARG_USER },
798 { "test", no_argument, NULL, ARG_TEST },
799 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
800 { "help", no_argument, NULL, 'h' },
801 { "version", no_argument, NULL, ARG_VERSION },
802 { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS },
803 { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES },
804 { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT },
805 { "dump-core", optional_argument, NULL, ARG_DUMP_CORE },
806 { "crash-chvt", required_argument, NULL, ARG_CRASH_CHVT },
807 { "crash-shell", optional_argument, NULL, ARG_CRASH_SHELL },
808 { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT },
809 { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN },
810 { "show-status", optional_argument, NULL, ARG_SHOW_STATUS },
811 { "deserialize", required_argument, NULL, ARG_DESERIALIZE },
812 { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT },
813 { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, },
814 { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, },
815 { "machine-id", required_argument, NULL, ARG_MACHINE_ID },
816 { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS },
817 {}
818 };
819
820 int c, r;
821 bool user_arg_seen = false;
822
823 assert(argc >= 1);
824 assert(argv);
825
826 if (getpid_cached() == 1)
827 opterr = 0;
828
829 while ((c = getopt_long(argc, argv, "hDbsz:", options, NULL)) >= 0)
830
831 switch (c) {
832
833 case ARG_LOG_LEVEL:
834 r = log_set_max_level_from_string(optarg);
835 if (r < 0)
836 return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
837
838 break;
839
840 case ARG_LOG_TARGET:
841 r = log_set_target_from_string(optarg);
842 if (r < 0)
843 return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
844
845 break;
846
847 case ARG_LOG_COLOR:
848
849 if (optarg) {
850 r = log_show_color_from_string(optarg);
851 if (r < 0)
852 return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
853 optarg);
854 } else
855 log_show_color(true);
856
857 break;
858
859 case ARG_LOG_LOCATION:
860 if (optarg) {
861 r = log_show_location_from_string(optarg);
862 if (r < 0)
863 return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
864 optarg);
865 } else
866 log_show_location(true);
867
868 break;
869
870 case ARG_LOG_TIME:
871
872 if (optarg) {
873 r = log_show_time_from_string(optarg);
874 if (r < 0)
875 return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
876 optarg);
877 } else
878 log_show_time(true);
879
880 break;
881
882 case ARG_DEFAULT_STD_OUTPUT:
883 r = exec_output_from_string(optarg);
884 if (r < 0)
885 return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
886 optarg);
887 arg_default_std_output = r;
888 break;
889
890 case ARG_DEFAULT_STD_ERROR:
891 r = exec_output_from_string(optarg);
892 if (r < 0)
893 return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
894 optarg);
895 arg_default_std_error = r;
896 break;
897
898 case ARG_UNIT:
899 r = free_and_strdup(&arg_default_unit, optarg);
900 if (r < 0)
901 return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
902
903 break;
904
905 case ARG_SYSTEM:
906 arg_system = true;
907 break;
908
909 case ARG_USER:
910 arg_system = false;
911 user_arg_seen = true;
912 break;
913
914 case ARG_TEST:
915 arg_action = ACTION_TEST;
916 break;
917
918 case ARG_NO_PAGER:
919 arg_pager_flags |= PAGER_DISABLE;
920 break;
921
922 case ARG_VERSION:
923 arg_action = ACTION_VERSION;
924 break;
925
926 case ARG_DUMP_CONFIGURATION_ITEMS:
927 arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
928 break;
929
930 case ARG_DUMP_BUS_PROPERTIES:
931 arg_action = ACTION_DUMP_BUS_PROPERTIES;
932 break;
933
934 case ARG_BUS_INTROSPECT:
935 arg_bus_introspect = optarg;
936 arg_action = ACTION_BUS_INTROSPECT;
937 break;
938
939 case ARG_DUMP_CORE:
940 if (!optarg)
941 arg_dump_core = true;
942 else {
943 r = parse_boolean(optarg);
944 if (r < 0)
945 return log_error_errno(r, "Failed to parse dump core boolean: \"%s\": %m",
946 optarg);
947 arg_dump_core = r;
948 }
949 break;
950
951 case ARG_CRASH_CHVT:
952 r = parse_crash_chvt(optarg, &arg_crash_chvt);
953 if (r < 0)
954 return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
955 optarg);
956 break;
957
958 case ARG_CRASH_SHELL:
959 if (!optarg)
960 arg_crash_shell = true;
961 else {
962 r = parse_boolean(optarg);
963 if (r < 0)
964 return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m",
965 optarg);
966 arg_crash_shell = r;
967 }
968 break;
969
970 case ARG_CRASH_REBOOT:
971 if (!optarg)
972 arg_crash_reboot = true;
973 else {
974 r = parse_boolean(optarg);
975 if (r < 0)
976 return log_error_errno(r, "Failed to parse crash shell boolean: \"%s\": %m",
977 optarg);
978 arg_crash_reboot = r;
979 }
980 break;
981
982 case ARG_CONFIRM_SPAWN:
983 arg_confirm_spawn = mfree(arg_confirm_spawn);
984
985 r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
986 if (r < 0)
987 return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
988 optarg);
989 break;
990
991 case ARG_SERVICE_WATCHDOGS:
992 r = parse_boolean(optarg);
993 if (r < 0)
994 return log_error_errno(r, "Failed to parse service watchdogs boolean: \"%s\": %m",
995 optarg);
996 arg_service_watchdogs = r;
997 break;
998
999 case ARG_SHOW_STATUS:
1000 if (optarg) {
1001 r = parse_show_status(optarg, &arg_show_status);
1002 if (r < 0)
1003 return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
1004 optarg);
1005 } else
1006 arg_show_status = SHOW_STATUS_YES;
1007 break;
1008
1009 case ARG_DESERIALIZE: {
1010 int fd;
1011 FILE *f;
1012
1013 r = safe_atoi(optarg, &fd);
1014 if (r < 0)
1015 log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg);
1016 if (fd < 0)
1017 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1018 "Invalid deserialize fd: %d",
1019 fd);
1020
1021 (void) fd_cloexec(fd, true);
1022
1023 f = fdopen(fd, "r");
1024 if (!f)
1025 return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
1026
1027 safe_fclose(arg_serialization);
1028 arg_serialization = f;
1029
1030 break;
1031 }
1032
1033 case ARG_SWITCHED_ROOT:
1034 arg_switched_root = true;
1035 break;
1036
1037 case ARG_MACHINE_ID:
1038 r = set_machine_id(optarg);
1039 if (r < 0)
1040 return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
1041 break;
1042
1043 case 'h':
1044 arg_action = ACTION_HELP;
1045 break;
1046
1047 case 'D':
1048 log_set_max_level(LOG_DEBUG);
1049 break;
1050
1051 case 'b':
1052 case 's':
1053 case 'z':
1054 /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
1055 * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
1056 */
1057 case '?':
1058 if (getpid_cached() != 1)
1059 return -EINVAL;
1060 else
1061 return 0;
1062
1063 default:
1064 assert_not_reached("Unhandled option code.");
1065 }
1066
1067 if (optind < argc && getpid_cached() != 1)
1068 /* Hmm, when we aren't run as init system let's complain about excess arguments */
1069 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
1070
1071 if (arg_action == ACTION_RUN && !arg_system && !user_arg_seen)
1072 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1073 "Explicit --user argument required to run as user manager.");
1074
1075 return 0;
1076 }
1077
1078 static int help(void) {
1079 _cleanup_free_ char *link = NULL;
1080 int r;
1081
1082 r = terminal_urlify_man("systemd", "1", &link);
1083 if (r < 0)
1084 return log_oom();
1085
1086 printf("%s [OPTIONS...]\n\n"
1087 "%sStarts and monitors system and user services.%s\n\n"
1088 "This program takes no positional arguments.\n\n"
1089 "%sOptions%s:\n"
1090 " -h --help Show this help\n"
1091 " --version Show version\n"
1092 " --test Determine initial transaction, dump it and exit\n"
1093 " --system In combination with --test: operate as system service manager\n"
1094 " --user In combination with --test: operate as per-user service manager\n"
1095 " --no-pager Do not pipe output into a pager\n"
1096 " --dump-configuration-items Dump understood unit configuration items\n"
1097 " --dump-bus-properties Dump exposed bus properties\n"
1098 " --bus-introspect=PATH Write XML introspection data\n"
1099 " --unit=UNIT Set default unit\n"
1100 " --dump-core[=BOOL] Dump core on crash\n"
1101 " --crash-vt=NR Change to specified VT on crash\n"
1102 " --crash-reboot[=BOOL] Reboot on crash\n"
1103 " --crash-shell[=BOOL] Run shell on crash\n"
1104 " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
1105 " --show-status[=BOOL] Show status updates on the console during bootup\n"
1106 " --log-target=TARGET Set log target (console, journal, kmsg, journal-or-kmsg, null)\n"
1107 " --log-level=LEVEL Set log level (debug, info, notice, warning, err, crit, alert, emerg)\n"
1108 " --log-color[=BOOL] Highlight important log messages\n"
1109 " --log-location[=BOOL] Include code location in log messages\n"
1110 " --log-time[=BOOL] Prefix log messages with current time\n"
1111 " --default-standard-output= Set default standard output for services\n"
1112 " --default-standard-error= Set default standard error output for services\n"
1113 "\nSee the %s for details.\n",
1114 program_invocation_short_name,
1115 ansi_highlight(),
1116 ansi_normal(),
1117 ansi_underline(),
1118 ansi_normal(),
1119 link);
1120
1121 return 0;
1122 }
1123
1124 static int prepare_reexecute(
1125 Manager *m,
1126 FILE **ret_f,
1127 FDSet **ret_fds,
1128 bool switching_root) {
1129
1130 _cleanup_fdset_free_ FDSet *fds = NULL;
1131 _cleanup_fclose_ FILE *f = NULL;
1132 int r;
1133
1134 assert(m);
1135 assert(ret_f);
1136 assert(ret_fds);
1137
1138 r = manager_open_serialization(m, &f);
1139 if (r < 0)
1140 return log_error_errno(r, "Failed to create serialization file: %m");
1141
1142 /* Make sure nothing is really destructed when we shut down */
1143 m->n_reloading++;
1144 bus_manager_send_reloading(m, true);
1145
1146 fds = fdset_new();
1147 if (!fds)
1148 return log_oom();
1149
1150 r = manager_serialize(m, f, fds, switching_root);
1151 if (r < 0)
1152 return r;
1153
1154 if (fseeko(f, 0, SEEK_SET) == (off_t) -1)
1155 return log_error_errno(errno, "Failed to rewind serialization fd: %m");
1156
1157 r = fd_cloexec(fileno(f), false);
1158 if (r < 0)
1159 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
1160
1161 r = fdset_cloexec(fds, false);
1162 if (r < 0)
1163 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
1164
1165 *ret_f = TAKE_PTR(f);
1166 *ret_fds = TAKE_PTR(fds);
1167
1168 return 0;
1169 }
1170
/* Raises the fs.file-max and fs.nr_open sysctls as far as the kernel accepts (each only if enabled at
 * build time via BUMP_PROC_SYS_FS_FILE_MAX / BUMP_PROC_SYS_FS_NR_OPEN). All failures are logged and
 * otherwise ignored. */
static void bump_file_max_and_nr_open(void) {

        /* Rationale: on current kernels large numbers of file descriptors are no longer a performance
         * problem and their memory is properly tracked by memcg, so counting and limiting them in two
         * additional layers is unnecessary and only complicates things. By maxing out these two
         * sysctls, RLIMIT_NOFILE (soft + hard) is left as the only limit that really matters. */

#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
        int r;
#endif

#if BUMP_PROC_SYS_FS_FILE_MAX
        /* Since kernel 5.2 the maximum accepted here is LONG_MAX, so use that. (Previously things were
         * different, but the operation would fail silently.) */
        r = sysctl_writef("fs/file-max", "%li\n", LONG_MAX);
        if (r < 0)
                log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m");
#endif

#if BUMP_PROC_SYS_FS_NR_OPEN
        int candidate = INT_MAX;

        /* The kernel enforces minimum and maximum values on fs.nr_open, but does not expose them. The
         * maximum is architecture dependent and an implementation detail we do not want to replicate
         * in userspace. Hence we can only probe: start at INT_MAX and keep halving the value until the
         * kernel accepts it. */

        for (;;) {
                int current;

                /* Round down to a multiple of the pointer size. */
                candidate &= ~(__SIZEOF_POINTER__ - 1);
                if (candidate < 1024) {
                        log_warning("Can't bump fs.nr_open, value too small.");
                        break;
                }

                current = read_nr_open();
                if (current < 0) {
                        log_error_errno(current, "Failed to read fs.nr_open: %m");
                        break;
                }
                if (current >= candidate) {
                        /* The configured value is already at least as large as what we would write. */
                        log_debug("Skipping bump, value is already larger.");
                        break;
                }

                r = sysctl_writef("fs/nr_open", "%i\n", candidate);
                if (r >= 0) {
                        log_debug("Successfully bumped fs.nr_open to %i", candidate);
                        break;
                }
                if (r != -EINVAL) {
                        log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
                        break;
                }

                /* -EINVAL: the kernel considers the value out of range, retry with half of it. */
                log_debug("Couldn't write fs.nr_open as %i, halving it.", candidate);
                candidate /= 2;
        }
#endif
}
1236
/* Raises RLIMIT_NOFILE (soft and hard) for ourselves to the value reported by read_nr_open(), but never
 * below what *saved_rlimit holds.
 *
 * Returns 0 if nothing had to be changed or the limit was applied, otherwise the result of
 * log_warning_errno() for the failed setrlimit_closest() call. */
static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
        struct rlimit wanted;
        bool soft_sufficient, hard_sufficient;
        int r, nr_open;

        /* The absolute per-process limit the kernel enforces. */
        nr_open = read_nr_open();

        /* Work out the limits we want. Never go below what we inherited. */
        wanted = (struct rlimit) {
                .rlim_cur = MAX((rlim_t) nr_open, saved_rlimit->rlim_cur),
                .rlim_max = MAX((rlim_t) nr_open, saved_rlimit->rlim_max),
        };

        /* Nothing to do if the inherited limits already cover what we want. */
        hard_sufficient = saved_rlimit->rlim_max >= wanted.rlim_max;
        soft_sufficient = saved_rlimit->rlim_cur >= wanted.rlim_cur;
        if (hard_sufficient && soft_sufficient) {
                log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
                return 0;
        }

        /* Raise both the hard and the soft limit as far as we can get. */
        r = setrlimit_closest(RLIMIT_NOFILE, &wanted);
        if (r < 0)
                return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");

        return 0;
}
1265
1266 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
1267 struct rlimit new_rlimit;
1268 uint64_t mm;
1269 int r;
1270
1271 /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK which should
1272 * normally disable such checks. We need them to implement IPAddressAllow= and IPAddressDeny=, hence let's bump
1273 * the value high enough for our user. */
1274
1275 /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
1276 * must be unsigned, hence this is a given, but let's make this clear here. */
1277 assert_cc(RLIM_INFINITY > 0);
1278
1279 mm = physical_memory() / 8; /* Let's scale how much we allow to be locked by the amount of physical
1280 * RAM. We allow an eighth to be locked by us, just to pick a value. */
1281
1282 new_rlimit = (struct rlimit) {
1283 .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
1284 .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
1285 };
1286
1287 if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
1288 saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
1289 log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
1290 return 0;
1291 }
1292
1293 r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
1294 if (r < 0)
1295 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
1296
1297 return 0;
1298 }
1299
/* Logs a warning if /usr looks like an empty directory, i.e. if it is apparently neither on the same
 * file system as / nor mounted yet. Any other result of dir_is_empty() (including errors) is silently
 * accepted. */
static void test_usr(void) {

        if (dir_is_empty("/usr") > 0)
                log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
                            "Some things will probably break (sometimes even silently) in mysterious ways. "
                            "Consult http://freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
}
1311
1312 static int enforce_syscall_archs(Set *archs) {
1313 #if HAVE_SECCOMP
1314 int r;
1315
1316 if (!is_seccomp_available())
1317 return 0;
1318
1319 r = seccomp_restrict_archs(arg_syscall_archs);
1320 if (r < 0)
1321 return log_error_errno(r, "Failed to enforce system call architecture restrication: %m");
1322 #endif
1323 return 0;
1324 }
1325
1326 static int status_welcome(void) {
1327 _cleanup_free_ char *pretty_name = NULL, *ansi_color = NULL;
1328 int r;
1329
1330 if (!show_status_on(arg_show_status))
1331 return 0;
1332
1333 r = parse_os_release(NULL,
1334 "PRETTY_NAME", &pretty_name,
1335 "ANSI_COLOR", &ansi_color,
1336 NULL);
1337 if (r < 0)
1338 log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1339 "Failed to read os-release file, ignoring: %m");
1340
1341 if (log_get_show_color())
1342 return status_printf(NULL, 0,
1343 "\nWelcome to \x1B[%sm%s\x1B[0m!\n",
1344 isempty(ansi_color) ? "1" : ansi_color,
1345 isempty(pretty_name) ? "Linux" : pretty_name);
1346 else
1347 return status_printf(NULL, 0,
1348 "\nWelcome to %s!\n",
1349 isempty(pretty_name) ? "Linux" : pretty_name);
1350 }
1351
1352 static int write_container_id(void) {
1353 const char *c;
1354 int r;
1355
1356 c = getenv("container");
1357 if (isempty(c))
1358 return 0;
1359
1360 RUN_WITH_UMASK(0022)
1361 r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
1362 if (r < 0)
1363 return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
1364
1365 return 1;
1366 }
1367
1368 static int bump_unix_max_dgram_qlen(void) {
1369 _cleanup_free_ char *qlen = NULL;
1370 unsigned long v;
1371 int r;
1372
1373 /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set the value
1374 * really really early during boot, so that it is actually applied to all our sockets, including the
1375 * $NOTIFY_SOCKET one. */
1376
1377 r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
1378 if (r < 0)
1379 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to read AF_UNIX datagram queue length, ignoring: %m");
1380
1381 r = safe_atolu(qlen, &v);
1382 if (r < 0)
1383 return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
1384
1385 if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
1386 return 0;
1387
1388 r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN);
1389 if (r < 0)
1390 return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1391 "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
1392
1393 return 1;
1394 }
1395
1396 static int fixup_environment(void) {
1397 _cleanup_free_ char *term = NULL;
1398 const char *t;
1399 int r;
1400
1401 /* Only fix up the environment when we are started as PID 1 */
1402 if (getpid_cached() != 1)
1403 return 0;
1404
1405 /* We expect the environment to be set correctly if run inside a container. */
1406 if (detect_container() > 0)
1407 return 0;
1408
1409 /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the backend
1410 * device used by the console. We try to make a better guess here since some consoles might not have support
1411 * for color mode for example.
1412 *
1413 * However if TERM was configured through the kernel command line then leave it alone. */
1414 r = proc_cmdline_get_key("TERM", 0, &term);
1415 if (r < 0)
1416 return r;
1417
1418 t = term ?: default_term_for_tty("/dev/console");
1419
1420 if (setenv("TERM", t, 1) < 0)
1421 return -errno;
1422
1423 /* The kernels sets HOME=/ for init. Let's undo this. */
1424 if (path_equal_ptr(getenv("HOME"), "/"))
1425 assert_se(unsetenv("HOME") == 0);
1426
1427 return 0;
1428 }
1429
/* SysV compatibility: when built with HAVE_SYSV_COMPAT, not running as PID 1, and invoked under a name
 * containing "init", hand the invocation over to the systemctl binary (passing argv through). In that
 * case this function does not return: if the exec fails the process exits with EXIT_FAILURE. */
static void redirect_telinit(int argc, char *argv[]) {

#if HAVE_SYSV_COMPAT
        /* Calling init as a user is identical to calling telinit; the real init (PID 1) and
         * invocations under any other name carry on normally. */
        if (getpid_cached() == 1 || !strstr(program_invocation_short_name, "init"))
                return;

        execv(SYSTEMCTL_BINARY_PATH, argv);

        /* Only reached if execv() failed. */
        log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m");
        exit(EXIT_FAILURE);
#endif
}
1446
1447 static int become_shutdown(
1448 const char *shutdown_verb,
1449 int retval) {
1450
1451 char log_level[DECIMAL_STR_MAX(int) + 1],
1452 exit_code[DECIMAL_STR_MAX(uint8_t) + 1],
1453 timeout[DECIMAL_STR_MAX(usec_t) + 1];
1454
1455 const char* command_line[13] = {
1456 SYSTEMD_SHUTDOWN_BINARY_PATH,
1457 shutdown_verb,
1458 "--timeout", timeout,
1459 "--log-level", log_level,
1460 "--log-target",
1461 };
1462
1463 _cleanup_strv_free_ char **env_block = NULL;
1464 size_t pos = 7;
1465 int r;
1466 usec_t watchdog_timer = 0;
1467
1468 assert(shutdown_verb);
1469 assert(!command_line[pos]);
1470 env_block = strv_copy(environ);
1471
1472 xsprintf(log_level, "%d", log_get_max_level());
1473 xsprintf(timeout, "%" PRI_USEC "us", arg_default_timeout_stop_usec);
1474
1475 switch (log_get_target()) {
1476
1477 case LOG_TARGET_KMSG:
1478 case LOG_TARGET_JOURNAL_OR_KMSG:
1479 case LOG_TARGET_SYSLOG_OR_KMSG:
1480 command_line[pos++] = "kmsg";
1481 break;
1482
1483 case LOG_TARGET_NULL:
1484 command_line[pos++] = "null";
1485 break;
1486
1487 case LOG_TARGET_CONSOLE:
1488 default:
1489 command_line[pos++] = "console";
1490 break;
1491 };
1492
1493 if (log_get_show_color())
1494 command_line[pos++] = "--log-color";
1495
1496 if (log_get_show_location())
1497 command_line[pos++] = "--log-location";
1498
1499 if (log_get_show_time())
1500 command_line[pos++] = "--log-time";
1501
1502 if (streq(shutdown_verb, "exit")) {
1503 command_line[pos++] = "--exit-code";
1504 command_line[pos++] = exit_code;
1505 xsprintf(exit_code, "%d", retval);
1506 }
1507
1508 assert(pos < ELEMENTSOF(command_line));
1509
1510 if (streq(shutdown_verb, "reboot"))
1511 watchdog_timer = arg_reboot_watchdog;
1512 else if (streq(shutdown_verb, "kexec"))
1513 watchdog_timer = arg_kexec_watchdog;
1514
1515 if (watchdog_timer > 0 && watchdog_timer != USEC_INFINITY) {
1516
1517 char *e;
1518
1519 /* If we reboot or kexec let's set the shutdown
1520 * watchdog and tell the shutdown binary to
1521 * repeatedly ping it */
1522 r = watchdog_set_timeout(&watchdog_timer);
1523 watchdog_close(r < 0);
1524
1525 /* Tell the binary how often to ping, ignore failure */
1526 if (asprintf(&e, "WATCHDOG_USEC="USEC_FMT, watchdog_timer) > 0)
1527 (void) strv_consume(&env_block, e);
1528
1529 if (arg_watchdog_device &&
1530 asprintf(&e, "WATCHDOG_DEVICE=%s", arg_watchdog_device) > 0)
1531 (void) strv_consume(&env_block, e);
1532 } else
1533 watchdog_close(true);
1534
1535 /* Avoid the creation of new processes forked by the
1536 * kernel; at this point, we will not listen to the
1537 * signals anyway */
1538 if (detect_container() <= 0)
1539 (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);
1540
1541 execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
1542 return -errno;
1543 }
1544
1545 static void initialize_clock(void) {
1546 int r;
1547
1548 /* This is called very early on, before we parse the kernel command line or otherwise figure out why
1549 * we are running, but only once. */
1550
1551 if (clock_is_localtime(NULL) > 0) {
1552 int min;
1553
1554 /*
1555 * The very first call of settimeofday() also does a time warp in the kernel.
1556 *
1557 * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to take care
1558 * of maintaining the RTC and do all adjustments. This matches the behavior of Windows, which leaves
1559 * the RTC alone if the registry tells that the RTC runs in UTC.
1560 */
1561 r = clock_set_timezone(&min);
1562 if (r < 0)
1563 log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
1564 else
1565 log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
1566
1567 } else if (!in_initrd())
1568 /*
1569 * Do a dummy very first call to seal the kernel's time warp magic.
1570 *
1571 * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with LOCAL, but the
1572 * real system could be set up that way. In such case, we need to delay the time-warp or the sealing
1573 * until we reach the real system.
1574 *
1575 * Do no set the kernel's timezone. The concept of local time cannot be supported reliably, the time
1576 * will jump or be incorrect at every daylight saving time change. All kernel local time concepts will
1577 * be treated as UTC that way.
1578 */
1579 (void) clock_reset_timewarp();
1580
1581 r = clock_apply_epoch();
1582 if (r < 0)
1583 log_error_errno(r, "Current system time is before build time, but cannot correct: %m");
1584 else if (r > 0)
1585 log_info("System time before build time, advancing clock.");
1586 }
1587
1588 static void apply_clock_update(void) {
1589 struct timespec ts;
1590
1591 /* This is called later than initialize_clock(), i.e. after we parsed configuration files/kernel
1592 * command line and such. */
1593
1594 if (arg_clock_usec == 0)
1595 return;
1596
1597 if (getpid_cached() != 1)
1598 return;
1599
1600 if (clock_settime(CLOCK_REALTIME, timespec_store(&ts, arg_clock_usec)) < 0)
1601 log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
1602 else {
1603 char buf[FORMAT_TIMESTAMP_MAX];
1604
1605 log_info("Set system clock to %s, as specified on the kernel command line.",
1606 format_timestamp(buf, sizeof(buf), arg_clock_usec));
1607 }
1608 }
1609
1610 static void cmdline_take_random_seed(void) {
1611 size_t suggested;
1612 int r;
1613
1614 if (arg_random_seed_size == 0)
1615 return;
1616
1617 if (getpid_cached() != 1)
1618 return;
1619
1620 assert(arg_random_seed);
1621 suggested = random_pool_size();
1622
1623 if (arg_random_seed_size < suggested)
1624 log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
1625 arg_random_seed_size, suggested);
1626
1627 r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
1628 if (r < 0) {
1629 log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
1630 return;
1631 }
1632
1633 log_notice("Successfully credited entropy passed on kernel command line.\n"
1634 "Note that the seed provided this way is accessible to unprivileged programs. This functionality should not be used outside of testing environments.");
1635 }
1636
/* Core dump setup for PID 1 (only when built with ENABLE_COREDUMP): lifts RLIMIT_CORE to infinity
 * and, unless skip_setup is set, calls disable_coredumps(). */
static void initialize_coredump(bool skip_setup) {
#if ENABLE_COREDUMP
        if (getpid_cached() != 1)
                return;

        /* Do not limit the core dump size, so that coredump handlers such as systemd-coredump (which
         * honour the limit) process core dumps of system services by default. */
        if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
                log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");

        if (skip_setup)
                return;

        /* At the same time turn off the core_pattern logic by default, so that no coredumps are
         * stored until the systemd-coredump tool is enabled via sysctl. This can still be changed via
         * the kernel command line later, so that core dumps can be generated during early startup and
         * in the initramfs. */
        disable_coredumps();
#endif
}
1655
1656 static void initialize_core_pattern(bool skip_setup) {
1657 int r;
1658
1659 if (skip_setup || !arg_early_core_pattern)
1660 return;
1661
1662 if (getpid_cached() != 1)
1663 return;
1664
1665 r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
1666 if (r < 0)
1667 log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern);
1668 }
1669
1670 static void update_cpu_affinity(bool skip_setup) {
1671 _cleanup_free_ char *mask = NULL;
1672
1673 if (skip_setup || !arg_cpu_affinity.set)
1674 return;
1675
1676 assert(arg_cpu_affinity.allocated > 0);
1677
1678 mask = cpu_set_to_string(&arg_cpu_affinity);
1679 log_debug("Setting CPU affinity to %s.", strnull(mask));
1680
1681 if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
1682 log_warning_errno(errno, "Failed to set CPU affinity: %m");
1683 }
1684
1685 static void update_numa_policy(bool skip_setup) {
1686 int r;
1687 _cleanup_free_ char *nodes = NULL;
1688 const char * policy = NULL;
1689
1690 if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
1691 return;
1692
1693 if (DEBUG_LOGGING) {
1694 policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
1695 nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
1696 log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy), strnull(nodes));
1697 }
1698
1699 r = apply_numa_policy(&arg_numa_policy);
1700 if (r == -EOPNOTSUPP)
1701 log_debug_errno(r, "NUMA support not available, ignoring.");
1702 else if (r < 0)
1703 log_warning_errno(r, "Failed to set NUMA memory policy: %m");
1704 }
1705
1706 static void do_reexecute(
1707 int argc,
1708 char *argv[],
1709 const struct rlimit *saved_rlimit_nofile,
1710 const struct rlimit *saved_rlimit_memlock,
1711 FDSet *fds,
1712 const char *switch_root_dir,
1713 const char *switch_root_init,
1714 const char **ret_error_message) {
1715
1716 unsigned i, j, args_size;
1717 const char **args;
1718 int r;
1719
1720 assert(saved_rlimit_nofile);
1721 assert(saved_rlimit_memlock);
1722 assert(ret_error_message);
1723
1724 /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get rebooted while
1725 * we do that */
1726 watchdog_close(true);
1727
1728 /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
1729 * the kernel default to its child processes */
1730 if (saved_rlimit_nofile->rlim_cur != 0)
1731 (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
1732 if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
1733 (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
1734
1735 if (switch_root_dir) {
1736 /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
1737 * SIGCHLD for them after deserializing. */
1738 broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec);
1739
1740 /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */
1741 r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE);
1742 if (r < 0)
1743 log_error_errno(r, "Failed to switch root, trying to continue: %m");
1744 }
1745
1746 args_size = MAX(6, argc+1);
1747 args = newa(const char*, args_size);
1748
1749 if (!switch_root_init) {
1750 char sfd[DECIMAL_STR_MAX(int) + 1];
1751
1752 /* First try to spawn ourselves with the right path, and with full serialization. We do this only if
1753 * the user didn't specify an explicit init to spawn. */
1754
1755 assert(arg_serialization);
1756 assert(fds);
1757
1758 xsprintf(sfd, "%i", fileno(arg_serialization));
1759
1760 i = 0;
1761 args[i++] = SYSTEMD_BINARY_PATH;
1762 if (switch_root_dir)
1763 args[i++] = "--switched-root";
1764 args[i++] = arg_system ? "--system" : "--user";
1765 args[i++] = "--deserialize";
1766 args[i++] = sfd;
1767 args[i++] = NULL;
1768
1769 assert(i <= args_size);
1770
1771 /*
1772 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do this is on
1773 * its own on exec(), but it will do it on exit(). Hence, to ensure we get a summary here, fork() off
1774 * a child, let it exit() cleanly, so that it prints the summary, and wait() for it in the parent,
1775 * before proceeding into the exec().
1776 */
1777 valgrind_summary_hack();
1778
1779 (void) execv(args[0], (char* const*) args);
1780 log_debug_errno(errno, "Failed to execute our own binary, trying fallback: %m");
1781 }
1782
1783 /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and envp[]. (Well,
1784 * modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[], but let's hope that
1785 * doesn't matter.) */
1786
1787 arg_serialization = safe_fclose(arg_serialization);
1788 fds = fdset_free(fds);
1789
1790 /* Reopen the console */
1791 (void) make_console_stdio();
1792
1793 for (j = 1, i = 1; j < (unsigned) argc; j++)
1794 args[i++] = argv[j];
1795 args[i++] = NULL;
1796 assert(i <= args_size);
1797
1798 /* Re-enable any blocked signals, especially important if we switch from initial ramdisk to init=... */
1799 (void) reset_all_signal_handlers();
1800 (void) reset_signal_mask();
1801 (void) rlimit_nofile_safe();
1802
1803 if (switch_root_init) {
1804 args[0] = switch_root_init;
1805 (void) execve(args[0], (char* const*) args, saved_env);
1806 log_warning_errno(errno, "Failed to execute configured init, trying fallback: %m");
1807 }
1808
1809 args[0] = "/sbin/init";
1810 (void) execv(args[0], (char* const*) args);
1811 r = -errno;
1812
1813 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
1814 ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
1815 "Failed to execute /sbin/init");
1816
1817 if (r == -ENOENT) {
1818 log_warning("No /sbin/init, trying fallback");
1819
1820 args[0] = "/bin/sh";
1821 args[1] = NULL;
1822 (void) execve(args[0], (char* const*) args, saved_env);
1823 log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m");
1824 } else
1825 log_warning_errno(r, "Failed to execute /sbin/init, giving up: %m");
1826
1827 *ret_error_message = "Failed to execute fallback shell";
1828 }
1829
1830 static int invoke_main_loop(
1831 Manager *m,
1832 const struct rlimit *saved_rlimit_nofile,
1833 const struct rlimit *saved_rlimit_memlock,
1834 bool *ret_reexecute,
1835 int *ret_retval, /* Return parameters relevant for shutting down */
1836 const char **ret_shutdown_verb, /* … */
1837 FDSet **ret_fds, /* Return parameters for reexecuting */
1838 char **ret_switch_root_dir, /* … */
1839 char **ret_switch_root_init, /* … */
1840 const char **ret_error_message) {
1841
1842 int r;
1843
1844 assert(m);
1845 assert(saved_rlimit_nofile);
1846 assert(saved_rlimit_memlock);
1847 assert(ret_reexecute);
1848 assert(ret_retval);
1849 assert(ret_shutdown_verb);
1850 assert(ret_fds);
1851 assert(ret_switch_root_dir);
1852 assert(ret_switch_root_init);
1853 assert(ret_error_message);
1854
1855 for (;;) {
1856 r = manager_loop(m);
1857 if (r < 0) {
1858 *ret_error_message = "Failed to run main loop";
1859 return log_emergency_errno(r, "Failed to run main loop: %m");
1860 }
1861
1862 switch ((ManagerObjective) r) {
1863
1864 case MANAGER_RELOAD: {
1865 LogTarget saved_log_target;
1866 int saved_log_level;
1867
1868 log_info("Reloading.");
1869
1870 /* First, save any overridden log level/target, then parse the configuration file, which might
1871 * change the log level to new settings. */
1872
1873 saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
1874 saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
1875
1876 (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
1877
1878 set_manager_defaults(m);
1879 set_manager_settings(m);
1880
1881 update_cpu_affinity(false);
1882 update_numa_policy(false);
1883
1884 if (saved_log_level >= 0)
1885 manager_override_log_level(m, saved_log_level);
1886 if (saved_log_target >= 0)
1887 manager_override_log_target(m, saved_log_target);
1888
1889 r = manager_reload(m);
1890 if (r < 0)
1891 /* Reloading failed before the point of no return. Let's continue running as if nothing happened. */
1892 m->objective = MANAGER_OK;
1893
1894 break;
1895 }
1896
1897 case MANAGER_REEXECUTE:
1898
1899 r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
1900 if (r < 0) {
1901 *ret_error_message = "Failed to prepare for reexecution";
1902 return r;
1903 }
1904
1905 log_notice("Reexecuting.");
1906
1907 *ret_reexecute = true;
1908 *ret_retval = EXIT_SUCCESS;
1909 *ret_shutdown_verb = NULL;
1910 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1911
1912 return 0;
1913
1914 case MANAGER_SWITCH_ROOT:
1915 if (!m->switch_root_init) {
1916 r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
1917 if (r < 0) {
1918 *ret_error_message = "Failed to prepare for reexecution";
1919 return r;
1920 }
1921 } else
1922 *ret_fds = NULL;
1923
1924 log_notice("Switching root.");
1925
1926 *ret_reexecute = true;
1927 *ret_retval = EXIT_SUCCESS;
1928 *ret_shutdown_verb = NULL;
1929
1930 /* Steal the switch root parameters */
1931 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
1932 *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
1933
1934 return 0;
1935
1936 case MANAGER_EXIT:
1937
1938 if (MANAGER_IS_USER(m)) {
1939 log_debug("Exit.");
1940
1941 *ret_reexecute = false;
1942 *ret_retval = m->return_value;
1943 *ret_shutdown_verb = NULL;
1944 *ret_fds = NULL;
1945 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1946
1947 return 0;
1948 }
1949
1950 _fallthrough_;
1951 case MANAGER_REBOOT:
1952 case MANAGER_POWEROFF:
1953 case MANAGER_HALT:
1954 case MANAGER_KEXEC: {
1955 static const char * const table[_MANAGER_OBJECTIVE_MAX] = {
1956 [MANAGER_EXIT] = "exit",
1957 [MANAGER_REBOOT] = "reboot",
1958 [MANAGER_POWEROFF] = "poweroff",
1959 [MANAGER_HALT] = "halt",
1960 [MANAGER_KEXEC] = "kexec",
1961 };
1962
1963 log_notice("Shutting down.");
1964
1965 *ret_reexecute = false;
1966 *ret_retval = m->return_value;
1967 assert_se(*ret_shutdown_verb = table[m->objective]);
1968 *ret_fds = NULL;
1969 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1970
1971 return 0;
1972 }
1973
1974 default:
1975 assert_not_reached("Unknown or unexpected manager objective.");
1976 }
1977 }
1978 }
1979
1980 static void log_execution_mode(bool *ret_first_boot) {
1981 assert(ret_first_boot);
1982
1983 if (arg_system) {
1984 int v;
1985
1986 log_info("systemd " GIT_VERSION " running in %ssystem mode. (%s)",
1987 arg_action == ACTION_TEST ? "test " : "",
1988 systemd_features);
1989
1990 v = detect_virtualization();
1991 if (v > 0)
1992 log_info("Detected virtualization %s.", virtualization_to_string(v));
1993
1994 log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
1995
1996 if (in_initrd()) {
1997 *ret_first_boot = false;
1998 log_info("Running in initial RAM disk.");
1999 } else {
2000 int r;
2001 _cleanup_free_ char *id_text = NULL;
2002
2003 /* Let's check whether we are in first boot. We use /etc/machine-id as flag file
2004 * for this: If it is missing or contains the value "uninitialized", this is the
2005 * first boot. In any other case, it is not. This allows container managers and
2006 * installers to provision a couple of files already. If the container manager
2007 * wants to provision the machine ID itself it should pass $container_uuid to PID 1. */
2008
2009 r = read_one_line_file("/etc/machine-id", &id_text);
2010 if (r < 0 || streq(id_text, "uninitialized")) {
2011 if (r < 0 && r != -ENOENT)
2012 log_warning_errno(r, "Unexpected error while reading /etc/machine-id, ignoring: %m");
2013
2014 *ret_first_boot = true;
2015 log_info("Detected first boot.");
2016 } else {
2017 *ret_first_boot = false;
2018 log_debug("Detected initialized system, this is not the first boot.");
2019 }
2020 }
2021 } else {
2022 if (DEBUG_LOGGING) {
2023 _cleanup_free_ char *t;
2024
2025 t = uid_to_name(getuid());
2026 log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
2027 arg_action == ACTION_TEST ? " test" : "",
2028 getuid(), strna(t), systemd_features);
2029 }
2030
2031 *ret_first_boot = false;
2032 }
2033 }
2034
2035 static int initialize_runtime(
2036 bool skip_setup,
2037 bool first_boot,
2038 struct rlimit *saved_rlimit_nofile,
2039 struct rlimit *saved_rlimit_memlock,
2040 const char **ret_error_message) {
2041 int r;
2042
2043 assert(ret_error_message);
2044
2045 /* Sets up various runtime parameters. Many of these initializations are conditionalized:
2046 *
2047 * - Some only apply to --system instances
2048 * - Some only apply to --user instances
2049 * - Some only apply when we first start up, but not when we reexecute
2050 */
2051
2052 if (arg_action != ACTION_RUN)
2053 return 0;
2054
2055 update_cpu_affinity(skip_setup);
2056 update_numa_policy(skip_setup);
2057
2058 if (arg_system) {
2059 /* Make sure we leave a core dump without panicking the kernel. */
2060 install_crash_handler();
2061
2062 if (!skip_setup) {
2063 r = mount_cgroup_controllers();
2064 if (r < 0) {
2065 *ret_error_message = "Failed to mount cgroup hierarchies";
2066 return r;
2067 }
2068
2069 status_welcome();
2070 (void) hostname_setup(true);
2071 /* Force transient machine-id on first boot. */
2072 machine_id_setup(NULL, first_boot, arg_machine_id, NULL);
2073 (void) loopback_setup();
2074 bump_unix_max_dgram_qlen();
2075 bump_file_max_and_nr_open();
2076 test_usr();
2077 write_container_id();
2078 }
2079
2080 if (arg_watchdog_device) {
2081 r = watchdog_set_device(arg_watchdog_device);
2082 if (r < 0)
2083 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
2084 }
2085 } else {
2086 _cleanup_free_ char *p = NULL;
2087
2088 /* Create the runtime directory and place the inaccessible device nodes there, if we run in
2089 * user mode. In system mode mount_setup() already did that. */
2090
2091 r = xdg_user_runtime_dir(&p, "/systemd");
2092 if (r < 0) {
2093 *ret_error_message = "$XDG_RUNTIME_DIR is not set";
2094 return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m");
2095 }
2096
2097 (void) mkdir_p_label(p, 0755);
2098 (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
2099 }
2100
2101 if (arg_timer_slack_nsec != NSEC_INFINITY)
2102 if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
2103 log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
2104
2105 if (arg_system && !cap_test_all(arg_capability_bounding_set)) {
2106 r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
2107 if (r < 0) {
2108 *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
2109 return log_emergency_errno(r, "Failed to drop capability bounding set of usermode helpers: %m");
2110 }
2111
2112 r = capability_bounding_set_drop(arg_capability_bounding_set, true);
2113 if (r < 0) {
2114 *ret_error_message = "Failed to drop capability bounding set";
2115 return log_emergency_errno(r, "Failed to drop capability bounding set: %m");
2116 }
2117 }
2118
2119 if (arg_system && arg_no_new_privs) {
2120 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2121 *ret_error_message = "Failed to disable new privileges";
2122 return log_emergency_errno(errno, "Failed to disable new privileges: %m");
2123 }
2124 }
2125
2126 if (arg_syscall_archs) {
2127 r = enforce_syscall_archs(arg_syscall_archs);
2128 if (r < 0) {
2129 *ret_error_message = "Failed to set syscall architectures";
2130 return r;
2131 }
2132 }
2133
2134 if (!arg_system)
2135 /* Become reaper of our children */
2136 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
2137 log_warning_errno(errno, "Failed to make us a subreaper: %m");
2138
2139 /* Bump up RLIMIT_NOFILE for systemd itself */
2140 (void) bump_rlimit_nofile(saved_rlimit_nofile);
2141 (void) bump_rlimit_memlock(saved_rlimit_memlock);
2142
2143 return 0;
2144 }
2145
2146 static int do_queue_default_job(
2147 Manager *m,
2148 const char **ret_error_message) {
2149
2150 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2151 const char *unit;
2152 Job *job;
2153 Unit *target;
2154 int r;
2155
2156 if (arg_default_unit)
2157 unit = arg_default_unit;
2158 else if (in_initrd())
2159 unit = SPECIAL_INITRD_TARGET;
2160 else
2161 unit = SPECIAL_DEFAULT_TARGET;
2162
2163 log_debug("Activating default unit: %s", unit);
2164
2165 r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
2166 if (r < 0 && in_initrd() && !arg_default_unit) {
2167 /* Fall back to default.target, which we used to always use by default. Only do this if no
2168 * explicit configuration was given. */
2169
2170 log_info("Falling back to " SPECIAL_DEFAULT_TARGET ".");
2171
2172 r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
2173 }
2174 if (r < 0) {
2175 log_info("Falling back to " SPECIAL_RESCUE_TARGET ".");
2176
2177 r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
2178 if (r < 0) {
2179 *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
2180 : "Failed to load " SPECIAL_RESCUE_TARGET;
2181 return r;
2182 }
2183 }
2184
2185 assert(target->load_state == UNIT_LOADED);
2186
2187 r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job);
2188 if (r == -EPERM) {
2189 log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
2190
2191 sd_bus_error_free(&error);
2192
2193 r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job);
2194 if (r < 0) {
2195 *ret_error_message = "Failed to start default target";
2196 return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r));
2197 }
2198
2199 } else if (r < 0) {
2200 *ret_error_message = "Failed to isolate default target";
2201 return log_emergency_errno(r, "Failed to isolate default target: %s", bus_error_message(&error, r));
2202 } else
2203 log_info("Queued %s job for default target %s.",
2204 job_type_to_string(job->type),
2205 unit_status_string(job->unit));
2206
2207 m->default_unit_job_id = job->id;
2208
2209 return 0;
2210 }
2211
/* Stores the current RLIMIT_NOFILE and RLIMIT_MEMLOCK settings of this process in the two supplied
 * structures. If a limit cannot be read a warning is logged and the respective structure is left as the
 * caller initialized it. */
static void save_rlimits(struct rlimit *saved_rlimit_nofile,
                         struct rlimit *saved_rlimit_memlock) {

        int k;

        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);

        k = getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");

        k = getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
}
2224
2225 static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
2226 struct rlimit *rl;
2227
2228 if (arg_default_rlimit[RLIMIT_NOFILE])
2229 return;
2230
2231 /* Make sure forked processes get limits based on the original kernel setting */
2232
2233 rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
2234 if (!rl) {
2235 log_oom();
2236 return;
2237 }
2238
2239 /* Bump the hard limit for system services to a substantially higher value. The default
2240 * hard limit current kernels set is pretty low (4K), mostly for historical
2241 * reasons. According to kernel developers, the fd handling in recent kernels has been
2242 * optimized substantially enough, so that we can bump the limit now, without paying too
2243 * high a price in memory or performance. Note however that we only bump the hard limit,
2244 * not the soft limit. That's because select() works the way it works, and chokes on fds
2245 * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
2246 * unexpecting programs that they get fds higher than what they can process using
2247 * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
2248 * this pitfall: programs that are written by folks aware of the select() problem in mind
2249 * (and thus use poll()/epoll instead of select(), the way everybody should) can
2250 * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
2251 * we pass. */
2252 if (arg_system) {
2253 int nr;
2254
2255 /* Get the underlying absolute limit the kernel enforces */
2256 nr = read_nr_open();
2257
2258 rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
2259 }
2260
2261 /* If for some reason we were invoked with a soft limit above 1024 (which should never
2262 * happen!, but who knows what we get passed in from pam_limit when invoked as --user
2263 * instance), then lower what we pass on to not confuse our children */
2264 rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
2265
2266 arg_default_rlimit[RLIMIT_NOFILE] = rl;
2267 }
2268
2269 static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
2270 struct rlimit *rl;
2271
2272 /* Pass the original value down to invoked processes */
2273
2274 if (arg_default_rlimit[RLIMIT_MEMLOCK])
2275 return;
2276
2277 rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
2278 if (!rl) {
2279 log_oom();
2280 return;
2281 }
2282
2283 arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
2284 }
2285
2286 static void reset_arguments(void) {
2287 /* Frees/resets arg_* variables, with a few exceptions commented below. */
2288
2289 arg_default_unit = mfree(arg_default_unit);
2290
2291 /* arg_system — ignore */
2292
2293 arg_dump_core = true;
2294 arg_crash_chvt = -1;
2295 arg_crash_shell = false;
2296 arg_crash_reboot = false;
2297 arg_confirm_spawn = mfree(arg_confirm_spawn);
2298 arg_show_status = _SHOW_STATUS_INVALID;
2299 arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
2300 arg_switched_root = false;
2301 arg_pager_flags = 0;
2302 arg_service_watchdogs = true;
2303 arg_default_std_output = EXEC_OUTPUT_JOURNAL;
2304 arg_default_std_error = EXEC_OUTPUT_INHERIT;
2305 arg_default_restart_usec = DEFAULT_RESTART_USEC;
2306 arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
2307 arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
2308 arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC;
2309 arg_default_timeout_abort_set = false;
2310 arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
2311 arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
2312 arg_runtime_watchdog = 0;
2313 arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
2314 arg_kexec_watchdog = 0;
2315 arg_early_core_pattern = NULL;
2316 arg_watchdog_device = NULL;
2317
2318 arg_default_environment = strv_free(arg_default_environment);
2319 rlimit_free_all(arg_default_rlimit);
2320
2321 arg_capability_bounding_set = CAP_ALL;
2322 arg_no_new_privs = false;
2323 arg_timer_slack_nsec = NSEC_INFINITY;
2324 arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE;
2325
2326 arg_syscall_archs = set_free(arg_syscall_archs);
2327
2328 /* arg_serialization — ignore */
2329
2330 arg_default_cpu_accounting = -1;
2331 arg_default_io_accounting = false;
2332 arg_default_ip_accounting = false;
2333 arg_default_blockio_accounting = false;
2334 arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
2335 arg_default_tasks_accounting = true;
2336 arg_default_tasks_max = DEFAULT_TASKS_MAX;
2337 arg_machine_id = (sd_id128_t) {};
2338 arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
2339 arg_default_oom_policy = OOM_STOP;
2340
2341 cpu_set_reset(&arg_cpu_affinity);
2342 numa_policy_reset(&arg_numa_policy);
2343
2344 arg_random_seed = mfree(arg_random_seed);
2345 arg_random_seed_size = 0;
2346 arg_clock_usec = 0;
2347 }
2348
2349 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
2350 const struct rlimit *saved_rlimit_memlock) {
2351 int r;
2352
2353 assert(saved_rlimit_nofile);
2354 assert(saved_rlimit_memlock);
2355
2356 /* Assign configuration defaults */
2357 reset_arguments();
2358
2359 r = parse_config_file();
2360 if (r < 0)
2361 log_warning_errno(r, "Failed to parse config file, ignoring: %m");
2362
2363 if (arg_system) {
2364 r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
2365 if (r < 0)
2366 log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2367 }
2368
2369 /* Initialize some default rlimits for services if they haven't been configured */
2370 fallback_rlimit_nofile(saved_rlimit_nofile);
2371 fallback_rlimit_memlock(saved_rlimit_memlock);
2372
2373 /* Note that this also parses bits from the kernel command line, including "debug". */
2374 log_parse_environment();
2375
2376 /* Initialize the show status setting if it hasn't been set explicitly yet */
2377 if (arg_show_status == _SHOW_STATUS_INVALID)
2378 arg_show_status = SHOW_STATUS_YES;
2379
2380 return 0;
2381 }
2382
2383 static int safety_checks(void) {
2384
2385 if (getpid_cached() == 1 &&
2386 arg_action != ACTION_RUN)
2387 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2388 "Unsupported execution mode while PID 1.");
2389
2390 if (getpid_cached() == 1 &&
2391 !arg_system)
2392 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2393 "Can't run --user mode as PID 1.");
2394
2395 if (arg_action == ACTION_RUN &&
2396 arg_system &&
2397 getpid_cached() != 1)
2398 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2399 "Can't run system mode unless PID 1.");
2400
2401 if (arg_action == ACTION_TEST &&
2402 geteuid() == 0)
2403 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2404 "Don't run test mode as root.");
2405
2406 if (!arg_system &&
2407 arg_action == ACTION_RUN &&
2408 sd_booted() <= 0)
2409 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2410 "Trying to run as user instance, but the system has not been booted with systemd.");
2411
2412 if (!arg_system &&
2413 arg_action == ACTION_RUN &&
2414 !getenv("XDG_RUNTIME_DIR"))
2415 return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
2416 "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
2417
2418 if (arg_system &&
2419 arg_action == ACTION_RUN &&
2420 running_in_chroot() > 0)
2421 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2422 "Cannot be run in a chroot() environment.");
2423
2424 return 0;
2425 }
2426
2427 static int initialize_security(
2428 bool *loaded_policy,
2429 dual_timestamp *security_start_timestamp,
2430 dual_timestamp *security_finish_timestamp,
2431 const char **ret_error_message) {
2432
2433 int r;
2434
2435 assert(loaded_policy);
2436 assert(security_start_timestamp);
2437 assert(security_finish_timestamp);
2438 assert(ret_error_message);
2439
2440 dual_timestamp_get(security_start_timestamp);
2441
2442 r = mac_selinux_setup(loaded_policy);
2443 if (r < 0) {
2444 *ret_error_message = "Failed to load SELinux policy";
2445 return r;
2446 }
2447
2448 r = mac_smack_setup(loaded_policy);
2449 if (r < 0) {
2450 *ret_error_message = "Failed to load SMACK policy";
2451 return r;
2452 }
2453
2454 r = mac_apparmor_setup();
2455 if (r < 0) {
2456 *ret_error_message = "Failed to load AppArmor policy";
2457 return r;
2458 }
2459
2460 r = ima_setup();
2461 if (r < 0) {
2462 *ret_error_message = "Failed to load IMA policy";
2463 return r;
2464 }
2465
2466 dual_timestamp_get(security_finish_timestamp);
2467 return 0;
2468 }
2469
2470 static void test_summary(Manager *m) {
2471 assert(m);
2472
2473 printf("-> By units:\n");
2474 manager_dump_units(m, stdout, "\t");
2475
2476 printf("-> By jobs:\n");
2477 manager_dump_jobs(m, stdout, "\t");
2478 }
2479
2480 static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
2481 int r;
2482
2483 assert(ret_fds);
2484 assert(ret_error_message);
2485
2486 r = fdset_new_fill(ret_fds);
2487 if (r < 0) {
2488 *ret_error_message = "Failed to allocate fd set";
2489 return log_emergency_errno(r, "Failed to allocate fd set: %m");
2490 }
2491
2492 fdset_cloexec(*ret_fds, true);
2493
2494 if (arg_serialization)
2495 assert_se(fdset_remove(*ret_fds, fileno(arg_serialization)) >= 0);
2496
2497 return 0;
2498 }
2499
2500 static void setup_console_terminal(bool skip_setup) {
2501
2502 if (!arg_system)
2503 return;
2504
2505 /* Become a session leader if we aren't one yet. */
2506 (void) setsid();
2507
2508 /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a controlling
2509 * tty. */
2510 (void) release_terminal();
2511
2512 /* Reset the console, but only if this is really init and we are freshly booted */
2513 if (getpid_cached() == 1 && !skip_setup)
2514 (void) console_setup();
2515 }
2516
/* Takes a quick look at the command line, long before it is parsed properly, to tell a reexecution
 * from a regular boot-up. Returns true, i.e. the extensive setup may be skipped, if "--deserialize" is
 * among the arguments and "--switched-root" is not: after switching root all the special setup is
 * wanted even though deserialization takes place as well. argv[0] is not inspected. */
static bool early_skip_setup_check(int argc, char *argv[]) {
        bool deserializing = false;

        for (int n = 1; n < argc; n++) {
                const char *a = argv[n];

                if (streq(a, "--switched-root"))
                        return false; /* Root was switched, the setup must not be skipped. */

                if (streq(a, "--deserialize"))
                        deserializing = true;
        }

        /* Deserializing means we are being reexecuted, hence the extensive setup can be avoided. */
        return deserializing;
}
2534
2535 static int save_env(void) {
2536 char **l;
2537
2538 l = strv_copy(environ);
2539 if (!l)
2540 return -ENOMEM;
2541
2542 strv_free_and_replace(saved_env, l);
2543 return 0;
2544 }
2545
2546 int main(int argc, char *argv[]) {
2547
2548 dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL,
2549 security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL;
2550 struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
2551 saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
2552 * in. Note we use different values
2553 * for the two that indicate whether
2554 * these fields are initialized! */
2555 bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false;
2556 char *switch_root_dir = NULL, *switch_root_init = NULL;
2557 usec_t before_startup, after_startup;
2558 static char systemd[] = "systemd";
2559 char timespan[FORMAT_TIMESPAN_MAX];
2560 const char *shutdown_verb = NULL, *error_message = NULL;
2561 int r, retval = EXIT_FAILURE;
2562 Manager *m = NULL;
2563 FDSet *fds = NULL;
2564
2565 /* SysV compatibility: redirect init → telinit */
2566 redirect_telinit(argc, argv);
2567
2568 /* Take timestamps early on */
2569 dual_timestamp_from_monotonic(&kernel_timestamp, 0);
2570 dual_timestamp_get(&userspace_timestamp);
2571
2572 /* Figure out whether we need to do initialize the system, or if we already did that because we are
2573 * reexecuting */
2574 skip_setup = early_skip_setup_check(argc, argv);
2575
2576 /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent reexecution we
2577 * are then called 'systemd'. That is confusing, hence let's call us systemd right-away. */
2578 program_invocation_short_name = systemd;
2579 (void) prctl(PR_SET_NAME, systemd);
2580
2581 /* Save the original command line */
2582 save_argc_argv(argc, argv);
2583
2584 /* Save the original environment as we might need to restore it if we're requested to execute another
2585 * system manager later. */
2586 r = save_env();
2587 if (r < 0) {
2588 error_message = "Failed to copy environment block";
2589 goto finish;
2590 }
2591
2592 /* Make sure that if the user says "syslog" we actually log to the journal. */
2593 log_set_upgrade_syslog_to_journal(true);
2594
2595 if (getpid_cached() == 1) {
2596 /* When we run as PID 1 force system mode */
2597 arg_system = true;
2598
2599 /* Disable the umask logic */
2600 umask(0);
2601
2602 /* Make sure that at least initially we do not ever log to journald/syslogd, because it might not be
2603 * activated yet (even though the log socket for it exists). */
2604 log_set_prohibit_ipc(true);
2605
2606 /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This is
2607 * important so that we never end up logging to any foreign stderr, for example if we have to log in a
2608 * child process right before execve()'ing the actual binary, at a point in time where socket
2609 * activation stderr/stdout area already set up. */
2610 log_set_always_reopen_console(true);
2611
2612 if (detect_container() <= 0) {
2613
2614 /* Running outside of a container as PID 1 */
2615 log_set_target(LOG_TARGET_KMSG);
2616 log_open();
2617
2618 if (in_initrd())
2619 initrd_timestamp = userspace_timestamp;
2620
2621 if (!skip_setup) {
2622 r = mount_setup_early();
2623 if (r < 0) {
2624 error_message = "Failed to mount early API filesystems";
2625 goto finish;
2626 }
2627
2628 /* Let's open the log backend a second time, in case the first time didn't
2629 * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
2630 * available, and it previously wasn't. */
2631 log_open();
2632
2633 disable_printk_ratelimit();
2634
2635 r = initialize_security(
2636 &loaded_policy,
2637 &security_start_timestamp,
2638 &security_finish_timestamp,
2639 &error_message);
2640 if (r < 0)
2641 goto finish;
2642 }
2643
2644 if (mac_selinux_init() < 0) {
2645 error_message = "Failed to initialize SELinux support";
2646 goto finish;
2647 }
2648
2649 if (!skip_setup)
2650 initialize_clock();
2651
2652 /* Set the default for later on, but don't actually open the logs like this for now. Note that
2653 * if we are transitioning from the initrd there might still be journal fd open, and we
2654 * shouldn't attempt opening that before we parsed /proc/cmdline which might redirect output
2655 * elsewhere. */
2656 log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
2657
2658 } else {
2659 /* Running inside a container, as PID 1 */
2660 log_set_target(LOG_TARGET_CONSOLE);
2661 log_open();
2662
2663 /* For later on, see above... */
2664 log_set_target(LOG_TARGET_JOURNAL);
2665
2666 /* clear the kernel timestamp, because we are in a container */
2667 kernel_timestamp = DUAL_TIMESTAMP_NULL;
2668 }
2669
2670 initialize_coredump(skip_setup);
2671
2672 r = fixup_environment();
2673 if (r < 0) {
2674 log_emergency_errno(r, "Failed to fix up PID 1 environment: %m");
2675 error_message = "Failed to fix up PID1 environment";
2676 goto finish;
2677 }
2678
2679 /* Try to figure out if we can use colors with the console. No need to do that for user instances since
2680 * they never log into the console. */
2681 log_show_color(colors_enabled());
2682
2683 r = make_null_stdio();
2684 if (r < 0)
2685 log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
2686
2687 /* Load the kernel modules early. */
2688 if (!skip_setup)
2689 kmod_setup();
2690
2691 /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
2692 r = mount_setup(loaded_policy, skip_setup);
2693 if (r < 0) {
2694 error_message = "Failed to mount API filesystems";
2695 goto finish;
2696 }
2697
2698 /* The efivarfs is now mounted, let's read the random seed off it */
2699 (void) efi_take_random_seed();
2700
2701 /* Cache command-line options passed from EFI variables */
2702 if (!skip_setup)
2703 (void) cache_efi_options_variable();
2704 } else {
2705 /* Running as user instance */
2706 arg_system = false;
2707 log_set_target(LOG_TARGET_AUTO);
2708 log_open();
2709
2710 /* clear the kernel timestamp, because we are not PID 1 */
2711 kernel_timestamp = DUAL_TIMESTAMP_NULL;
2712
2713 if (mac_selinux_init() < 0) {
2714 error_message = "Failed to initialize SELinux support";
2715 goto finish;
2716 }
2717 }
2718
2719 /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
2720 * transitioning from the initrd to the main systemd or suchlike. */
2721 save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
2722
2723 /* Reset all signal handlers. */
2724 (void) reset_all_signal_handlers();
2725 (void) ignore_signals(SIGNALS_IGNORE, -1);
2726
2727 (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
2728
2729 r = parse_argv(argc, argv);
2730 if (r < 0) {
2731 error_message = "Failed to parse commandline arguments";
2732 goto finish;
2733 }
2734
2735 r = safety_checks();
2736 if (r < 0)
2737 goto finish;
2738
2739 if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
2740 (void) pager_open(arg_pager_flags);
2741
2742 if (arg_action != ACTION_RUN)
2743 skip_setup = true;
2744
2745 if (arg_action == ACTION_HELP) {
2746 retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
2747 goto finish;
2748 } else if (arg_action == ACTION_VERSION) {
2749 retval = version();
2750 goto finish;
2751 } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
2752 unit_dump_config_items(stdout);
2753 retval = EXIT_SUCCESS;
2754 goto finish;
2755 } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
2756 dump_bus_properties(stdout);
2757 retval = EXIT_SUCCESS;
2758 goto finish;
2759 } else if (arg_action == ACTION_BUS_INTROSPECT) {
2760 r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
2761 retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
2762 goto finish;
2763 }
2764
2765 assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
2766
2767 /* Move out of the way, so that we won't block unmounts */
2768 assert_se(chdir("/") == 0);
2769
2770 if (arg_action == ACTION_RUN) {
2771 if (!skip_setup) {
2772 /* Apply the systemd.clock_usec= kernel command line switch */
2773 apply_clock_update();
2774
2775 /* Apply random seed from kernel command line */
2776 cmdline_take_random_seed();
2777 }
2778
2779 /* A core pattern might have been specified via the cmdline. */
2780 initialize_core_pattern(skip_setup);
2781
2782 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
2783 log_close();
2784
2785 /* Remember open file descriptors for later deserialization */
2786 r = collect_fds(&fds, &error_message);
2787 if (r < 0)
2788 goto finish;
2789
2790 /* Give up any control of the console, but make sure its initialized. */
2791 setup_console_terminal(skip_setup);
2792
2793 /* Open the logging devices, if possible and necessary */
2794 log_open();
2795 }
2796
2797 log_execution_mode(&first_boot);
2798
2799 r = initialize_runtime(skip_setup,
2800 first_boot,
2801 &saved_rlimit_nofile,
2802 &saved_rlimit_memlock,
2803 &error_message);
2804 if (r < 0)
2805 goto finish;
2806
2807 r = manager_new(arg_system ? UNIT_FILE_SYSTEM : UNIT_FILE_USER,
2808 arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
2809 &m);
2810 if (r < 0) {
2811 log_emergency_errno(r, "Failed to allocate manager object: %m");
2812 error_message = "Failed to allocate manager object";
2813 goto finish;
2814 }
2815
2816 m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
2817 m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
2818 m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
2819 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
2820 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
2821
2822 set_manager_defaults(m);
2823 set_manager_settings(m);
2824 manager_set_first_boot(m, first_boot);
2825
2826 /* Remember whether we should queue the default job */
2827 queue_default_job = !arg_serialization || arg_switched_root;
2828
2829 before_startup = now(CLOCK_MONOTONIC);
2830
2831 r = manager_startup(m, arg_serialization, fds);
2832 if (r < 0) {
2833 error_message = "Failed to start up manager";
2834 goto finish;
2835 }
2836
2837 /* This will close all file descriptors that were opened, but not claimed by any unit. */
2838 fds = fdset_free(fds);
2839 arg_serialization = safe_fclose(arg_serialization);
2840
2841 if (queue_default_job) {
2842 r = do_queue_default_job(m, &error_message);
2843 if (r < 0)
2844 goto finish;
2845 }
2846
2847 after_startup = now(CLOCK_MONOTONIC);
2848
2849 log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
2850 "Loaded units and determined initial transaction in %s.",
2851 format_timespan(timespan, sizeof(timespan), after_startup - before_startup, 100 * USEC_PER_MSEC));
2852
2853 if (arg_action == ACTION_TEST) {
2854 test_summary(m);
2855 retval = EXIT_SUCCESS;
2856 goto finish;
2857 }
2858
2859 (void) invoke_main_loop(m,
2860 &saved_rlimit_nofile,
2861 &saved_rlimit_memlock,
2862 &reexecute,
2863 &retval,
2864 &shutdown_verb,
2865 &fds,
2866 &switch_root_dir,
2867 &switch_root_init,
2868 &error_message);
2869
2870 finish:
2871 pager_close();
2872
2873 if (m) {
2874 arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
2875 arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
2876 m = manager_free(m);
2877 }
2878
2879 mac_selinux_finish();
2880
2881 if (reexecute)
2882 do_reexecute(argc, argv,
2883 &saved_rlimit_nofile,
2884 &saved_rlimit_memlock,
2885 fds,
2886 switch_root_dir,
2887 switch_root_init,
2888 &error_message); /* This only returns if reexecution failed */
2889
2890 arg_serialization = safe_fclose(arg_serialization);
2891 fds = fdset_free(fds);
2892
2893 saved_env = strv_free(saved_env);
2894
2895 #if HAVE_VALGRIND_VALGRIND_H
2896 /* If we are PID 1 and running under valgrind, then let's exit
2897 * here explicitly. valgrind will only generate nice output on
2898 * exit(), not on exec(), hence let's do the former not the
2899 * latter here. */
2900 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
2901 /* Cleanup watchdog_device strings for valgrind. We need them
2902 * in become_shutdown() so normally we cannot free them yet. */
2903 watchdog_free_device();
2904 arg_watchdog_device = mfree(arg_watchdog_device);
2905 reset_arguments();
2906 return retval;
2907 }
2908 #endif
2909
2910 #if HAS_FEATURE_ADDRESS_SANITIZER
2911 __lsan_do_leak_check();
2912 #endif
2913
2914 if (shutdown_verb) {
2915 r = become_shutdown(shutdown_verb, retval);
2916 log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
2917 error_message = "Failed to execute shutdown binary";
2918 }
2919
2920 watchdog_free_device();
2921 arg_watchdog_device = mfree(arg_watchdog_device);
2922
2923 if (getpid_cached() == 1) {
2924 if (error_message)
2925 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
2926 ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
2927 "%s.", error_message);
2928 freeze_or_exit_or_reboot();
2929 }
2930
2931 reset_arguments();
2932 return retval;
2933 }