]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/main.c
Merge pull request #18704 from keszybz/fallback-hostame-override
[thirdparty/systemd.git] / src / core / main.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <getopt.h>
6 #include <sys/mount.h>
7 #include <sys/prctl.h>
8 #include <sys/reboot.h>
9 #include <unistd.h>
10 #if HAVE_SECCOMP
11 #include <seccomp.h>
12 #endif
13 #if HAVE_VALGRIND_VALGRIND_H
14 #include <valgrind/valgrind.h>
15 #endif
16
17 #include "sd-bus.h"
18 #include "sd-daemon.h"
19 #include "sd-messages.h"
20
21 #include "alloc-util.h"
22 #include "apparmor-setup.h"
23 #include "architecture.h"
24 #include "build.h"
25 #include "bus-error.h"
26 #include "bus-util.h"
27 #include "capability-util.h"
28 #include "cgroup-util.h"
29 #include "clock-util.h"
30 #include "conf-parser.h"
31 #include "cpu-set-util.h"
32 #include "dbus-manager.h"
33 #include "dbus.h"
34 #include "def.h"
35 #include "dev-setup.h"
36 #include "efi-random.h"
37 #include "efivars.h"
38 #include "emergency-action.h"
39 #include "env-util.h"
40 #include "exit-status.h"
41 #include "fd-util.h"
42 #include "fdset.h"
43 #include "fileio.h"
44 #include "format-util.h"
45 #include "fs-util.h"
46 #include "hexdecoct.h"
47 #include "hostname-setup.h"
48 #include "ima-setup.h"
49 #include "killall.h"
50 #include "kmod-setup.h"
51 #include "limits-util.h"
52 #include "load-fragment.h"
53 #include "log.h"
54 #include "loopback-setup.h"
55 #include "machine-id-setup.h"
56 #include "manager.h"
57 #include "mkdir.h"
58 #include "mount-setup.h"
59 #include "os-util.h"
60 #include "pager.h"
61 #include "parse-argument.h"
62 #include "parse-util.h"
63 #include "path-util.h"
64 #include "pretty-print.h"
65 #include "proc-cmdline.h"
66 #include "process-util.h"
67 #include "random-util.h"
68 #include "raw-clone.h"
69 #include "rlimit-util.h"
70 #if HAVE_SECCOMP
71 #include "seccomp-util.h"
72 #endif
73 #include "selinux-setup.h"
74 #include "selinux-util.h"
75 #include "signal-util.h"
76 #include "smack-setup.h"
77 #include "special.h"
78 #include "stat-util.h"
79 #include "stdio-util.h"
80 #include "strv.h"
81 #include "switch-root.h"
82 #include "sysctl-util.h"
83 #include "terminal-util.h"
84 #include "umask-util.h"
85 #include "user-util.h"
86 #include "util.h"
87 #include "virt.h"
88 #include "watchdog.h"
89
90 #if HAS_FEATURE_ADDRESS_SANITIZER
91 #include <sanitizer/lsan_interface.h>
92 #endif
93
94 #define DEFAULT_TASKS_MAX ((TasksMax) { 15U, 100U }) /* 15% */
95
/* The top-level operation this invocation was asked to perform. The default is to run the manager
 * proper; parse_argv() selects one of the other values for options such as --test or --version. */
static enum {
        ACTION_RUN,                       /* the default */
        ACTION_HELP,
        ACTION_VERSION,
        ACTION_TEST,
        ACTION_DUMP_CONFIGURATION_ITEMS,
        ACTION_DUMP_BUS_PROPERTIES,
        ACTION_BUS_INTROSPECT,
} arg_action = ACTION_RUN;
105
106 static const char *arg_bus_introspect = NULL;
107
108 /* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
109 * defaults are assigned in reset_arguments() below. */
110 static char *arg_default_unit;
111 static bool arg_system;
112 static bool arg_dump_core;
113 static int arg_crash_chvt;
114 static bool arg_crash_shell;
115 static bool arg_crash_reboot;
116 static char *arg_confirm_spawn;
117 static ShowStatus arg_show_status;
118 static StatusUnitFormat arg_status_unit_format;
119 static bool arg_switched_root;
120 static PagerFlags arg_pager_flags;
121 static bool arg_service_watchdogs;
122 static ExecOutput arg_default_std_output;
123 static ExecOutput arg_default_std_error;
124 static usec_t arg_default_restart_usec;
125 static usec_t arg_default_timeout_start_usec;
126 static usec_t arg_default_timeout_stop_usec;
127 static usec_t arg_default_timeout_abort_usec;
128 static bool arg_default_timeout_abort_set;
129 static usec_t arg_default_start_limit_interval;
130 static unsigned arg_default_start_limit_burst;
131 static usec_t arg_runtime_watchdog;
132 static usec_t arg_reboot_watchdog;
133 static usec_t arg_kexec_watchdog;
134 static char *arg_early_core_pattern;
135 static char *arg_watchdog_device;
136 static char **arg_default_environment;
137 static char **arg_manager_environment;
138 static struct rlimit *arg_default_rlimit[_RLIMIT_MAX];
139 static uint64_t arg_capability_bounding_set;
140 static bool arg_no_new_privs;
141 static nsec_t arg_timer_slack_nsec;
142 static usec_t arg_default_timer_accuracy_usec;
143 static Set* arg_syscall_archs;
144 static FILE* arg_serialization;
145 static int arg_default_cpu_accounting;
146 static bool arg_default_io_accounting;
147 static bool arg_default_ip_accounting;
148 static bool arg_default_blockio_accounting;
149 static bool arg_default_memory_accounting;
150 static bool arg_default_tasks_accounting;
151 static TasksMax arg_default_tasks_max;
152 static sd_id128_t arg_machine_id;
153 static EmergencyAction arg_cad_burst_action;
154 static OOMPolicy arg_default_oom_policy;
155 static CPUSet arg_cpu_affinity;
156 static NUMAPolicy arg_numa_policy;
157 static usec_t arg_clock_usec;
158 static void *arg_random_seed;
159 static size_t arg_random_seed_size;
160
161 /* A copy of the original environment block */
162 static char **saved_env = NULL;
163
164 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
165 const struct rlimit *saved_rlimit_memlock);
166
167 static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
168 _cleanup_free_ char *base = NULL;
169 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
170 int r;
171
172 r = xdg_user_config_dir(&base, "/systemd");
173 if (r < 0)
174 return r;
175
176 r = strv_extendf(&files, "%s/user.conf", base);
177 if (r < 0)
178 return r;
179
180 r = strv_extend(&files, PKGSYSCONFDIR "/user.conf");
181 if (r < 0)
182 return r;
183
184 r = strv_consume(&dirs, TAKE_PTR(base));
185 if (r < 0)
186 return r;
187
188 r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false);
189 if (r < 0)
190 return r;
191
192 *ret_files = TAKE_PTR(files);
193 *ret_dirs = TAKE_PTR(dirs);
194 return 0;
195 }
196
197 _noreturn_ static void freeze_or_exit_or_reboot(void) {
198
199 /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
200 * the container manager, and thus inform it that something went wrong. */
201 if (detect_container() > 0) {
202 log_emergency("Exiting PID 1...");
203 _exit(EXIT_EXCEPTION);
204 }
205
206 if (arg_crash_reboot) {
207 log_notice("Rebooting in 10s...");
208 (void) sleep(10);
209
210 log_notice("Rebooting now...");
211 (void) reboot(RB_AUTOBOOT);
212 log_emergency_errno(errno, "Failed to reboot: %m");
213 }
214
215 log_emergency("Freezing execution.");
216 freeze();
217 }
218
219 _noreturn_ static void crash(int sig) {
220 struct sigaction sa;
221 pid_t pid;
222
223 if (getpid_cached() != 1)
224 /* Pass this on immediately, if this is not PID 1 */
225 (void) raise(sig);
226 else if (!arg_dump_core)
227 log_emergency("Caught <%s>, not dumping core.", signal_to_string(sig));
228 else {
229 sa = (struct sigaction) {
230 .sa_handler = nop_signal_handler,
231 .sa_flags = SA_NOCLDSTOP|SA_RESTART,
232 };
233
234 /* We want to wait for the core process, hence let's enable SIGCHLD */
235 (void) sigaction(SIGCHLD, &sa, NULL);
236
237 pid = raw_clone(SIGCHLD);
238 if (pid < 0)
239 log_emergency_errno(errno, "Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig));
240 else if (pid == 0) {
241 /* Enable default signal handler for core dump */
242
243 sa = (struct sigaction) {
244 .sa_handler = SIG_DFL,
245 };
246 (void) sigaction(sig, &sa, NULL);
247
248 /* Don't limit the coredump size */
249 (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));
250
251 /* Just to be sure... */
252 (void) chdir("/");
253
254 /* Raise the signal again */
255 pid = raw_getpid();
256 (void) kill(pid, sig); /* raise() would kill the parent */
257
258 assert_not_reached("We shouldn't be here...");
259 _exit(EXIT_EXCEPTION);
260 } else {
261 siginfo_t status;
262 int r;
263
264 /* Order things nicely. */
265 r = wait_for_terminate(pid, &status);
266 if (r < 0)
267 log_emergency_errno(r, "Caught <%s>, waitpid() failed: %m", signal_to_string(sig));
268 else if (status.si_code != CLD_DUMPED) {
269 const char *s = status.si_code == CLD_EXITED
270 ? exit_status_to_string(status.si_status, EXIT_STATUS_LIBC)
271 : signal_to_string(status.si_status);
272
273 log_emergency("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).",
274 signal_to_string(sig),
275 pid,
276 sigchld_code_to_string(status.si_code),
277 status.si_status, strna(s));
278 } else
279 log_emergency("Caught <%s>, dumped core as pid "PID_FMT".",
280 signal_to_string(sig), pid);
281 }
282 }
283
284 if (arg_crash_chvt >= 0)
285 (void) chvt(arg_crash_chvt);
286
287 sa = (struct sigaction) {
288 .sa_handler = SIG_IGN,
289 .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART,
290 };
291
292 /* Let the kernel reap children for us */
293 (void) sigaction(SIGCHLD, &sa, NULL);
294
295 if (arg_crash_shell) {
296 log_notice("Executing crash shell in 10s...");
297 (void) sleep(10);
298
299 pid = raw_clone(SIGCHLD);
300 if (pid < 0)
301 log_emergency_errno(errno, "Failed to fork off crash shell: %m");
302 else if (pid == 0) {
303 (void) setsid();
304 (void) make_console_stdio();
305 (void) rlimit_nofile_safe();
306 (void) execle("/bin/sh", "/bin/sh", NULL, environ);
307
308 log_emergency_errno(errno, "execle() failed: %m");
309 _exit(EXIT_EXCEPTION);
310 } else {
311 log_info("Spawned crash shell as PID "PID_FMT".", pid);
312 (void) wait_for_terminate(pid, NULL);
313 }
314 }
315
316 freeze_or_exit_or_reboot();
317 }
318
319 static void install_crash_handler(void) {
320 static const struct sigaction sa = {
321 .sa_handler = crash,
322 .sa_flags = SA_NODEFER, /* So that we can raise the signal again from the signal handler */
323 };
324 int r;
325
326 /* We ignore the return value here, since, we don't mind if we
327 * cannot set up a crash handler */
328 r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER, -1);
329 if (r < 0)
330 log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m");
331 }
332
333 static int console_setup(void) {
334 _cleanup_close_ int tty_fd = -1;
335 int r;
336
337 tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
338 if (tty_fd < 0)
339 return log_error_errno(tty_fd, "Failed to open /dev/console: %m");
340
341 /* We don't want to force text mode. plymouth may be showing
342 * pictures already from initrd. */
343 r = reset_terminal_fd(tty_fd, false);
344 if (r < 0)
345 return log_error_errno(r, "Failed to reset /dev/console: %m");
346
347 return 0;
348 }
349
350 static int set_machine_id(const char *m) {
351 sd_id128_t t;
352 assert(m);
353
354 if (sd_id128_from_string(m, &t) < 0)
355 return -EINVAL;
356
357 if (sd_id128_is_null(t))
358 return -EINVAL;
359
360 arg_machine_id = t;
361 return 0;
362 }
363
364 static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
365 int r;
366
367 assert(key);
368
369 if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
370
371 if (proc_cmdline_value_missing(key, value))
372 return 0;
373
374 if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
375 log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
376 else if (in_initrd() == !!startswith(key, "rd."))
377 return free_and_strdup_warn(&arg_default_unit, value);
378
379 } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
380
381 r = value ? parse_boolean(value) : true;
382 if (r < 0)
383 log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
384 else
385 arg_dump_core = r;
386
387 } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
388
389 if (proc_cmdline_value_missing(key, value))
390 return 0;
391
392 if (path_is_absolute(value))
393 (void) parse_path_argument(value, false, &arg_early_core_pattern);
394 else
395 log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
396
397 } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
398
399 if (!value)
400 arg_crash_chvt = 0; /* turn on */
401 else {
402 r = parse_crash_chvt(value, &arg_crash_chvt);
403 if (r < 0)
404 log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
405 }
406
407 } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
408
409 r = value ? parse_boolean(value) : true;
410 if (r < 0)
411 log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
412 else
413 arg_crash_shell = r;
414
415 } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
416
417 r = value ? parse_boolean(value) : true;
418 if (r < 0)
419 log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
420 else
421 arg_crash_reboot = r;
422
423 } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
424 char *s;
425
426 r = parse_confirm_spawn(value, &s);
427 if (r < 0)
428 log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
429 else
430 free_and_replace(arg_confirm_spawn, s);
431
432 } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
433
434 r = value ? parse_boolean(value) : true;
435 if (r < 0)
436 log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
437 else
438 arg_service_watchdogs = r;
439
440 } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
441
442 if (value) {
443 r = parse_show_status(value, &arg_show_status);
444 if (r < 0)
445 log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
446 } else
447 arg_show_status = SHOW_STATUS_YES;
448
449 } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
450
451 if (proc_cmdline_value_missing(key, value))
452 return 0;
453
454 r = status_unit_format_from_string(value);
455 if (r < 0)
456 log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
457 else
458 arg_status_unit_format = r;
459
460 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
461
462 if (proc_cmdline_value_missing(key, value))
463 return 0;
464
465 r = exec_output_from_string(value);
466 if (r < 0)
467 log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
468 else
469 arg_default_std_output = r;
470
471 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
472
473 if (proc_cmdline_value_missing(key, value))
474 return 0;
475
476 r = exec_output_from_string(value);
477 if (r < 0)
478 log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
479 else
480 arg_default_std_error = r;
481
482 } else if (streq(key, "systemd.setenv")) {
483
484 if (proc_cmdline_value_missing(key, value))
485 return 0;
486
487 if (!env_assignment_is_valid(value))
488 log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
489 else {
490 r = strv_env_replace_strdup(&arg_default_environment, value);
491 if (r < 0)
492 return log_oom();
493 }
494
495 } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
496
497 if (proc_cmdline_value_missing(key, value))
498 return 0;
499
500 r = set_machine_id(value);
501 if (r < 0)
502 log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
503
504 } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
505
506 if (proc_cmdline_value_missing(key, value))
507 return 0;
508
509 r = parse_sec(value, &arg_default_timeout_start_usec);
510 if (r < 0)
511 log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
512
513 if (arg_default_timeout_start_usec <= 0)
514 arg_default_timeout_start_usec = USEC_INFINITY;
515
516 } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
517
518 if (proc_cmdline_value_missing(key, value))
519 return 0;
520
521 r = parse_cpu_set(value, &arg_cpu_affinity);
522 if (r < 0)
523 log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
524
525 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
526
527 if (proc_cmdline_value_missing(key, value))
528 return 0;
529
530 (void) parse_path_argument(value, false, &arg_watchdog_device);
531
532 } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
533
534 if (proc_cmdline_value_missing(key, value))
535 return 0;
536
537 r = safe_atou64(value, &arg_clock_usec);
538 if (r < 0)
539 log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
540
541 } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
542 void *p;
543 size_t sz;
544
545 if (proc_cmdline_value_missing(key, value))
546 return 0;
547
548 r = unbase64mem(value, (size_t) -1, &p, &sz);
549 if (r < 0)
550 log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
551
552 free(arg_random_seed);
553 arg_random_seed = sz > 0 ? p : mfree(p);
554 arg_random_seed_size = sz;
555
556 } else if (streq(key, "quiet") && !value) {
557
558 if (arg_show_status == _SHOW_STATUS_INVALID)
559 arg_show_status = SHOW_STATUS_ERROR;
560
561 } else if (streq(key, "debug") && !value) {
562
563 /* Note that log_parse_environment() handles 'debug'
564 * too, and sets the log level to LOG_DEBUG. */
565
566 if (detect_container() > 0)
567 log_set_target(LOG_TARGET_CONSOLE);
568
569 } else if (!value) {
570 const char *target;
571
572 /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
573 target = runlevel_to_target(key);
574 if (target)
575 return free_and_strdup_warn(&arg_default_unit, target);
576 }
577
578 return 0;
579 }
580
581 #define DEFINE_SETTER(name, func, descr) \
582 static int name(const char *unit, \
583 const char *filename, \
584 unsigned line, \
585 const char *section, \
586 unsigned section_line, \
587 const char *lvalue, \
588 int ltype, \
589 const char *rvalue, \
590 void *data, \
591 void *userdata) { \
592 \
593 int r; \
594 \
595 assert(filename); \
596 assert(lvalue); \
597 assert(rvalue); \
598 \
599 r = func(rvalue); \
600 if (r < 0) \
601 log_syntax(unit, LOG_ERR, filename, line, r, \
602 "Invalid " descr "'%s': %m", \
603 rvalue); \
604 \
605 return 0; \
606 }
607
608 DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
609 DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
610 DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
611 DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
612 DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
613
614 static int config_parse_default_timeout_abort(
615 const char *unit,
616 const char *filename,
617 unsigned line,
618 const char *section,
619 unsigned section_line,
620 const char *lvalue,
621 int ltype,
622 const char *rvalue,
623 void *data,
624 void *userdata) {
625 int r;
626
627 r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue,
628 &arg_default_timeout_abort_usec, userdata);
629 if (r >= 0)
630 arg_default_timeout_abort_set = r;
631 return 0;
632 }
633
634 static int parse_config_file(void) {
635 const ConfigTableItem items[] = {
636 { "Manager", "LogLevel", config_parse_level2, 0, NULL },
637 { "Manager", "LogTarget", config_parse_target, 0, NULL },
638 { "Manager", "LogColor", config_parse_color, 0, NULL },
639 { "Manager", "LogLocation", config_parse_location, 0, NULL },
640 { "Manager", "LogTime", config_parse_time, 0, NULL },
641 { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
642 { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
643 { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
644 { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
645 { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
646 { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
647 { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
648 { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
649 { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
650 { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy },
651 { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
652 { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog },
653 { "Manager", "RebootWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog },
654 { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
655 { "Manager", "KExecWatchdogSec", config_parse_sec, 0, &arg_kexec_watchdog },
656 { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
657 { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
658 { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
659 #if HAVE_SECCOMP
660 { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
661 #endif
662 { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
663 { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_default_timer_accuracy_usec },
664 { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_default_std_output },
665 { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_default_std_error },
666 { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec },
667 { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec },
668 { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
669 { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec },
670 { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */
671 { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_default_start_limit_interval },
672 { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_default_start_limit_burst },
673 { "Manager", "DefaultEnvironment", config_parse_environ, 0, &arg_default_environment },
674 { "Manager", "ManagerEnvironment", config_parse_environ, 0, &arg_manager_environment },
675 { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_default_rlimit },
676 { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_default_rlimit },
677 { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_default_rlimit },
678 { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_default_rlimit },
679 { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_default_rlimit },
680 { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_default_rlimit },
681 { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_default_rlimit },
682 { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_default_rlimit },
683 { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_default_rlimit },
684 { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_default_rlimit },
685 { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_default_rlimit },
686 { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_default_rlimit },
687 { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_default_rlimit },
688 { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit },
689 { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit },
690 { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit },
691 { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting },
692 { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting },
693 { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting },
694 { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting },
695 { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting },
696 { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
697 { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
698 { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action },
699 { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
700 {}
701 };
702
703 _cleanup_strv_free_ char **_free_files = NULL, **_free_dirs = NULL;
704
705 const char *const *files, *const *dirs, *suffix;
706 int r;
707
708 if (arg_system) {
709 files = STRV_MAKE_CONST(PKGSYSCONFDIR "/system.conf");
710 dirs = (const char* const*) CONF_PATHS_STRV("systemd");
711 suffix = "system.conf.d";
712 } else {
713 r = manager_find_user_config_paths(&_free_files, &_free_dirs);
714 if (r < 0)
715 return log_error_errno(r, "Failed to determine config file paths: %m");
716 files = (const char* const*) _free_files;
717 dirs = (const char* const*) _free_dirs;
718 suffix = "user.conf.d";
719 }
720
721 (void) config_parse_many(
722 files, dirs, suffix,
723 "Manager\0",
724 config_item_table_lookup, items,
725 CONFIG_PARSE_WARN,
726 NULL,
727 NULL);
728
729 /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
730 * USEC_INFINITY like everywhere else. */
731 if (arg_default_timeout_start_usec <= 0)
732 arg_default_timeout_start_usec = USEC_INFINITY;
733 if (arg_default_timeout_stop_usec <= 0)
734 arg_default_timeout_stop_usec = USEC_INFINITY;
735
736 return 0;
737 }
738
739 static void set_manager_defaults(Manager *m) {
740
741 assert(m);
742
743 /* Propagates the various default unit property settings into the manager object, i.e. properties that do not
744 * affect the manager itself, but are just what newly allocated units will have set if they haven't set
745 * anything else. (Also see set_manager_settings() for the settings that affect the manager's own behaviour) */
746
747 m->default_timer_accuracy_usec = arg_default_timer_accuracy_usec;
748 m->default_std_output = arg_default_std_output;
749 m->default_std_error = arg_default_std_error;
750 m->default_timeout_start_usec = arg_default_timeout_start_usec;
751 m->default_timeout_stop_usec = arg_default_timeout_stop_usec;
752 m->default_timeout_abort_usec = arg_default_timeout_abort_usec;
753 m->default_timeout_abort_set = arg_default_timeout_abort_set;
754 m->default_restart_usec = arg_default_restart_usec;
755 m->default_start_limit_interval = arg_default_start_limit_interval;
756 m->default_start_limit_burst = arg_default_start_limit_burst;
757
758 /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU
759 * controller to be enabled, so the default is to enable it unless we got told otherwise. */
760 if (arg_default_cpu_accounting >= 0)
761 m->default_cpu_accounting = arg_default_cpu_accounting;
762 else
763 m->default_cpu_accounting = cpu_accounting_is_cheap();
764
765 m->default_io_accounting = arg_default_io_accounting;
766 m->default_ip_accounting = arg_default_ip_accounting;
767 m->default_blockio_accounting = arg_default_blockio_accounting;
768 m->default_memory_accounting = arg_default_memory_accounting;
769 m->default_tasks_accounting = arg_default_tasks_accounting;
770 m->default_tasks_max = arg_default_tasks_max;
771 m->default_oom_policy = arg_default_oom_policy;
772
773 (void) manager_set_default_rlimits(m, arg_default_rlimit);
774
775 (void) manager_default_environment(m);
776 (void) manager_transient_environment_add(m, arg_default_environment);
777 }
778
779 static void set_manager_settings(Manager *m) {
780
781 assert(m);
782
783 /* Propagates the various manager settings into the manager object, i.e. properties that
784 * effect the manager itself (as opposed to just being inherited into newly allocated
785 * units, see set_manager_defaults() above). */
786
787 m->confirm_spawn = arg_confirm_spawn;
788 m->service_watchdogs = arg_service_watchdogs;
789 m->cad_burst_action = arg_cad_burst_action;
790
791 manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
792 manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
793 manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
794
795 manager_set_show_status(m, arg_show_status, "commandline");
796 m->status_unit_format = arg_status_unit_format;
797 }
798
799 static int parse_argv(int argc, char *argv[]) {
800 enum {
801 ARG_LOG_LEVEL = 0x100,
802 ARG_LOG_TARGET,
803 ARG_LOG_COLOR,
804 ARG_LOG_LOCATION,
805 ARG_LOG_TIME,
806 ARG_UNIT,
807 ARG_SYSTEM,
808 ARG_USER,
809 ARG_TEST,
810 ARG_NO_PAGER,
811 ARG_VERSION,
812 ARG_DUMP_CONFIGURATION_ITEMS,
813 ARG_DUMP_BUS_PROPERTIES,
814 ARG_BUS_INTROSPECT,
815 ARG_DUMP_CORE,
816 ARG_CRASH_CHVT,
817 ARG_CRASH_SHELL,
818 ARG_CRASH_REBOOT,
819 ARG_CONFIRM_SPAWN,
820 ARG_SHOW_STATUS,
821 ARG_DESERIALIZE,
822 ARG_SWITCHED_ROOT,
823 ARG_DEFAULT_STD_OUTPUT,
824 ARG_DEFAULT_STD_ERROR,
825 ARG_MACHINE_ID,
826 ARG_SERVICE_WATCHDOGS,
827 };
828
829 static const struct option options[] = {
830 { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
831 { "log-target", required_argument, NULL, ARG_LOG_TARGET },
832 { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
833 { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
834 { "log-time", optional_argument, NULL, ARG_LOG_TIME },
835 { "unit", required_argument, NULL, ARG_UNIT },
836 { "system", no_argument, NULL, ARG_SYSTEM },
837 { "user", no_argument, NULL, ARG_USER },
838 { "test", no_argument, NULL, ARG_TEST },
839 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
840 { "help", no_argument, NULL, 'h' },
841 { "version", no_argument, NULL, ARG_VERSION },
842 { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS },
843 { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES },
844 { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT },
845 { "dump-core", optional_argument, NULL, ARG_DUMP_CORE },
846 { "crash-chvt", required_argument, NULL, ARG_CRASH_CHVT },
847 { "crash-shell", optional_argument, NULL, ARG_CRASH_SHELL },
848 { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT },
849 { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN },
850 { "show-status", optional_argument, NULL, ARG_SHOW_STATUS },
851 { "deserialize", required_argument, NULL, ARG_DESERIALIZE },
852 { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT },
853 { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, },
854 { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, },
855 { "machine-id", required_argument, NULL, ARG_MACHINE_ID },
856 { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS },
857 {}
858 };
859
860 int c, r;
861 bool user_arg_seen = false;
862
863 assert(argc >= 1);
864 assert(argv);
865
866 if (getpid_cached() == 1)
867 opterr = 0;
868
869 while ((c = getopt_long(argc, argv, "hDbsz:", options, NULL)) >= 0)
870
871 switch (c) {
872
873 case ARG_LOG_LEVEL:
874 r = log_set_max_level_from_string(optarg);
875 if (r < 0)
876 return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
877
878 break;
879
880 case ARG_LOG_TARGET:
881 r = log_set_target_from_string(optarg);
882 if (r < 0)
883 return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
884
885 break;
886
887 case ARG_LOG_COLOR:
888
889 if (optarg) {
890 r = log_show_color_from_string(optarg);
891 if (r < 0)
892 return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
893 optarg);
894 } else
895 log_show_color(true);
896
897 break;
898
899 case ARG_LOG_LOCATION:
900 if (optarg) {
901 r = log_show_location_from_string(optarg);
902 if (r < 0)
903 return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
904 optarg);
905 } else
906 log_show_location(true);
907
908 break;
909
910 case ARG_LOG_TIME:
911
912 if (optarg) {
913 r = log_show_time_from_string(optarg);
914 if (r < 0)
915 return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
916 optarg);
917 } else
918 log_show_time(true);
919
920 break;
921
922 case ARG_DEFAULT_STD_OUTPUT:
923 r = exec_output_from_string(optarg);
924 if (r < 0)
925 return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
926 optarg);
927 arg_default_std_output = r;
928 break;
929
930 case ARG_DEFAULT_STD_ERROR:
931 r = exec_output_from_string(optarg);
932 if (r < 0)
933 return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
934 optarg);
935 arg_default_std_error = r;
936 break;
937
938 case ARG_UNIT:
939 r = free_and_strdup(&arg_default_unit, optarg);
940 if (r < 0)
941 return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
942
943 break;
944
945 case ARG_SYSTEM:
946 arg_system = true;
947 break;
948
949 case ARG_USER:
950 arg_system = false;
951 user_arg_seen = true;
952 break;
953
954 case ARG_TEST:
955 arg_action = ACTION_TEST;
956 break;
957
958 case ARG_NO_PAGER:
959 arg_pager_flags |= PAGER_DISABLE;
960 break;
961
962 case ARG_VERSION:
963 arg_action = ACTION_VERSION;
964 break;
965
966 case ARG_DUMP_CONFIGURATION_ITEMS:
967 arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
968 break;
969
970 case ARG_DUMP_BUS_PROPERTIES:
971 arg_action = ACTION_DUMP_BUS_PROPERTIES;
972 break;
973
974 case ARG_BUS_INTROSPECT:
975 arg_bus_introspect = optarg;
976 arg_action = ACTION_BUS_INTROSPECT;
977 break;
978
979 case ARG_DUMP_CORE:
980 r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
981 if (r < 0)
982 return r;
983 break;
984
985 case ARG_CRASH_CHVT:
986 r = parse_crash_chvt(optarg, &arg_crash_chvt);
987 if (r < 0)
988 return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
989 optarg);
990 break;
991
992 case ARG_CRASH_SHELL:
993 r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
994 if (r < 0)
995 return r;
996 break;
997
998 case ARG_CRASH_REBOOT:
999 r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot);
1000 if (r < 0)
1001 return r;
1002 break;
1003
1004 case ARG_CONFIRM_SPAWN:
1005 arg_confirm_spawn = mfree(arg_confirm_spawn);
1006
1007 r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
1008 if (r < 0)
1009 return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
1010 optarg);
1011 break;
1012
1013 case ARG_SERVICE_WATCHDOGS:
1014 r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
1015 if (r < 0)
1016 return r;
1017 break;
1018
1019 case ARG_SHOW_STATUS:
1020 if (optarg) {
1021 r = parse_show_status(optarg, &arg_show_status);
1022 if (r < 0)
1023 return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
1024 optarg);
1025 } else
1026 arg_show_status = SHOW_STATUS_YES;
1027 break;
1028
1029 case ARG_DESERIALIZE: {
1030 int fd;
1031 FILE *f;
1032
1033 r = safe_atoi(optarg, &fd);
1034 if (r < 0)
1035 log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg);
1036 if (fd < 0)
1037 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1038 "Invalid deserialize fd: %d",
1039 fd);
1040
1041 (void) fd_cloexec(fd, true);
1042
1043 f = fdopen(fd, "r");
1044 if (!f)
1045 return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
1046
1047 safe_fclose(arg_serialization);
1048 arg_serialization = f;
1049
1050 break;
1051 }
1052
1053 case ARG_SWITCHED_ROOT:
1054 arg_switched_root = true;
1055 break;
1056
1057 case ARG_MACHINE_ID:
1058 r = set_machine_id(optarg);
1059 if (r < 0)
1060 return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
1061 break;
1062
1063 case 'h':
1064 arg_action = ACTION_HELP;
1065 break;
1066
1067 case 'D':
1068 log_set_max_level(LOG_DEBUG);
1069 break;
1070
1071 case 'b':
1072 case 's':
1073 case 'z':
1074 /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
1075 * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
1076 */
1077 case '?':
1078 if (getpid_cached() != 1)
1079 return -EINVAL;
1080 else
1081 return 0;
1082
1083 default:
1084 assert_not_reached("Unhandled option code.");
1085 }
1086
1087 if (optind < argc && getpid_cached() != 1)
1088 /* Hmm, when we aren't run as init system let's complain about excess arguments */
1089 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
1090
1091 if (arg_action == ACTION_RUN && !arg_system && !user_arg_seen)
1092 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1093 "Explicit --user argument required to run as user manager.");
1094
1095 return 0;
1096 }
1097
1098 static int help(void) {
1099 _cleanup_free_ char *link = NULL;
1100 int r;
1101
1102 r = terminal_urlify_man("systemd", "1", &link);
1103 if (r < 0)
1104 return log_oom();
1105
1106 printf("%s [OPTIONS...]\n\n"
1107 "%sStarts and monitors system and user services.%s\n\n"
1108 "This program takes no positional arguments.\n\n"
1109 "%sOptions%s:\n"
1110 " -h --help Show this help\n"
1111 " --version Show version\n"
1112 " --test Determine initial transaction, dump it and exit\n"
1113 " --system In combination with --test: operate as system service manager\n"
1114 " --user In combination with --test: operate as per-user service manager\n"
1115 " --no-pager Do not pipe output into a pager\n"
1116 " --dump-configuration-items Dump understood unit configuration items\n"
1117 " --dump-bus-properties Dump exposed bus properties\n"
1118 " --bus-introspect=PATH Write XML introspection data\n"
1119 " --unit=UNIT Set default unit\n"
1120 " --dump-core[=BOOL] Dump core on crash\n"
1121 " --crash-vt=NR Change to specified VT on crash\n"
1122 " --crash-reboot[=BOOL] Reboot on crash\n"
1123 " --crash-shell[=BOOL] Run shell on crash\n"
1124 " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
1125 " --show-status[=BOOL] Show status updates on the console during bootup\n"
1126 " --log-target=TARGET Set log target (console, journal, kmsg, journal-or-kmsg, null)\n"
1127 " --log-level=LEVEL Set log level (debug, info, notice, warning, err, crit, alert, emerg)\n"
1128 " --log-color[=BOOL] Highlight important log messages\n"
1129 " --log-location[=BOOL] Include code location in log messages\n"
1130 " --log-time[=BOOL] Prefix log messages with current time\n"
1131 " --default-standard-output= Set default standard output for services\n"
1132 " --default-standard-error= Set default standard error output for services\n"
1133 "\nSee the %s for details.\n",
1134 program_invocation_short_name,
1135 ansi_highlight(),
1136 ansi_normal(),
1137 ansi_underline(),
1138 ansi_normal(),
1139 link);
1140
1141 return 0;
1142 }
1143
1144 static int prepare_reexecute(
1145 Manager *m,
1146 FILE **ret_f,
1147 FDSet **ret_fds,
1148 bool switching_root) {
1149
1150 _cleanup_fdset_free_ FDSet *fds = NULL;
1151 _cleanup_fclose_ FILE *f = NULL;
1152 int r;
1153
1154 assert(m);
1155 assert(ret_f);
1156 assert(ret_fds);
1157
1158 r = manager_open_serialization(m, &f);
1159 if (r < 0)
1160 return log_error_errno(r, "Failed to create serialization file: %m");
1161
1162 /* Make sure nothing is really destructed when we shut down */
1163 m->n_reloading++;
1164 bus_manager_send_reloading(m, true);
1165
1166 fds = fdset_new();
1167 if (!fds)
1168 return log_oom();
1169
1170 r = manager_serialize(m, f, fds, switching_root);
1171 if (r < 0)
1172 return r;
1173
1174 if (fseeko(f, 0, SEEK_SET) == (off_t) -1)
1175 return log_error_errno(errno, "Failed to rewind serialization fd: %m");
1176
1177 r = fd_cloexec(fileno(f), false);
1178 if (r < 0)
1179 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
1180
1181 r = fdset_cloexec(fds, false);
1182 if (r < 0)
1183 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
1184
1185 *ret_f = TAKE_PTR(f);
1186 *ret_fds = TAKE_PTR(fds);
1187
1188 return 0;
1189 }
1190
static void bump_file_max_and_nr_open(void) {

        /* Raise the fs.file-max and fs.nr_open sysctls as far as the kernel lets us. On current kernels
         * large numbers of file descriptors are no longer a performance problem and their memory is
         * properly tracked by memcg, hence counting and limiting them in another two layers of limits is
         * unnecessary and just complicates things. This function thus turns off 2 of the 4 levels of
         * limits on file descriptors, and makes RLIMIT_NOFILE (soft + hard) the only ones that really
         * matter. All failures are logged and otherwise ignored. */

#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
        int r;
#endif

#if BUMP_PROC_SYS_FS_FILE_MAX
        /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously things
         * were different but the operation would fail silently.) */
        r = sysctl_writef("fs/file-max", "%li\n", LONG_MAX);
        if (r < 0)
                log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m");
#endif

#if BUMP_PROC_SYS_FS_NR_OPEN
        int candidate = INT_MAX;

        /* The kernel enforces maximum and minimum values on fs.nr_open, but we don't really know what
         * they are. The expression by which the maximum is determined depends on the architecture and
         * on kernel implementation details we don't want to copy to userspace. Since the kernel doesn't
         * expose the maximum to us, we can only try and hope: start with INT_MAX and keep halving the
         * value until we find one that is accepted. */

        while (true) {
                int cur_limit;

                /* Round down to next multiple of the pointer size */
                candidate &= ~(__SIZEOF_POINTER__ - 1);
                if (candidate < 1024) {
                        log_warning("Can't bump fs.nr_open, value too small.");
                        break;
                }

                cur_limit = read_nr_open();
                if (cur_limit < 0) {
                        log_error_errno(cur_limit, "Failed to read fs.nr_open: %m");
                        break;
                }
                if (cur_limit >= candidate) {
                        /* What is configured is already at least as large as what we'd write */
                        log_debug("Skipping bump, value is already larger.");
                        break;
                }

                r = sysctl_writef("fs/nr_open", "%i\n", candidate);
                if (r == -EINVAL) {
                        /* Rejected as out of range, retry with a smaller value */
                        log_debug("Couldn't write fs.nr_open as %i, halving it.", candidate);
                        candidate /= 2;
                        continue;
                }

                if (r >= 0)
                        log_debug("Successfully bumped fs.nr_open to %i", candidate);
                else
                        log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");

                break;
        }
#endif
}
1256
/* Raises our own RLIMIT_NOFILE (soft and hard) to the value reported by read_nr_open(), but never
 * lowers it below the inherited limits passed in saved_rlimit.
 *
 * Returns 0 if nothing had to be changed or the limit was bumped, otherwise the result of
 * log_warning_errno() for the failed setrlimit_closest() call. */
static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
        int r, nr;

        /* Get the underlying absolute limit the kernel enforces */
        nr = read_nr_open();

        /* Calculate the new limits to use for us. Never lower from what we inherited. */
        struct rlimit bumped = {
                .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur),
                .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max),
        };

        if (saved_rlimit->rlim_max < bumped.rlim_max ||
            saved_rlimit->rlim_cur < bumped.rlim_cur) {

                /* Bump up the resource limit for ourselves substantially, all the way to the maximum
                 * the kernel allows, for both hard and soft. */
                r = setrlimit_closest(RLIMIT_NOFILE, &bumped);
                if (r < 0)
                        return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");

                return 0;
        }

        /* Shortcut: the inherited limits already cover what we computed. */
        log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
        return 0;
}
1285
1286 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
1287 struct rlimit new_rlimit;
1288 uint64_t mm;
1289 int r;
1290
1291 /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK which should
1292 * normally disable such checks. We need them to implement IPAddressAllow= and IPAddressDeny=, hence let's bump
1293 * the value high enough for our user. */
1294
1295 /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
1296 * must be unsigned, hence this is a given, but let's make this clear here. */
1297 assert_cc(RLIM_INFINITY > 0);
1298
1299 mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of physical
1300 * RAM. We allow an eighth to be locked by us, just to pick a value. */
1301
1302 new_rlimit = (struct rlimit) {
1303 .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
1304 .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
1305 };
1306
1307 if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
1308 saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
1309 log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
1310 return 0;
1311 }
1312
1313 r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
1314 if (r < 0)
1315 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
1316
1317 return 0;
1318 }
1319
static void test_usr(void) {

        /* Warn if /usr looks like a separate file system that has not been mounted yet, i.e. if the
         * directory is reported as empty. Any other result (populated, or an error) is silently
         * accepted. */

        if (dir_is_empty("/usr") > 0)
                log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
                            "Some things will probably break (sometimes even silently) in mysterious ways. "
                            "Consult http://freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
}
1331
1332 static int enforce_syscall_archs(Set *archs) {
1333 #if HAVE_SECCOMP
1334 int r;
1335
1336 if (!is_seccomp_available())
1337 return 0;
1338
1339 r = seccomp_restrict_archs(arg_syscall_archs);
1340 if (r < 0)
1341 return log_error_errno(r, "Failed to enforce system call architecture restrication: %m");
1342 #endif
1343 return 0;
1344 }
1345
1346 static int status_welcome(void) {
1347 _cleanup_free_ char *pretty_name = NULL, *ansi_color = NULL;
1348 int r;
1349
1350 if (!show_status_on(arg_show_status))
1351 return 0;
1352
1353 r = parse_os_release(NULL,
1354 "PRETTY_NAME", &pretty_name,
1355 "ANSI_COLOR", &ansi_color);
1356 if (r < 0)
1357 log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1358 "Failed to read os-release file, ignoring: %m");
1359
1360 if (log_get_show_color())
1361 return status_printf(NULL, 0,
1362 "\nWelcome to \x1B[%sm%s\x1B[0m!\n",
1363 isempty(ansi_color) ? "1" : ansi_color,
1364 isempty(pretty_name) ? "Linux" : pretty_name);
1365 else
1366 return status_printf(NULL, 0,
1367 "\nWelcome to %s!\n",
1368 isempty(pretty_name) ? "Linux" : pretty_name);
1369 }
1370
1371 static int write_container_id(void) {
1372 const char *c;
1373 int r;
1374
1375 c = getenv("container");
1376 if (isempty(c))
1377 return 0;
1378
1379 RUN_WITH_UMASK(0022)
1380 r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
1381 if (r < 0)
1382 return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
1383
1384 return 1;
1385 }
1386
1387 static int bump_unix_max_dgram_qlen(void) {
1388 _cleanup_free_ char *qlen = NULL;
1389 unsigned long v;
1390 int r;
1391
1392 /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set the value
1393 * really really early during boot, so that it is actually applied to all our sockets, including the
1394 * $NOTIFY_SOCKET one. */
1395
1396 r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
1397 if (r < 0)
1398 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to read AF_UNIX datagram queue length, ignoring: %m");
1399
1400 r = safe_atolu(qlen, &v);
1401 if (r < 0)
1402 return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
1403
1404 if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
1405 return 0;
1406
1407 r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN);
1408 if (r < 0)
1409 return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1410 "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
1411
1412 return 1;
1413 }
1414
1415 static int fixup_environment(void) {
1416 _cleanup_free_ char *term = NULL;
1417 const char *t;
1418 int r;
1419
1420 /* Only fix up the environment when we are started as PID 1 */
1421 if (getpid_cached() != 1)
1422 return 0;
1423
1424 /* We expect the environment to be set correctly if run inside a container. */
1425 if (detect_container() > 0)
1426 return 0;
1427
1428 /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the backend
1429 * device used by the console. We try to make a better guess here since some consoles might not have support
1430 * for color mode for example.
1431 *
1432 * However if TERM was configured through the kernel command line then leave it alone. */
1433 r = proc_cmdline_get_key("TERM", 0, &term);
1434 if (r < 0)
1435 return r;
1436
1437 t = term ?: default_term_for_tty("/dev/console");
1438
1439 if (setenv("TERM", t, 1) < 0)
1440 return -errno;
1441
1442 /* The kernels sets HOME=/ for init. Let's undo this. */
1443 if (path_equal_ptr(getenv("HOME"), "/"))
1444 assert_se(unsetenv("HOME") == 0);
1445
1446 return 0;
1447 }
1448
static void redirect_telinit(int argc, char *argv[]) {

        /* SysV compatibility: when we are invoked under the name "init" but are not PID 1, replace
         * ourselves with the systemctl binary, passing our argument vector along. If the exec fails
         * the process exits with EXIT_FAILURE. Without HAVE_SYSV_COMPAT this function does nothing. */

#if HAVE_SYSV_COMPAT
        if (getpid_cached() != 1 && invoked_as(argv, "init")) {
                execv(SYSTEMCTL_BINARY_PATH, argv);
                log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m");
                exit(EXIT_FAILURE);
        }
#endif
}
1465
1466 static int become_shutdown(
1467 const char *shutdown_verb,
1468 int retval) {
1469
1470 char log_level[DECIMAL_STR_MAX(int) + 1],
1471 exit_code[DECIMAL_STR_MAX(uint8_t) + 1],
1472 timeout[DECIMAL_STR_MAX(usec_t) + 1];
1473
1474 const char* command_line[13] = {
1475 SYSTEMD_SHUTDOWN_BINARY_PATH,
1476 shutdown_verb,
1477 "--timeout", timeout,
1478 "--log-level", log_level,
1479 "--log-target",
1480 };
1481
1482 _cleanup_strv_free_ char **env_block = NULL;
1483 size_t pos = 7;
1484 int r;
1485 usec_t watchdog_timer = 0;
1486
1487 assert(shutdown_verb);
1488 assert(!command_line[pos]);
1489 env_block = strv_copy(environ);
1490
1491 xsprintf(log_level, "%d", log_get_max_level());
1492 xsprintf(timeout, "%" PRI_USEC "us", arg_default_timeout_stop_usec);
1493
1494 switch (log_get_target()) {
1495
1496 case LOG_TARGET_KMSG:
1497 case LOG_TARGET_JOURNAL_OR_KMSG:
1498 case LOG_TARGET_SYSLOG_OR_KMSG:
1499 command_line[pos++] = "kmsg";
1500 break;
1501
1502 case LOG_TARGET_NULL:
1503 command_line[pos++] = "null";
1504 break;
1505
1506 case LOG_TARGET_CONSOLE:
1507 default:
1508 command_line[pos++] = "console";
1509 break;
1510 };
1511
1512 if (log_get_show_color())
1513 command_line[pos++] = "--log-color";
1514
1515 if (log_get_show_location())
1516 command_line[pos++] = "--log-location";
1517
1518 if (log_get_show_time())
1519 command_line[pos++] = "--log-time";
1520
1521 if (streq(shutdown_verb, "exit")) {
1522 command_line[pos++] = "--exit-code";
1523 command_line[pos++] = exit_code;
1524 xsprintf(exit_code, "%d", retval);
1525 }
1526
1527 assert(pos < ELEMENTSOF(command_line));
1528
1529 if (streq(shutdown_verb, "reboot"))
1530 watchdog_timer = arg_reboot_watchdog;
1531 else if (streq(shutdown_verb, "kexec"))
1532 watchdog_timer = arg_kexec_watchdog;
1533
1534 if (watchdog_timer > 0 && watchdog_timer != USEC_INFINITY) {
1535
1536 char *e;
1537
1538 /* If we reboot or kexec let's set the shutdown
1539 * watchdog and tell the shutdown binary to
1540 * repeatedly ping it */
1541 r = watchdog_set_timeout(&watchdog_timer);
1542 watchdog_close(r < 0);
1543
1544 /* Tell the binary how often to ping, ignore failure */
1545 if (asprintf(&e, "WATCHDOG_USEC="USEC_FMT, watchdog_timer) > 0)
1546 (void) strv_consume(&env_block, e);
1547
1548 if (arg_watchdog_device &&
1549 asprintf(&e, "WATCHDOG_DEVICE=%s", arg_watchdog_device) > 0)
1550 (void) strv_consume(&env_block, e);
1551 } else
1552 watchdog_close(true);
1553
1554 /* Avoid the creation of new processes forked by the
1555 * kernel; at this point, we will not listen to the
1556 * signals anyway */
1557 if (detect_container() <= 0)
1558 (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);
1559
1560 execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
1561 return -errno;
1562 }
1563
1564 static void initialize_clock(void) {
1565 int r;
1566
1567 /* This is called very early on, before we parse the kernel command line or otherwise figure out why
1568 * we are running, but only once. */
1569
1570 if (clock_is_localtime(NULL) > 0) {
1571 int min;
1572
1573 /*
1574 * The very first call of settimeofday() also does a time warp in the kernel.
1575 *
1576 * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to take care
1577 * of maintaining the RTC and do all adjustments. This matches the behavior of Windows, which leaves
1578 * the RTC alone if the registry tells that the RTC runs in UTC.
1579 */
1580 r = clock_set_timezone(&min);
1581 if (r < 0)
1582 log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
1583 else
1584 log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
1585
1586 } else if (!in_initrd())
1587 /*
1588 * Do a dummy very first call to seal the kernel's time warp magic.
1589 *
1590 * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with LOCAL, but the
1591 * real system could be set up that way. In such case, we need to delay the time-warp or the sealing
1592 * until we reach the real system.
1593 *
1594 * Do no set the kernel's timezone. The concept of local time cannot be supported reliably, the time
1595 * will jump or be incorrect at every daylight saving time change. All kernel local time concepts will
1596 * be treated as UTC that way.
1597 */
1598 (void) clock_reset_timewarp();
1599
1600 r = clock_apply_epoch();
1601 if (r < 0)
1602 log_error_errno(r, "Current system time is before build time, but cannot correct: %m");
1603 else if (r > 0)
1604 log_info("System time before build time, advancing clock.");
1605 }
1606
1607 static void apply_clock_update(void) {
1608 struct timespec ts;
1609
1610 /* This is called later than initialize_clock(), i.e. after we parsed configuration files/kernel
1611 * command line and such. */
1612
1613 if (arg_clock_usec == 0)
1614 return;
1615
1616 if (getpid_cached() != 1)
1617 return;
1618
1619 if (clock_settime(CLOCK_REALTIME, timespec_store(&ts, arg_clock_usec)) < 0)
1620 log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
1621 else {
1622 char buf[FORMAT_TIMESTAMP_MAX];
1623
1624 log_info("Set system clock to %s, as specified on the kernel command line.",
1625 format_timestamp(buf, sizeof(buf), arg_clock_usec));
1626 }
1627 }
1628
1629 static void cmdline_take_random_seed(void) {
1630 size_t suggested;
1631 int r;
1632
1633 if (arg_random_seed_size == 0)
1634 return;
1635
1636 if (getpid_cached() != 1)
1637 return;
1638
1639 assert(arg_random_seed);
1640 suggested = random_pool_size();
1641
1642 if (arg_random_seed_size < suggested)
1643 log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
1644 arg_random_seed_size, suggested);
1645
1646 r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
1647 if (r < 0) {
1648 log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
1649 return;
1650 }
1651
1652 log_notice("Successfully credited entropy passed on kernel command line.\n"
1653 "Note that the seed provided this way is accessible to unprivileged programs. This functionality should not be used outside of testing environments.");
1654 }
1655
/* Sets up core dump handling for PID 1: lifts RLIMIT_CORE and, unless skip_setup is true, disables
 * core dumps via disable_coredumps(). A NOP when built without ENABLE_COREDUMP or when not PID 1. */
static void initialize_coredump(bool skip_setup) {
#if ENABLE_COREDUMP
        if (getpid_cached() != 1)
                return;

        /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which
         * honour the limit) will process core dumps for system services by default. */
        if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
                log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");

        if (skip_setup)
                return;

        /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are
         * stored until the systemd-coredump tool is enabled via sysctl. However it can be changed via
         * the kernel command line later so core dumps can still be generated during early startup and
         * in initramfs. */
        disable_coredumps();
#endif
}
1674
1675 static void initialize_core_pattern(bool skip_setup) {
1676 int r;
1677
1678 if (skip_setup || !arg_early_core_pattern)
1679 return;
1680
1681 if (getpid_cached() != 1)
1682 return;
1683
1684 r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
1685 if (r < 0)
1686 log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern);
1687 }
1688
1689 static void update_cpu_affinity(bool skip_setup) {
1690 _cleanup_free_ char *mask = NULL;
1691
1692 if (skip_setup || !arg_cpu_affinity.set)
1693 return;
1694
1695 assert(arg_cpu_affinity.allocated > 0);
1696
1697 mask = cpu_set_to_string(&arg_cpu_affinity);
1698 log_debug("Setting CPU affinity to %s.", strnull(mask));
1699
1700 if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
1701 log_warning_errno(errno, "Failed to set CPU affinity: %m");
1702 }
1703
1704 static void update_numa_policy(bool skip_setup) {
1705 int r;
1706 _cleanup_free_ char *nodes = NULL;
1707 const char * policy = NULL;
1708
1709 if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
1710 return;
1711
1712 if (DEBUG_LOGGING) {
1713 policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
1714 nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
1715 log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy), strnull(nodes));
1716 }
1717
1718 r = apply_numa_policy(&arg_numa_policy);
1719 if (r == -EOPNOTSUPP)
1720 log_debug_errno(r, "NUMA support not available, ignoring.");
1721 else if (r < 0)
1722 log_warning_errno(r, "Failed to set NUMA memory policy: %m");
1723 }
1724
1725 static void do_reexecute(
1726 int argc,
1727 char *argv[],
1728 const struct rlimit *saved_rlimit_nofile,
1729 const struct rlimit *saved_rlimit_memlock,
1730 FDSet *fds,
1731 const char *switch_root_dir,
1732 const char *switch_root_init,
1733 const char **ret_error_message) {
1734
1735 unsigned i, j, args_size;
1736 const char **args;
1737 int r;
1738
1739 assert(saved_rlimit_nofile);
1740 assert(saved_rlimit_memlock);
1741 assert(ret_error_message);
1742
1743 /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get rebooted while
1744 * we do that */
1745 watchdog_close(true);
1746
1747 /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
1748 * the kernel default to its child processes */
1749 if (saved_rlimit_nofile->rlim_cur != 0)
1750 (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
1751 if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
1752 (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
1753
1754 if (switch_root_dir) {
1755 /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
1756 * SIGCHLD for them after deserializing. */
1757 broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec);
1758
1759 /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */
1760 r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE);
1761 if (r < 0)
1762 log_error_errno(r, "Failed to switch root, trying to continue: %m");
1763 }
1764
1765 args_size = MAX(6, argc+1);
1766 args = newa(const char*, args_size);
1767
1768 if (!switch_root_init) {
1769 char sfd[DECIMAL_STR_MAX(int) + 1];
1770
1771 /* First try to spawn ourselves with the right path, and with full serialization. We do this only if
1772 * the user didn't specify an explicit init to spawn. */
1773
1774 assert(arg_serialization);
1775 assert(fds);
1776
1777 xsprintf(sfd, "%i", fileno(arg_serialization));
1778
1779 i = 0;
1780 args[i++] = SYSTEMD_BINARY_PATH;
1781 if (switch_root_dir)
1782 args[i++] = "--switched-root";
1783 args[i++] = arg_system ? "--system" : "--user";
1784 args[i++] = "--deserialize";
1785 args[i++] = sfd;
1786 args[i++] = NULL;
1787
1788 assert(i <= args_size);
1789
1790 /*
1791 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do this is on
1792 * its own on exec(), but it will do it on exit(). Hence, to ensure we get a summary here, fork() off
1793 * a child, let it exit() cleanly, so that it prints the summary, and wait() for it in the parent,
1794 * before proceeding into the exec().
1795 */
1796 valgrind_summary_hack();
1797
1798 (void) execv(args[0], (char* const*) args);
1799 log_debug_errno(errno, "Failed to execute our own binary, trying fallback: %m");
1800 }
1801
1802 /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and envp[]. (Well,
1803 * modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[], but let's hope that
1804 * doesn't matter.) */
1805
1806 arg_serialization = safe_fclose(arg_serialization);
1807 fds = fdset_free(fds);
1808
1809 /* Reopen the console */
1810 (void) make_console_stdio();
1811
1812 for (j = 1, i = 1; j < (unsigned) argc; j++)
1813 args[i++] = argv[j];
1814 args[i++] = NULL;
1815 assert(i <= args_size);
1816
1817 /* Re-enable any blocked signals, especially important if we switch from initial ramdisk to init=... */
1818 (void) reset_all_signal_handlers();
1819 (void) reset_signal_mask();
1820 (void) rlimit_nofile_safe();
1821
1822 if (switch_root_init) {
1823 args[0] = switch_root_init;
1824 (void) execve(args[0], (char* const*) args, saved_env);
1825 log_warning_errno(errno, "Failed to execute configured init, trying fallback: %m");
1826 }
1827
1828 args[0] = "/sbin/init";
1829 (void) execv(args[0], (char* const*) args);
1830 r = -errno;
1831
1832 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
1833 ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
1834 "Failed to execute /sbin/init");
1835
1836 if (r == -ENOENT) {
1837 log_warning("No /sbin/init, trying fallback");
1838
1839 args[0] = "/bin/sh";
1840 args[1] = NULL;
1841 (void) execve(args[0], (char* const*) args, saved_env);
1842 log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m");
1843 } else
1844 log_warning_errno(r, "Failed to execute /sbin/init, giving up: %m");
1845
1846 *ret_error_message = "Failed to execute fallback shell";
1847 }
1848
1849 static int invoke_main_loop(
1850 Manager *m,
1851 const struct rlimit *saved_rlimit_nofile,
1852 const struct rlimit *saved_rlimit_memlock,
1853 bool *ret_reexecute,
1854 int *ret_retval, /* Return parameters relevant for shutting down */
1855 const char **ret_shutdown_verb, /* … */
1856 FDSet **ret_fds, /* Return parameters for reexecuting */
1857 char **ret_switch_root_dir, /* … */
1858 char **ret_switch_root_init, /* … */
1859 const char **ret_error_message) {
1860
1861 int r;
1862
1863 assert(m);
1864 assert(saved_rlimit_nofile);
1865 assert(saved_rlimit_memlock);
1866 assert(ret_reexecute);
1867 assert(ret_retval);
1868 assert(ret_shutdown_verb);
1869 assert(ret_fds);
1870 assert(ret_switch_root_dir);
1871 assert(ret_switch_root_init);
1872 assert(ret_error_message);
1873
1874 for (;;) {
1875 r = manager_loop(m);
1876 if (r < 0) {
1877 *ret_error_message = "Failed to run main loop";
1878 return log_emergency_errno(r, "Failed to run main loop: %m");
1879 }
1880
1881 switch ((ManagerObjective) r) {
1882
1883 case MANAGER_RELOAD: {
1884 LogTarget saved_log_target;
1885 int saved_log_level;
1886
1887 log_info("Reloading.");
1888
1889 /* First, save any overridden log level/target, then parse the configuration file, which might
1890 * change the log level to new settings. */
1891
1892 saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
1893 saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
1894
1895 (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
1896
1897 set_manager_defaults(m);
1898 set_manager_settings(m);
1899
1900 update_cpu_affinity(false);
1901 update_numa_policy(false);
1902
1903 if (saved_log_level >= 0)
1904 manager_override_log_level(m, saved_log_level);
1905 if (saved_log_target >= 0)
1906 manager_override_log_target(m, saved_log_target);
1907
1908 r = manager_reload(m);
1909 if (r < 0)
1910 /* Reloading failed before the point of no return. Let's continue running as if nothing happened. */
1911 m->objective = MANAGER_OK;
1912
1913 break;
1914 }
1915
1916 case MANAGER_REEXECUTE:
1917
1918 r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
1919 if (r < 0) {
1920 *ret_error_message = "Failed to prepare for reexecution";
1921 return r;
1922 }
1923
1924 log_notice("Reexecuting.");
1925
1926 *ret_reexecute = true;
1927 *ret_retval = EXIT_SUCCESS;
1928 *ret_shutdown_verb = NULL;
1929 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1930
1931 return 0;
1932
1933 case MANAGER_SWITCH_ROOT:
1934 if (!m->switch_root_init) {
1935 r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
1936 if (r < 0) {
1937 *ret_error_message = "Failed to prepare for reexecution";
1938 return r;
1939 }
1940 } else
1941 *ret_fds = NULL;
1942
1943 log_notice("Switching root.");
1944
1945 *ret_reexecute = true;
1946 *ret_retval = EXIT_SUCCESS;
1947 *ret_shutdown_verb = NULL;
1948
1949 /* Steal the switch root parameters */
1950 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
1951 *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
1952
1953 return 0;
1954
1955 case MANAGER_EXIT:
1956
1957 if (MANAGER_IS_USER(m)) {
1958 log_debug("Exit.");
1959
1960 *ret_reexecute = false;
1961 *ret_retval = m->return_value;
1962 *ret_shutdown_verb = NULL;
1963 *ret_fds = NULL;
1964 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1965
1966 return 0;
1967 }
1968
1969 _fallthrough_;
1970 case MANAGER_REBOOT:
1971 case MANAGER_POWEROFF:
1972 case MANAGER_HALT:
1973 case MANAGER_KEXEC: {
1974 static const char * const table[_MANAGER_OBJECTIVE_MAX] = {
1975 [MANAGER_EXIT] = "exit",
1976 [MANAGER_REBOOT] = "reboot",
1977 [MANAGER_POWEROFF] = "poweroff",
1978 [MANAGER_HALT] = "halt",
1979 [MANAGER_KEXEC] = "kexec",
1980 };
1981
1982 log_notice("Shutting down.");
1983
1984 *ret_reexecute = false;
1985 *ret_retval = m->return_value;
1986 assert_se(*ret_shutdown_verb = table[m->objective]);
1987 *ret_fds = NULL;
1988 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1989
1990 return 0;
1991 }
1992
1993 default:
1994 assert_not_reached("Unknown or unexpected manager objective.");
1995 }
1996 }
1997 }
1998
1999 static void log_execution_mode(bool *ret_first_boot) {
2000 assert(ret_first_boot);
2001
2002 if (arg_system) {
2003 int v;
2004
2005 log_info("systemd " GIT_VERSION " running in %ssystem mode. (%s)",
2006 arg_action == ACTION_TEST ? "test " : "",
2007 systemd_features);
2008
2009 v = detect_virtualization();
2010 if (v > 0)
2011 log_info("Detected virtualization %s.", virtualization_to_string(v));
2012
2013 log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
2014
2015 if (in_initrd()) {
2016 *ret_first_boot = false;
2017 log_info("Running in initial RAM disk.");
2018 } else {
2019 int r;
2020 _cleanup_free_ char *id_text = NULL;
2021
2022 /* Let's check whether we are in first boot. We use /etc/machine-id as flag file
2023 * for this: If it is missing or contains the value "uninitialized", this is the
2024 * first boot. In any other case, it is not. This allows container managers and
2025 * installers to provision a couple of files already. If the container manager
2026 * wants to provision the machine ID itself it should pass $container_uuid to PID 1. */
2027
2028 r = read_one_line_file("/etc/machine-id", &id_text);
2029 if (r < 0 || streq(id_text, "uninitialized")) {
2030 if (r < 0 && r != -ENOENT)
2031 log_warning_errno(r, "Unexpected error while reading /etc/machine-id, ignoring: %m");
2032
2033 *ret_first_boot = true;
2034 log_info("Detected first boot.");
2035 } else {
2036 *ret_first_boot = false;
2037 log_debug("Detected initialized system, this is not the first boot.");
2038 }
2039 }
2040 } else {
2041 if (DEBUG_LOGGING) {
2042 _cleanup_free_ char *t;
2043
2044 t = uid_to_name(getuid());
2045 log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
2046 arg_action == ACTION_TEST ? " test" : "",
2047 getuid(), strna(t), systemd_features);
2048 }
2049
2050 *ret_first_boot = false;
2051 }
2052 }
2053
2054 static int initialize_runtime(
2055 bool skip_setup,
2056 bool first_boot,
2057 struct rlimit *saved_rlimit_nofile,
2058 struct rlimit *saved_rlimit_memlock,
2059 const char **ret_error_message) {
2060 int r;
2061
2062 assert(ret_error_message);
2063
2064 /* Sets up various runtime parameters. Many of these initializations are conditionalized:
2065 *
2066 * - Some only apply to --system instances
2067 * - Some only apply to --user instances
2068 * - Some only apply when we first start up, but not when we reexecute
2069 */
2070
2071 if (arg_action != ACTION_RUN)
2072 return 0;
2073
2074 update_cpu_affinity(skip_setup);
2075 update_numa_policy(skip_setup);
2076
2077 if (arg_system) {
2078 /* Make sure we leave a core dump without panicking the kernel. */
2079 install_crash_handler();
2080
2081 if (!skip_setup) {
2082 r = mount_cgroup_controllers();
2083 if (r < 0) {
2084 *ret_error_message = "Failed to mount cgroup hierarchies";
2085 return r;
2086 }
2087
2088 status_welcome();
2089 (void) hostname_setup(true);
2090 /* Force transient machine-id on first boot. */
2091 machine_id_setup(NULL, first_boot, arg_machine_id, NULL);
2092 (void) loopback_setup();
2093 bump_unix_max_dgram_qlen();
2094 bump_file_max_and_nr_open();
2095 test_usr();
2096 write_container_id();
2097 }
2098
2099 if (arg_watchdog_device) {
2100 r = watchdog_set_device(arg_watchdog_device);
2101 if (r < 0)
2102 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
2103 }
2104 } else {
2105 _cleanup_free_ char *p = NULL;
2106
2107 /* Create the runtime directory and place the inaccessible device nodes there, if we run in
2108 * user mode. In system mode mount_setup() already did that. */
2109
2110 r = xdg_user_runtime_dir(&p, "/systemd");
2111 if (r < 0) {
2112 *ret_error_message = "$XDG_RUNTIME_DIR is not set";
2113 return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m");
2114 }
2115
2116 (void) mkdir_p_label(p, 0755);
2117 (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
2118 }
2119
2120 if (arg_timer_slack_nsec != NSEC_INFINITY)
2121 if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
2122 log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
2123
2124 if (arg_system && !cap_test_all(arg_capability_bounding_set)) {
2125 r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
2126 if (r < 0) {
2127 *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
2128 return log_emergency_errno(r, "Failed to drop capability bounding set of usermode helpers: %m");
2129 }
2130
2131 r = capability_bounding_set_drop(arg_capability_bounding_set, true);
2132 if (r < 0) {
2133 *ret_error_message = "Failed to drop capability bounding set";
2134 return log_emergency_errno(r, "Failed to drop capability bounding set: %m");
2135 }
2136 }
2137
2138 if (arg_system && arg_no_new_privs) {
2139 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2140 *ret_error_message = "Failed to disable new privileges";
2141 return log_emergency_errno(errno, "Failed to disable new privileges: %m");
2142 }
2143 }
2144
2145 if (arg_syscall_archs) {
2146 r = enforce_syscall_archs(arg_syscall_archs);
2147 if (r < 0) {
2148 *ret_error_message = "Failed to set syscall architectures";
2149 return r;
2150 }
2151 }
2152
2153 if (!arg_system)
2154 /* Become reaper of our children */
2155 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
2156 log_warning_errno(errno, "Failed to make us a subreaper: %m");
2157
2158 /* Bump up RLIMIT_NOFILE for systemd itself */
2159 (void) bump_rlimit_nofile(saved_rlimit_nofile);
2160 (void) bump_rlimit_memlock(saved_rlimit_memlock);
2161
2162 return 0;
2163 }
2164
2165 static int do_queue_default_job(
2166 Manager *m,
2167 const char **ret_error_message) {
2168
2169 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2170 const char *unit;
2171 Job *job;
2172 Unit *target;
2173 int r;
2174
2175 if (arg_default_unit)
2176 unit = arg_default_unit;
2177 else if (in_initrd())
2178 unit = SPECIAL_INITRD_TARGET;
2179 else
2180 unit = SPECIAL_DEFAULT_TARGET;
2181
2182 log_debug("Activating default unit: %s", unit);
2183
2184 r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
2185 if (r < 0 && in_initrd() && !arg_default_unit) {
2186 /* Fall back to default.target, which we used to always use by default. Only do this if no
2187 * explicit configuration was given. */
2188
2189 log_info("Falling back to " SPECIAL_DEFAULT_TARGET ".");
2190
2191 r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
2192 }
2193 if (r < 0) {
2194 log_info("Falling back to " SPECIAL_RESCUE_TARGET ".");
2195
2196 r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
2197 if (r < 0) {
2198 *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
2199 : "Failed to load " SPECIAL_RESCUE_TARGET;
2200 return r;
2201 }
2202 }
2203
2204 assert(target->load_state == UNIT_LOADED);
2205
2206 r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job);
2207 if (r == -EPERM) {
2208 log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
2209
2210 sd_bus_error_free(&error);
2211
2212 r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job);
2213 if (r < 0) {
2214 *ret_error_message = "Failed to start default target";
2215 return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r));
2216 }
2217
2218 } else if (r < 0) {
2219 *ret_error_message = "Failed to isolate default target";
2220 return log_emergency_errno(r, "Failed to isolate default target: %s", bus_error_message(&error, r));
2221 } else
2222 log_info("Queued %s job for default target %s.",
2223 job_type_to_string(job->type),
2224 unit_status_string(job->unit));
2225
2226 m->default_unit_job_id = job->id;
2227
2228 return 0;
2229 }
2230
/* Queries the current RLIMIT_NOFILE and RLIMIT_MEMLOCK settings of this process and stores them in the
 * two supplied structures. If a query fails a warning is logged and nothing else is done for that
 * limit. */
static void save_rlimits(struct rlimit *saved_rlimit_nofile,
                         struct rlimit *saved_rlimit_memlock) {

        int k;

        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);

        k = getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");

        k = getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
}
2243
2244 static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
2245 struct rlimit *rl;
2246
2247 if (arg_default_rlimit[RLIMIT_NOFILE])
2248 return;
2249
2250 /* Make sure forked processes get limits based on the original kernel setting */
2251
2252 rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
2253 if (!rl) {
2254 log_oom();
2255 return;
2256 }
2257
2258 /* Bump the hard limit for system services to a substantially higher value. The default
2259 * hard limit current kernels set is pretty low (4K), mostly for historical
2260 * reasons. According to kernel developers, the fd handling in recent kernels has been
2261 * optimized substantially enough, so that we can bump the limit now, without paying too
2262 * high a price in memory or performance. Note however that we only bump the hard limit,
2263 * not the soft limit. That's because select() works the way it works, and chokes on fds
2264 * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
2265 * unexpecting programs that they get fds higher than what they can process using
2266 * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
2267 * this pitfall: programs that are written by folks aware of the select() problem in mind
2268 * (and thus use poll()/epoll instead of select(), the way everybody should) can
2269 * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
2270 * we pass. */
2271 if (arg_system) {
2272 int nr;
2273
2274 /* Get the underlying absolute limit the kernel enforces */
2275 nr = read_nr_open();
2276
2277 rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
2278 }
2279
2280 /* If for some reason we were invoked with a soft limit above 1024 (which should never
2281 * happen!, but who knows what we get passed in from pam_limit when invoked as --user
2282 * instance), then lower what we pass on to not confuse our children */
2283 rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
2284
2285 arg_default_rlimit[RLIMIT_NOFILE] = rl;
2286 }
2287
2288 static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
2289 struct rlimit *rl;
2290
2291 /* Pass the original value down to invoked processes */
2292
2293 if (arg_default_rlimit[RLIMIT_MEMLOCK])
2294 return;
2295
2296 rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
2297 if (!rl) {
2298 log_oom();
2299 return;
2300 }
2301
2302 arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
2303 }
2304
2305 static void setenv_manager_environment(void) {
2306 char **p;
2307 int r;
2308
2309 STRV_FOREACH(p, arg_manager_environment) {
2310 log_debug("Setting '%s' in our own environment.", *p);
2311
2312 r = putenv_dup(*p, true);
2313 if (r < 0)
2314 log_warning_errno(errno, "Failed to setenv \"%s\", ignoring: %m", *p);
2315 }
2316 }
2317
2318 static void reset_arguments(void) {
2319 /* Frees/resets arg_* variables, with a few exceptions commented below. */
2320
2321 arg_default_unit = mfree(arg_default_unit);
2322
2323 /* arg_system — ignore */
2324
2325 arg_dump_core = true;
2326 arg_crash_chvt = -1;
2327 arg_crash_shell = false;
2328 arg_crash_reboot = false;
2329 arg_confirm_spawn = mfree(arg_confirm_spawn);
2330 arg_show_status = _SHOW_STATUS_INVALID;
2331 arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
2332 arg_switched_root = false;
2333 arg_pager_flags = 0;
2334 arg_service_watchdogs = true;
2335 arg_default_std_output = EXEC_OUTPUT_JOURNAL;
2336 arg_default_std_error = EXEC_OUTPUT_INHERIT;
2337 arg_default_restart_usec = DEFAULT_RESTART_USEC;
2338 arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
2339 arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
2340 arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC;
2341 arg_default_timeout_abort_set = false;
2342 arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
2343 arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
2344 arg_runtime_watchdog = 0;
2345 arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
2346 arg_kexec_watchdog = 0;
2347 arg_early_core_pattern = NULL;
2348 arg_watchdog_device = NULL;
2349
2350 arg_default_environment = strv_free(arg_default_environment);
2351 arg_manager_environment = strv_free(arg_manager_environment);
2352 rlimit_free_all(arg_default_rlimit);
2353
2354 arg_capability_bounding_set = CAP_ALL;
2355 arg_no_new_privs = false;
2356 arg_timer_slack_nsec = NSEC_INFINITY;
2357 arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE;
2358
2359 arg_syscall_archs = set_free(arg_syscall_archs);
2360
2361 /* arg_serialization — ignore */
2362
2363 arg_default_cpu_accounting = -1;
2364 arg_default_io_accounting = false;
2365 arg_default_ip_accounting = false;
2366 arg_default_blockio_accounting = false;
2367 arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
2368 arg_default_tasks_accounting = true;
2369 arg_default_tasks_max = DEFAULT_TASKS_MAX;
2370 arg_machine_id = (sd_id128_t) {};
2371 arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
2372 arg_default_oom_policy = OOM_STOP;
2373
2374 cpu_set_reset(&arg_cpu_affinity);
2375 numa_policy_reset(&arg_numa_policy);
2376
2377 arg_random_seed = mfree(arg_random_seed);
2378 arg_random_seed_size = 0;
2379 arg_clock_usec = 0;
2380 }
2381
2382 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
2383 const struct rlimit *saved_rlimit_memlock) {
2384 int r;
2385
2386 assert(saved_rlimit_nofile);
2387 assert(saved_rlimit_memlock);
2388
2389 /* Assign configuration defaults */
2390 reset_arguments();
2391
2392 r = parse_config_file();
2393 if (r < 0)
2394 log_warning_errno(r, "Failed to parse config file, ignoring: %m");
2395
2396 if (arg_system) {
2397 r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
2398 if (r < 0)
2399 log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2400 }
2401
2402 /* Initialize some default rlimits for services if they haven't been configured */
2403 fallback_rlimit_nofile(saved_rlimit_nofile);
2404 fallback_rlimit_memlock(saved_rlimit_memlock);
2405
2406 /* Note that this also parses bits from the kernel command line, including "debug". */
2407 log_parse_environment();
2408
2409 /* Initialize the show status setting if it hasn't been set explicitly yet */
2410 if (arg_show_status == _SHOW_STATUS_INVALID)
2411 arg_show_status = SHOW_STATUS_YES;
2412
2413 /* Push variables into the manager environment block */
2414 setenv_manager_environment();
2415
2416 return 0;
2417 }
2418
2419 static int safety_checks(void) {
2420
2421 if (getpid_cached() == 1 &&
2422 arg_action != ACTION_RUN)
2423 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2424 "Unsupported execution mode while PID 1.");
2425
2426 if (getpid_cached() == 1 &&
2427 !arg_system)
2428 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2429 "Can't run --user mode as PID 1.");
2430
2431 if (arg_action == ACTION_RUN &&
2432 arg_system &&
2433 getpid_cached() != 1)
2434 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2435 "Can't run system mode unless PID 1.");
2436
2437 if (arg_action == ACTION_TEST &&
2438 geteuid() == 0)
2439 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2440 "Don't run test mode as root.");
2441
2442 if (!arg_system &&
2443 arg_action == ACTION_RUN &&
2444 sd_booted() <= 0)
2445 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2446 "Trying to run as user instance, but the system has not been booted with systemd.");
2447
2448 if (!arg_system &&
2449 arg_action == ACTION_RUN &&
2450 !getenv("XDG_RUNTIME_DIR"))
2451 return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
2452 "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
2453
2454 if (arg_system &&
2455 arg_action == ACTION_RUN &&
2456 running_in_chroot() > 0)
2457 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2458 "Cannot be run in a chroot() environment.");
2459
2460 return 0;
2461 }
2462
2463 static int initialize_security(
2464 bool *loaded_policy,
2465 dual_timestamp *security_start_timestamp,
2466 dual_timestamp *security_finish_timestamp,
2467 const char **ret_error_message) {
2468
2469 int r;
2470
2471 assert(loaded_policy);
2472 assert(security_start_timestamp);
2473 assert(security_finish_timestamp);
2474 assert(ret_error_message);
2475
2476 dual_timestamp_get(security_start_timestamp);
2477
2478 r = mac_selinux_setup(loaded_policy);
2479 if (r < 0) {
2480 *ret_error_message = "Failed to load SELinux policy";
2481 return r;
2482 }
2483
2484 r = mac_smack_setup(loaded_policy);
2485 if (r < 0) {
2486 *ret_error_message = "Failed to load SMACK policy";
2487 return r;
2488 }
2489
2490 r = mac_apparmor_setup();
2491 if (r < 0) {
2492 *ret_error_message = "Failed to load AppArmor policy";
2493 return r;
2494 }
2495
2496 r = ima_setup();
2497 if (r < 0) {
2498 *ret_error_message = "Failed to load IMA policy";
2499 return r;
2500 }
2501
2502 dual_timestamp_get(security_finish_timestamp);
2503 return 0;
2504 }
2505
2506 static void test_summary(Manager *m) {
2507 assert(m);
2508
2509 printf("-> By units:\n");
2510 manager_dump_units(m, stdout, "\t");
2511
2512 printf("-> By jobs:\n");
2513 manager_dump_jobs(m, stdout, "\t");
2514 }
2515
2516 static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
2517 int r;
2518
2519 assert(ret_fds);
2520 assert(ret_error_message);
2521
2522 r = fdset_new_fill(ret_fds);
2523 if (r < 0) {
2524 *ret_error_message = "Failed to allocate fd set";
2525 return log_emergency_errno(r, "Failed to allocate fd set: %m");
2526 }
2527
2528 fdset_cloexec(*ret_fds, true);
2529
2530 if (arg_serialization)
2531 assert_se(fdset_remove(*ret_fds, fileno(arg_serialization)) >= 0);
2532
2533 return 0;
2534 }
2535
2536 static void setup_console_terminal(bool skip_setup) {
2537
2538 if (!arg_system)
2539 return;
2540
2541 /* Become a session leader if we aren't one yet. */
2542 (void) setsid();
2543
2544 /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a controlling
2545 * tty. */
2546 (void) release_terminal();
2547
2548 /* Reset the console, but only if this is really init and we are freshly booted */
2549 if (getpid_cached() == 1 && !skip_setup)
2550 (void) console_setup();
2551 }
2552
/* Takes a quick peek at the command line, long before the full parsing is done, to determine whether
 * this is a reexecution or a normal bootup. Returns true if "--deserialize" is among the arguments
 * and "--switched-root" is not: in that case we are reexecuting and the extensive setup can be
 * skipped. If we have switched root all the special setup is done anyway, even though we also
 * deserialize in that case. */
static bool early_skip_setup_check(int argc, char *argv[]) {
        bool deserializing = false;

        for (int idx = 1; idx < argc; idx++) {
                const char *arg = argv[idx];

                /* If we switched root, don't skip the setup, whatever else is specified. */
                if (streq(arg, "--switched-root"))
                        return false;

                if (streq(arg, "--deserialize"))
                        deserializing = true;
        }

        return deserializing;
}
2570
2571 static int save_env(void) {
2572 char **l;
2573
2574 l = strv_copy(environ);
2575 if (!l)
2576 return -ENOMEM;
2577
2578 strv_free_and_replace(saved_env, l);
2579 return 0;
2580 }
2581
2582 int main(int argc, char *argv[]) {
2583
2584 dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL,
2585 security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL;
2586 struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
2587 saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
2588 * in. Note we use different values
2589 * for the two that indicate whether
2590 * these fields are initialized! */
2591 bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false;
2592 char *switch_root_dir = NULL, *switch_root_init = NULL;
2593 usec_t before_startup, after_startup;
2594 static char systemd[] = "systemd";
2595 char timespan[FORMAT_TIMESPAN_MAX];
2596 const char *shutdown_verb = NULL, *error_message = NULL;
2597 int r, retval = EXIT_FAILURE;
2598 Manager *m = NULL;
2599 FDSet *fds = NULL;
2600
2601 /* SysV compatibility: redirect init → telinit */
2602 redirect_telinit(argc, argv);
2603
2604 /* Take timestamps early on */
2605 dual_timestamp_from_monotonic(&kernel_timestamp, 0);
2606 dual_timestamp_get(&userspace_timestamp);
2607
2608 /* Figure out whether we need to do initialize the system, or if we already did that because we are
2609 * reexecuting */
2610 skip_setup = early_skip_setup_check(argc, argv);
2611
2612 /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent reexecution we
2613 * are then called 'systemd'. That is confusing, hence let's call us systemd right-away. */
2614 program_invocation_short_name = systemd;
2615 (void) prctl(PR_SET_NAME, systemd);
2616
2617 /* Save the original command line */
2618 save_argc_argv(argc, argv);
2619
2620 /* Save the original environment as we might need to restore it if we're requested to execute another
2621 * system manager later. */
2622 r = save_env();
2623 if (r < 0) {
2624 error_message = "Failed to copy environment block";
2625 goto finish;
2626 }
2627
2628 /* Make sure that if the user says "syslog" we actually log to the journal. */
2629 log_set_upgrade_syslog_to_journal(true);
2630
2631 if (getpid_cached() == 1) {
2632 /* When we run as PID 1 force system mode */
2633 arg_system = true;
2634
2635 /* Disable the umask logic */
2636 umask(0);
2637
2638 /* Make sure that at least initially we do not ever log to journald/syslogd, because it might not be
2639 * activated yet (even though the log socket for it exists). */
2640 log_set_prohibit_ipc(true);
2641
2642 /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This is
2643 * important so that we never end up logging to any foreign stderr, for example if we have to log in a
2644 * child process right before execve()'ing the actual binary, at a point in time where socket
2645 * activation stderr/stdout area already set up. */
2646 log_set_always_reopen_console(true);
2647
2648 if (detect_container() <= 0) {
2649
2650 /* Running outside of a container as PID 1 */
2651 log_set_target(LOG_TARGET_KMSG);
2652 log_open();
2653
2654 if (in_initrd())
2655 initrd_timestamp = userspace_timestamp;
2656
2657 if (!skip_setup) {
2658 r = mount_setup_early();
2659 if (r < 0) {
2660 error_message = "Failed to mount early API filesystems";
2661 goto finish;
2662 }
2663
2664 /* Let's open the log backend a second time, in case the first time didn't
2665 * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
2666 * available, and it previously wasn't. */
2667 log_open();
2668
2669 disable_printk_ratelimit();
2670
2671 r = initialize_security(
2672 &loaded_policy,
2673 &security_start_timestamp,
2674 &security_finish_timestamp,
2675 &error_message);
2676 if (r < 0)
2677 goto finish;
2678 }
2679
2680 if (mac_selinux_init() < 0) {
2681 error_message = "Failed to initialize SELinux support";
2682 goto finish;
2683 }
2684
2685 if (!skip_setup)
2686 initialize_clock();
2687
2688 /* Set the default for later on, but don't actually open the logs like this for now. Note that
2689 * if we are transitioning from the initrd there might still be journal fd open, and we
2690 * shouldn't attempt opening that before we parsed /proc/cmdline which might redirect output
2691 * elsewhere. */
2692 log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
2693
2694 } else {
2695 /* Running inside a container, as PID 1 */
2696 log_set_target(LOG_TARGET_CONSOLE);
2697 log_open();
2698
2699 /* For later on, see above... */
2700 log_set_target(LOG_TARGET_JOURNAL);
2701
2702 /* clear the kernel timestamp, because we are in a container */
2703 kernel_timestamp = DUAL_TIMESTAMP_NULL;
2704 }
2705
2706 initialize_coredump(skip_setup);
2707
2708 r = fixup_environment();
2709 if (r < 0) {
2710 log_emergency_errno(r, "Failed to fix up PID 1 environment: %m");
2711 error_message = "Failed to fix up PID1 environment";
2712 goto finish;
2713 }
2714
2715 /* Try to figure out if we can use colors with the console. No need to do that for user instances since
2716 * they never log into the console. */
2717 log_show_color(colors_enabled());
2718
2719 r = make_null_stdio();
2720 if (r < 0)
2721 log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
2722
2723 /* Load the kernel modules early. */
2724 if (!skip_setup)
2725 kmod_setup();
2726
2727 /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
2728 r = mount_setup(loaded_policy, skip_setup);
2729 if (r < 0) {
2730 error_message = "Failed to mount API filesystems";
2731 goto finish;
2732 }
2733
2734 /* The efivarfs is now mounted, let's read the random seed off it */
2735 (void) efi_take_random_seed();
2736
2737 /* Cache command-line options passed from EFI variables */
2738 if (!skip_setup)
2739 (void) cache_efi_options_variable();
2740 } else {
2741 /* Running as user instance */
2742 arg_system = false;
2743 log_set_target(LOG_TARGET_AUTO);
2744 log_open();
2745
2746 /* clear the kernel timestamp, because we are not PID 1 */
2747 kernel_timestamp = DUAL_TIMESTAMP_NULL;
2748
2749 if (mac_selinux_init() < 0) {
2750 error_message = "Failed to initialize SELinux support";
2751 goto finish;
2752 }
2753 }
2754
2755 /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
2756 * transitioning from the initrd to the main systemd or suchlike. */
2757 save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
2758
2759 /* Reset all signal handlers. */
2760 (void) reset_all_signal_handlers();
2761 (void) ignore_signals(SIGNALS_IGNORE, -1);
2762
2763 (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
2764
2765 r = parse_argv(argc, argv);
2766 if (r < 0) {
2767 error_message = "Failed to parse commandline arguments";
2768 goto finish;
2769 }
2770
2771 r = safety_checks();
2772 if (r < 0)
2773 goto finish;
2774
2775 if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
2776 (void) pager_open(arg_pager_flags);
2777
2778 if (arg_action != ACTION_RUN)
2779 skip_setup = true;
2780
2781 if (arg_action == ACTION_HELP) {
2782 retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
2783 goto finish;
2784 } else if (arg_action == ACTION_VERSION) {
2785 retval = version();
2786 goto finish;
2787 } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
2788 unit_dump_config_items(stdout);
2789 retval = EXIT_SUCCESS;
2790 goto finish;
2791 } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
2792 dump_bus_properties(stdout);
2793 retval = EXIT_SUCCESS;
2794 goto finish;
2795 } else if (arg_action == ACTION_BUS_INTROSPECT) {
2796 r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
2797 retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
2798 goto finish;
2799 }
2800
2801 assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
2802
2803 /* Move out of the way, so that we won't block unmounts */
2804 assert_se(chdir("/") == 0);
2805
2806 if (arg_action == ACTION_RUN) {
2807 if (!skip_setup) {
2808 /* Apply the systemd.clock_usec= kernel command line switch */
2809 apply_clock_update();
2810
2811 /* Apply random seed from kernel command line */
2812 cmdline_take_random_seed();
2813 }
2814
2815 /* A core pattern might have been specified via the cmdline. */
2816 initialize_core_pattern(skip_setup);
2817
2818 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
2819 log_close();
2820
2821 /* Remember open file descriptors for later deserialization */
2822 r = collect_fds(&fds, &error_message);
2823 if (r < 0)
2824 goto finish;
2825
2826 /* Give up any control of the console, but make sure it's initialized. */
2827 setup_console_terminal(skip_setup);
2828
2829 /* Open the logging devices, if possible and necessary */
2830 log_open();
2831 }
2832
2833 log_execution_mode(&first_boot);
2834
2835 r = initialize_runtime(skip_setup,
2836 first_boot,
2837 &saved_rlimit_nofile,
2838 &saved_rlimit_memlock,
2839 &error_message);
2840 if (r < 0)
2841 goto finish;
2842
2843 r = manager_new(arg_system ? UNIT_FILE_SYSTEM : UNIT_FILE_USER,
2844 arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
2845 &m);
2846 if (r < 0) {
2847 log_emergency_errno(r, "Failed to allocate manager object: %m");
2848 error_message = "Failed to allocate manager object";
2849 goto finish;
2850 }
2851
2852 m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
2853 m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
2854 m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
2855 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
2856 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
2857
2858 set_manager_defaults(m);
2859 set_manager_settings(m);
2860 manager_set_first_boot(m, first_boot);
2861
2862 /* Remember whether we should queue the default job */
2863 queue_default_job = !arg_serialization || arg_switched_root;
2864
2865 before_startup = now(CLOCK_MONOTONIC);
2866
2867 r = manager_startup(m, arg_serialization, fds);
2868 if (r < 0) {
2869 error_message = "Failed to start up manager";
2870 goto finish;
2871 }
2872
2873 /* This will close all file descriptors that were opened, but not claimed by any unit. */
2874 fds = fdset_free(fds);
2875 arg_serialization = safe_fclose(arg_serialization);
2876
2877 if (queue_default_job) {
2878 r = do_queue_default_job(m, &error_message);
2879 if (r < 0)
2880 goto finish;
2881 }
2882
2883 after_startup = now(CLOCK_MONOTONIC);
2884
2885 log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
2886 "Loaded units and determined initial transaction in %s.",
2887 format_timespan(timespan, sizeof(timespan), after_startup - before_startup, 100 * USEC_PER_MSEC));
2888
2889 if (arg_action == ACTION_TEST) {
2890 test_summary(m);
2891 retval = EXIT_SUCCESS;
2892 goto finish;
2893 }
2894
2895 (void) invoke_main_loop(m,
2896 &saved_rlimit_nofile,
2897 &saved_rlimit_memlock,
2898 &reexecute,
2899 &retval,
2900 &shutdown_verb,
2901 &fds,
2902 &switch_root_dir,
2903 &switch_root_init,
2904 &error_message);
2905
2906 finish:
2907 pager_close();
2908
2909 if (m) {
2910 arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
2911 arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
2912 m = manager_free(m);
2913 }
2914
2915 mac_selinux_finish();
2916
2917 if (reexecute)
2918 do_reexecute(argc, argv,
2919 &saved_rlimit_nofile,
2920 &saved_rlimit_memlock,
2921 fds,
2922 switch_root_dir,
2923 switch_root_init,
2924 &error_message); /* This only returns if reexecution failed */
2925
2926 arg_serialization = safe_fclose(arg_serialization);
2927 fds = fdset_free(fds);
2928
2929 saved_env = strv_free(saved_env);
2930
2931 #if HAVE_VALGRIND_VALGRIND_H
2932 /* If we are PID 1 and running under valgrind, then let's exit
2933 * here explicitly. valgrind will only generate nice output on
2934 * exit(), not on exec(), hence let's do the former not the
2935 * latter here. */
2936 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
2937 /* Cleanup watchdog_device strings for valgrind. We need them
2938 * in become_shutdown() so normally we cannot free them yet. */
2939 watchdog_free_device();
2940 arg_watchdog_device = mfree(arg_watchdog_device);
2941 reset_arguments();
2942 return retval;
2943 }
2944 #endif
2945
2946 #if HAS_FEATURE_ADDRESS_SANITIZER
2947 __lsan_do_leak_check();
2948 #endif
2949
2950 if (shutdown_verb) {
2951 r = become_shutdown(shutdown_verb, retval);
2952 log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
2953 error_message = "Failed to execute shutdown binary";
2954 }
2955
2956 watchdog_free_device();
2957 arg_watchdog_device = mfree(arg_watchdog_device);
2958
2959 if (getpid_cached() == 1) {
2960 if (error_message)
2961 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
2962 ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
2963 "%s.", error_message);
2964 freeze_or_exit_or_reboot();
2965 }
2966
2967 reset_arguments();
2968 return retval;
2969 }