]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/core/main.c
resolved: reply using unicast mDNS when appropriate
[thirdparty/systemd.git] / src / core / main.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <getopt.h>
6 #include <sys/mount.h>
7 #include <sys/prctl.h>
8 #include <sys/reboot.h>
9 #include <unistd.h>
10 #if HAVE_SECCOMP
11 #include <seccomp.h>
12 #endif
13 #if HAVE_VALGRIND_VALGRIND_H
14 #include <valgrind/valgrind.h>
15 #endif
16
17 #include "sd-bus.h"
18 #include "sd-daemon.h"
19 #include "sd-messages.h"
20
21 #include "alloc-util.h"
22 #include "apparmor-setup.h"
23 #include "architecture.h"
24 #include "build.h"
25 #include "bus-error.h"
26 #include "bus-util.h"
27 #include "capability-util.h"
28 #include "cgroup-util.h"
29 #include "clock-util.h"
30 #include "conf-parser.h"
31 #include "cpu-set-util.h"
32 #include "dbus-manager.h"
33 #include "dbus.h"
34 #include "def.h"
35 #include "dev-setup.h"
36 #include "efi-random.h"
37 #include "efivars.h"
38 #include "emergency-action.h"
39 #include "env-util.h"
40 #include "exit-status.h"
41 #include "fd-util.h"
42 #include "fdset.h"
43 #include "fileio.h"
44 #include "format-util.h"
45 #include "fs-util.h"
46 #include "hexdecoct.h"
47 #include "hostname-setup.h"
48 #include "ima-setup.h"
49 #include "killall.h"
50 #include "kmod-setup.h"
51 #include "limits-util.h"
52 #include "load-fragment.h"
53 #include "log.h"
54 #include "loopback-setup.h"
55 #include "machine-id-setup.h"
56 #include "manager.h"
57 #include "mkdir.h"
58 #include "mount-setup.h"
59 #include "os-util.h"
60 #include "pager.h"
61 #include "parse-argument.h"
62 #include "parse-util.h"
63 #include "path-util.h"
64 #include "pretty-print.h"
65 #include "proc-cmdline.h"
66 #include "process-util.h"
67 #include "random-util.h"
68 #include "raw-clone.h"
69 #include "rlimit-util.h"
70 #if HAVE_SECCOMP
71 #include "seccomp-util.h"
72 #endif
73 #include "selinux-setup.h"
74 #include "selinux-util.h"
75 #include "signal-util.h"
76 #include "smack-setup.h"
77 #include "special.h"
78 #include "stat-util.h"
79 #include "stdio-util.h"
80 #include "strv.h"
81 #include "switch-root.h"
82 #include "sysctl-util.h"
83 #include "terminal-util.h"
84 #include "umask-util.h"
85 #include "user-util.h"
86 #include "util.h"
87 #include "virt.h"
88 #include "watchdog.h"
89
90 #if HAS_FEATURE_ADDRESS_SANITIZER
91 #include <sanitizer/lsan_interface.h>
92 #endif
93
/* Built-in default TasksMax value, expressed as the pair { 15U, 100U }, i.e. 15%. */
#define DEFAULT_TASKS_MAX ((TasksMax) { 15U, 100U }) /* 15% */
95
/* What this invocation has been asked to do. Starts out as ACTION_RUN; parse_argv() switches it to one of
 * the other values when the matching command line option (--test, --version,
 * --dump-configuration-items, --dump-bus-properties, --bus-introspect=) is seen. */
static enum {
        ACTION_RUN,
        ACTION_HELP,
        ACTION_VERSION,
        ACTION_TEST,
        ACTION_DUMP_CONFIGURATION_ITEMS,
        ACTION_DUMP_BUS_PROPERTIES,
        ACTION_BUS_INTROSPECT,
} arg_action = ACTION_RUN;

/* Argument of --bus-introspect=; parse_argv() makes this point directly at optarg. */
static const char *arg_bus_introspect = NULL;
107
/* Those variables are initialized to 0 automatically, so we avoid uninitialized memory access. Real
 * defaults are assigned in reset_arguments() below. */
static char *arg_default_unit;                  /* set via systemd.unit=/rd.systemd.unit=, a runlevel word or --unit= */
static bool arg_system;                         /* true for --system, false for --user */
static bool arg_dump_core;                      /* consulted by crash() to decide whether to fork a core-dumping child */
static int arg_crash_chvt;                      /* crash() switches to this VT if the value is >= 0 */
static bool arg_crash_shell;                    /* crash() spawns /bin/sh if set */
static bool arg_crash_reboot;                   /* freeze_or_exit_or_reboot() reboots instead of freezing if set */
static char *arg_confirm_spawn;
static ShowStatus arg_show_status;
static StatusUnitFormat arg_status_unit_format;
static bool arg_switched_root;
static PagerFlags arg_pager_flags;
static bool arg_service_watchdogs;
static ExecOutput arg_default_std_output;
static ExecOutput arg_default_std_error;
static usec_t arg_default_restart_usec;
static usec_t arg_default_timeout_start_usec;
static usec_t arg_default_timeout_stop_usec;
static usec_t arg_default_timeout_abort_usec;
static bool arg_default_timeout_abort_set;      /* written by config_parse_default_timeout_abort() */
static usec_t arg_default_start_limit_interval;
static unsigned arg_default_start_limit_burst;
static usec_t arg_runtime_watchdog;
static usec_t arg_reboot_watchdog;
static usec_t arg_kexec_watchdog;
static char *arg_early_core_pattern;
static char *arg_watchdog_device;
static char **arg_default_environment;
static char **arg_manager_environment;
static struct rlimit *arg_default_rlimit[_RLIMIT_MAX];
static uint64_t arg_capability_bounding_set;
static bool arg_no_new_privs;
static nsec_t arg_timer_slack_nsec;
static usec_t arg_default_timer_accuracy_usec;
static Set* arg_syscall_archs;
static FILE* arg_serialization;
static int arg_default_cpu_accounting;          /* int, not bool: a negative value means "not configured", see set_manager_defaults() */
static bool arg_default_io_accounting;
static bool arg_default_ip_accounting;
static bool arg_default_blockio_accounting;
static bool arg_default_memory_accounting;
static bool arg_default_tasks_accounting;
static TasksMax arg_default_tasks_max;
static sd_id128_t arg_machine_id;               /* stored by set_machine_id() */
static EmergencyAction arg_cad_burst_action;
static OOMPolicy arg_default_oom_policy;
static CPUSet arg_cpu_affinity;
static NUMAPolicy arg_numa_policy;
static usec_t arg_clock_usec;                   /* parsed from systemd.clock_usec= on the kernel command line */
static void *arg_random_seed;                   /* base64-decoded payload of systemd.random_seed=, heap-allocated */
static size_t arg_random_seed_size;             /* size in bytes of the buffer arg_random_seed points to */
160
/* A copy of the original environment block */
static char **saved_env = NULL;

/* Forward declaration; as a static function it is defined later in this translation unit. */
static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
                               const struct rlimit *saved_rlimit_memlock);
166
167 static int manager_find_user_config_paths(char ***ret_files, char ***ret_dirs) {
168 _cleanup_free_ char *base = NULL;
169 _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
170 int r;
171
172 r = xdg_user_config_dir(&base, "/systemd");
173 if (r < 0)
174 return r;
175
176 r = strv_extendf(&files, "%s/user.conf", base);
177 if (r < 0)
178 return r;
179
180 r = strv_extend(&files, PKGSYSCONFDIR "/user.conf");
181 if (r < 0)
182 return r;
183
184 r = strv_consume(&dirs, TAKE_PTR(base));
185 if (r < 0)
186 return r;
187
188 r = strv_extend_strv(&dirs, CONF_PATHS_STRV("systemd"), false);
189 if (r < 0)
190 return r;
191
192 *ret_files = TAKE_PTR(files);
193 *ret_dirs = TAKE_PTR(dirs);
194 return 0;
195 }
196
197 _noreturn_ static void freeze_or_exit_or_reboot(void) {
198
199 /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
200 * the container manager, and thus inform it that something went wrong. */
201 if (detect_container() > 0) {
202 log_emergency("Exiting PID 1...");
203 _exit(EXIT_EXCEPTION);
204 }
205
206 if (arg_crash_reboot) {
207 log_notice("Rebooting in 10s...");
208 (void) sleep(10);
209
210 log_notice("Rebooting now...");
211 (void) reboot(RB_AUTOBOOT);
212 log_emergency_errno(errno, "Failed to reboot: %m");
213 }
214
215 log_emergency("Freezing execution.");
216 freeze();
217 }
218
/* Signal handler that install_crash_handler() registers for SIGNALS_CRASH_HANDLER. Never returns.
 *
 * Sequence:
 *   1. If we are not PID 1 the signal is simply raised again. If we are PID 1 and arg_dump_core is set,
 *      a child is cloned that restores the default disposition for 'sig', lifts RLIMIT_CORE and kills
 *      itself with 'sig'; the parent waits for it and logs whether a core was dumped.
 *   2. If arg_crash_chvt is >= 0, switch to that VT.
 *   3. If arg_crash_shell is set, wait 10s, spawn /bin/sh on the console and wait for it to exit.
 *   4. Hand over to freeze_or_exit_or_reboot(). */
_noreturn_ static void crash(int sig) {
        struct sigaction sa;
        pid_t pid;

        if (getpid_cached() != 1)
                /* Pass this on immediately, if this is not PID 1 */
                (void) raise(sig);
        else if (!arg_dump_core)
                log_emergency("Caught <%s>, not dumping core.", signal_to_string(sig));
        else {
                sa = (struct sigaction) {
                        .sa_handler = nop_signal_handler,
                        .sa_flags = SA_NOCLDSTOP|SA_RESTART,
                };

                /* We want to wait for the core process, hence let's enable SIGCHLD */
                (void) sigaction(SIGCHLD, &sa, NULL);

                pid = raw_clone(SIGCHLD);
                if (pid < 0)
                        log_emergency_errno(errno, "Caught <%s>, cannot fork for core dump: %m", signal_to_string(sig));
                else if (pid == 0) {
                        /* Child: this is the process that is supposed to dump core. */

                        /* Enable default signal handler for core dump */

                        sa = (struct sigaction) {
                                .sa_handler = SIG_DFL,
                        };
                        (void) sigaction(sig, &sa, NULL);

                        /* Don't limit the coredump size */
                        (void) setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY));

                        /* Just to be sure... */
                        (void) chdir("/");

                        /* Raise the signal again */
                        pid = raw_getpid();
                        (void) kill(pid, sig); /* raise() would kill the parent */

                        assert_not_reached("We shouldn't be here...");
                        _exit(EXIT_EXCEPTION);
                } else {
                        /* Parent: collect the child and report how it ended. */
                        siginfo_t status;
                        int r;

                        /* Order things nicely. */
                        r = wait_for_terminate(pid, &status);
                        if (r < 0)
                                log_emergency_errno(r, "Caught <%s>, waitpid() failed: %m", signal_to_string(sig));
                        else if (status.si_code != CLD_DUMPED) {
                                /* The child ended without dumping core: translate si_status either as an
                                 * exit status or as a signal number, depending on si_code. */
                                const char *s = status.si_code == CLD_EXITED
                                        ? exit_status_to_string(status.si_status, EXIT_STATUS_LIBC)
                                        : signal_to_string(status.si_status);

                                log_emergency("Caught <%s>, core dump failed (child "PID_FMT", code=%s, status=%i/%s).",
                                              signal_to_string(sig),
                                              pid,
                                              sigchld_code_to_string(status.si_code),
                                              status.si_status, strna(s));
                        } else
                                log_emergency("Caught <%s>, dumped core as pid "PID_FMT".",
                                              signal_to_string(sig), pid);
                }
        }

        if (arg_crash_chvt >= 0)
                (void) chvt(arg_crash_chvt);

        sa = (struct sigaction) {
                .sa_handler = SIG_IGN,
                .sa_flags = SA_NOCLDSTOP|SA_NOCLDWAIT|SA_RESTART,
        };

        /* Let the kernel reap children for us */
        (void) sigaction(SIGCHLD, &sa, NULL);

        if (arg_crash_shell) {
                log_notice("Executing crash shell in 10s...");
                (void) sleep(10);

                pid = raw_clone(SIGCHLD);
                if (pid < 0)
                        log_emergency_errno(errno, "Failed to fork off crash shell: %m");
                else if (pid == 0) {
                        /* Child: become a session leader, attach to the console and exec the shell. */
                        (void) setsid();
                        (void) make_console_stdio();
                        (void) rlimit_nofile_safe();
                        (void) execle("/bin/sh", "/bin/sh", NULL, environ);

                        /* Only reached if execle() failed. */
                        log_emergency_errno(errno, "execle() failed: %m");
                        _exit(EXIT_EXCEPTION);
                } else {
                        log_info("Spawned crash shell as PID "PID_FMT".", pid);
                        (void) wait_for_terminate(pid, NULL);
                }
        }

        freeze_or_exit_or_reboot();
}
318
319 static void install_crash_handler(void) {
320 static const struct sigaction sa = {
321 .sa_handler = crash,
322 .sa_flags = SA_NODEFER, /* So that we can raise the signal again from the signal handler */
323 };
324 int r;
325
326 /* We ignore the return value here, since, we don't mind if we cannot set up a crash handler */
327 r = sigaction_many(&sa, SIGNALS_CRASH_HANDLER);
328 if (r < 0)
329 log_debug_errno(r, "I had trouble setting up the crash handler, ignoring: %m");
330 }
331
332 static int console_setup(void) {
333 _cleanup_close_ int tty_fd = -1;
334 int r;
335
336 tty_fd = open_terminal("/dev/console", O_WRONLY|O_NOCTTY|O_CLOEXEC);
337 if (tty_fd < 0)
338 return log_error_errno(tty_fd, "Failed to open /dev/console: %m");
339
340 /* We don't want to force text mode. plymouth may be showing
341 * pictures already from initrd. */
342 r = reset_terminal_fd(tty_fd, false);
343 if (r < 0)
344 return log_error_errno(r, "Failed to reset /dev/console: %m");
345
346 return 0;
347 }
348
349 static int set_machine_id(const char *m) {
350 sd_id128_t t;
351 assert(m);
352
353 if (sd_id128_from_string(m, &t) < 0)
354 return -EINVAL;
355
356 if (sd_id128_is_null(t))
357 return -EINVAL;
358
359 arg_machine_id = t;
360 return 0;
361 }
362
363 static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
364 int r;
365
366 assert(key);
367
368 if (STR_IN_SET(key, "systemd.unit", "rd.systemd.unit")) {
369
370 if (proc_cmdline_value_missing(key, value))
371 return 0;
372
373 if (!unit_name_is_valid(value, UNIT_NAME_PLAIN|UNIT_NAME_INSTANCE))
374 log_warning("Unit name specified on %s= is not valid, ignoring: %s", key, value);
375 else if (in_initrd() == !!startswith(key, "rd."))
376 return free_and_strdup_warn(&arg_default_unit, value);
377
378 } else if (proc_cmdline_key_streq(key, "systemd.dump_core")) {
379
380 r = value ? parse_boolean(value) : true;
381 if (r < 0)
382 log_warning_errno(r, "Failed to parse dump core switch %s, ignoring: %m", value);
383 else
384 arg_dump_core = r;
385
386 } else if (proc_cmdline_key_streq(key, "systemd.early_core_pattern")) {
387
388 if (proc_cmdline_value_missing(key, value))
389 return 0;
390
391 if (path_is_absolute(value))
392 (void) parse_path_argument(value, false, &arg_early_core_pattern);
393 else
394 log_warning("Specified core pattern '%s' is not an absolute path, ignoring.", value);
395
396 } else if (proc_cmdline_key_streq(key, "systemd.crash_chvt")) {
397
398 if (!value)
399 arg_crash_chvt = 0; /* turn on */
400 else {
401 r = parse_crash_chvt(value, &arg_crash_chvt);
402 if (r < 0)
403 log_warning_errno(r, "Failed to parse crash chvt switch %s, ignoring: %m", value);
404 }
405
406 } else if (proc_cmdline_key_streq(key, "systemd.crash_shell")) {
407
408 r = value ? parse_boolean(value) : true;
409 if (r < 0)
410 log_warning_errno(r, "Failed to parse crash shell switch %s, ignoring: %m", value);
411 else
412 arg_crash_shell = r;
413
414 } else if (proc_cmdline_key_streq(key, "systemd.crash_reboot")) {
415
416 r = value ? parse_boolean(value) : true;
417 if (r < 0)
418 log_warning_errno(r, "Failed to parse crash reboot switch %s, ignoring: %m", value);
419 else
420 arg_crash_reboot = r;
421
422 } else if (proc_cmdline_key_streq(key, "systemd.confirm_spawn")) {
423 char *s;
424
425 r = parse_confirm_spawn(value, &s);
426 if (r < 0)
427 log_warning_errno(r, "Failed to parse confirm_spawn switch %s, ignoring: %m", value);
428 else
429 free_and_replace(arg_confirm_spawn, s);
430
431 } else if (proc_cmdline_key_streq(key, "systemd.service_watchdogs")) {
432
433 r = value ? parse_boolean(value) : true;
434 if (r < 0)
435 log_warning_errno(r, "Failed to parse service watchdog switch %s, ignoring: %m", value);
436 else
437 arg_service_watchdogs = r;
438
439 } else if (proc_cmdline_key_streq(key, "systemd.show_status")) {
440
441 if (value) {
442 r = parse_show_status(value, &arg_show_status);
443 if (r < 0)
444 log_warning_errno(r, "Failed to parse show status switch %s, ignoring: %m", value);
445 } else
446 arg_show_status = SHOW_STATUS_YES;
447
448 } else if (proc_cmdline_key_streq(key, "systemd.status_unit_format")) {
449
450 if (proc_cmdline_value_missing(key, value))
451 return 0;
452
453 r = status_unit_format_from_string(value);
454 if (r < 0)
455 log_warning_errno(r, "Failed to parse %s=%s, ignoring: %m", key, value);
456 else
457 arg_status_unit_format = r;
458
459 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_output")) {
460
461 if (proc_cmdline_value_missing(key, value))
462 return 0;
463
464 r = exec_output_from_string(value);
465 if (r < 0)
466 log_warning_errno(r, "Failed to parse default standard output switch %s, ignoring: %m", value);
467 else
468 arg_default_std_output = r;
469
470 } else if (proc_cmdline_key_streq(key, "systemd.default_standard_error")) {
471
472 if (proc_cmdline_value_missing(key, value))
473 return 0;
474
475 r = exec_output_from_string(value);
476 if (r < 0)
477 log_warning_errno(r, "Failed to parse default standard error switch %s, ignoring: %m", value);
478 else
479 arg_default_std_error = r;
480
481 } else if (streq(key, "systemd.setenv")) {
482
483 if (proc_cmdline_value_missing(key, value))
484 return 0;
485
486 if (!env_assignment_is_valid(value))
487 log_warning("Environment variable assignment '%s' is not valid. Ignoring.", value);
488 else {
489 r = strv_env_replace_strdup(&arg_default_environment, value);
490 if (r < 0)
491 return log_oom();
492 }
493
494 } else if (proc_cmdline_key_streq(key, "systemd.machine_id")) {
495
496 if (proc_cmdline_value_missing(key, value))
497 return 0;
498
499 r = set_machine_id(value);
500 if (r < 0)
501 log_warning_errno(r, "MachineID '%s' is not valid, ignoring: %m", value);
502
503 } else if (proc_cmdline_key_streq(key, "systemd.default_timeout_start_sec")) {
504
505 if (proc_cmdline_value_missing(key, value))
506 return 0;
507
508 r = parse_sec(value, &arg_default_timeout_start_usec);
509 if (r < 0)
510 log_warning_errno(r, "Failed to parse default start timeout '%s', ignoring: %m", value);
511
512 if (arg_default_timeout_start_usec <= 0)
513 arg_default_timeout_start_usec = USEC_INFINITY;
514
515 } else if (proc_cmdline_key_streq(key, "systemd.cpu_affinity")) {
516
517 if (proc_cmdline_value_missing(key, value))
518 return 0;
519
520 r = parse_cpu_set(value, &arg_cpu_affinity);
521 if (r < 0)
522 log_warning_errno(r, "Failed to parse CPU affinity mask '%s', ignoring: %m", value);
523
524 } else if (proc_cmdline_key_streq(key, "systemd.watchdog_device")) {
525
526 if (proc_cmdline_value_missing(key, value))
527 return 0;
528
529 (void) parse_path_argument(value, false, &arg_watchdog_device);
530
531 } else if (proc_cmdline_key_streq(key, "systemd.clock_usec")) {
532
533 if (proc_cmdline_value_missing(key, value))
534 return 0;
535
536 r = safe_atou64(value, &arg_clock_usec);
537 if (r < 0)
538 log_warning_errno(r, "Failed to parse systemd.clock_usec= argument, ignoring: %s", value);
539
540 } else if (proc_cmdline_key_streq(key, "systemd.random_seed")) {
541 void *p;
542 size_t sz;
543
544 if (proc_cmdline_value_missing(key, value))
545 return 0;
546
547 r = unbase64mem(value, SIZE_MAX, &p, &sz);
548 if (r < 0)
549 log_warning_errno(r, "Failed to parse systemd.random_seed= argument, ignoring: %s", value);
550
551 free(arg_random_seed);
552 arg_random_seed = sz > 0 ? p : mfree(p);
553 arg_random_seed_size = sz;
554
555 } else if (streq(key, "quiet") && !value) {
556
557 if (arg_show_status == _SHOW_STATUS_INVALID)
558 arg_show_status = SHOW_STATUS_ERROR;
559
560 } else if (streq(key, "debug") && !value) {
561
562 /* Note that log_parse_environment() handles 'debug'
563 * too, and sets the log level to LOG_DEBUG. */
564
565 if (detect_container() > 0)
566 log_set_target(LOG_TARGET_CONSOLE);
567
568 } else if (!value) {
569 const char *target;
570
571 /* Compatible with SysV, but supported independently even if SysV compatibility is disabled. */
572 target = runlevel_to_target(key);
573 if (target)
574 return free_and_strdup_warn(&arg_default_unit, target);
575 }
576
577 return 0;
578 }
579
/* Defines a configuration-file parser callback called 'name' that feeds the right-hand side of the
 * assignment to 'func', a function taking the value string and returning a negative error on failure.
 * 'descr' is a string literal naming the setting in the warning that is logged when 'func' fails.
 *
 * The generated callback always returns 0, i.e. an invalid value is logged but never aborts parsing.
 * Note the explicit space between 'descr' and the quoted value in the format string: the adjacent string
 * literals are concatenated verbatim. */
#define DEFINE_SETTER(name, func, descr)                              \
        static int name(const char *unit,                             \
                        const char *filename,                         \
                        unsigned line,                                \
                        const char *section,                          \
                        unsigned section_line,                        \
                        const char *lvalue,                           \
                        int ltype,                                    \
                        const char *rvalue,                           \
                        void *data,                                   \
                        void *userdata) {                             \
                                                                      \
                int r;                                                \
                                                                      \
                assert(filename);                                     \
                assert(lvalue);                                       \
                assert(rvalue);                                       \
                                                                      \
                r = func(rvalue);                                     \
                if (r < 0)                                            \
                        log_syntax(unit, LOG_ERR, filename, line, r,  \
                                   "Invalid " descr " '%s': %m",      \
                                   rvalue);                           \
                                                                      \
                return 0;                                             \
        }
606
/* Parser callbacks for the logging-related settings; each one forwards the configured value to the
 * named log_*_from_string() function. They are referenced from the table in parse_config_file(). */
DEFINE_SETTER(config_parse_level2, log_set_max_level_from_string, "log level");
DEFINE_SETTER(config_parse_target, log_set_target_from_string, "target");
DEFINE_SETTER(config_parse_color, log_show_color_from_string, "color");
DEFINE_SETTER(config_parse_location, log_show_location_from_string, "location");
DEFINE_SETTER(config_parse_time, log_show_time_from_string, "time");
612
613 static int config_parse_default_timeout_abort(
614 const char *unit,
615 const char *filename,
616 unsigned line,
617 const char *section,
618 unsigned section_line,
619 const char *lvalue,
620 int ltype,
621 const char *rvalue,
622 void *data,
623 void *userdata) {
624 int r;
625
626 r = config_parse_timeout_abort(unit, filename, line, section, section_line, lvalue, ltype, rvalue,
627 &arg_default_timeout_abort_usec, userdata);
628 if (r >= 0)
629 arg_default_timeout_abort_set = r;
630 return 0;
631 }
632
/* Reads the manager configuration file(s) and their drop-ins into the arg_* variables.
 *
 * For the system instance (arg_system set) 'files' and 'dirs' stay NULL, so the fallbacks
 * PKGSYSCONFDIR "/system.conf" and CONF_PATHS_STRV("systemd") are used with the "system.conf.d" drop-in
 * suffix. For a user instance the paths come from manager_find_user_config_paths() and the suffix is
 * "user.conf.d".
 *
 * Returns 0 on success. The only failure reported to the caller is the inability to determine the user
 * configuration paths; the result of config_parse_many() itself is deliberately discarded. */
static int parse_config_file(void) {
        /* Maps each key of the [Manager] section to its parser callback, an optional ltype argument and
         * the variable that receives the parsed value. */
        const ConfigTableItem items[] = {
                { "Manager", "LogLevel", config_parse_level2, 0, NULL },
                { "Manager", "LogTarget", config_parse_target, 0, NULL },
                { "Manager", "LogColor", config_parse_color, 0, NULL },
                { "Manager", "LogLocation", config_parse_location, 0, NULL },
                { "Manager", "LogTime", config_parse_time, 0, NULL },
                { "Manager", "DumpCore", config_parse_bool, 0, &arg_dump_core },
                { "Manager", "CrashChVT", /* legacy */ config_parse_crash_chvt, 0, &arg_crash_chvt },
                { "Manager", "CrashChangeVT", config_parse_crash_chvt, 0, &arg_crash_chvt },
                { "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
                { "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
                { "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
                { "Manager", "StatusUnitFormat", config_parse_status_unit_format, 0, &arg_status_unit_format },
                { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
                { "Manager", "NUMAPolicy", config_parse_numa_policy, 0, &arg_numa_policy.type },
                { "Manager", "NUMAMask", config_parse_numa_mask, 0, &arg_numa_policy },
                { "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
                { "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog },
                { "Manager", "RebootWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog },
                { "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_reboot_watchdog }, /* obsolete alias */
                { "Manager", "KExecWatchdogSec", config_parse_sec, 0, &arg_kexec_watchdog },
                { "Manager", "WatchdogDevice", config_parse_path, 0, &arg_watchdog_device },
                { "Manager", "CapabilityBoundingSet", config_parse_capability_set, 0, &arg_capability_bounding_set },
                { "Manager", "NoNewPrivileges", config_parse_bool, 0, &arg_no_new_privs },
#if HAVE_SECCOMP
                { "Manager", "SystemCallArchitectures", config_parse_syscall_archs, 0, &arg_syscall_archs },
#endif
                { "Manager", "TimerSlackNSec", config_parse_nsec, 0, &arg_timer_slack_nsec },
                { "Manager", "DefaultTimerAccuracySec", config_parse_sec, 0, &arg_default_timer_accuracy_usec },
                { "Manager", "DefaultStandardOutput", config_parse_output_restricted, 0, &arg_default_std_output },
                { "Manager", "DefaultStandardError", config_parse_output_restricted, 0, &arg_default_std_error },
                { "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec },
                { "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec },
                { "Manager", "DefaultTimeoutAbortSec", config_parse_default_timeout_abort, 0, NULL },
                { "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec },
                { "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */
                { "Manager", "DefaultStartLimitIntervalSec", config_parse_sec, 0, &arg_default_start_limit_interval },
                { "Manager", "DefaultStartLimitBurst", config_parse_unsigned, 0, &arg_default_start_limit_burst },
                { "Manager", "DefaultEnvironment", config_parse_environ, 0, &arg_default_environment },
                { "Manager", "ManagerEnvironment", config_parse_environ, 0, &arg_manager_environment },
                /* For the rlimit entries the ltype column carries the RLIMIT_* index into arg_default_rlimit[]. */
                { "Manager", "DefaultLimitCPU", config_parse_rlimit, RLIMIT_CPU, arg_default_rlimit },
                { "Manager", "DefaultLimitFSIZE", config_parse_rlimit, RLIMIT_FSIZE, arg_default_rlimit },
                { "Manager", "DefaultLimitDATA", config_parse_rlimit, RLIMIT_DATA, arg_default_rlimit },
                { "Manager", "DefaultLimitSTACK", config_parse_rlimit, RLIMIT_STACK, arg_default_rlimit },
                { "Manager", "DefaultLimitCORE", config_parse_rlimit, RLIMIT_CORE, arg_default_rlimit },
                { "Manager", "DefaultLimitRSS", config_parse_rlimit, RLIMIT_RSS, arg_default_rlimit },
                { "Manager", "DefaultLimitNOFILE", config_parse_rlimit, RLIMIT_NOFILE, arg_default_rlimit },
                { "Manager", "DefaultLimitAS", config_parse_rlimit, RLIMIT_AS, arg_default_rlimit },
                { "Manager", "DefaultLimitNPROC", config_parse_rlimit, RLIMIT_NPROC, arg_default_rlimit },
                { "Manager", "DefaultLimitMEMLOCK", config_parse_rlimit, RLIMIT_MEMLOCK, arg_default_rlimit },
                { "Manager", "DefaultLimitLOCKS", config_parse_rlimit, RLIMIT_LOCKS, arg_default_rlimit },
                { "Manager", "DefaultLimitSIGPENDING", config_parse_rlimit, RLIMIT_SIGPENDING, arg_default_rlimit },
                { "Manager", "DefaultLimitMSGQUEUE", config_parse_rlimit, RLIMIT_MSGQUEUE, arg_default_rlimit },
                { "Manager", "DefaultLimitNICE", config_parse_rlimit, RLIMIT_NICE, arg_default_rlimit },
                { "Manager", "DefaultLimitRTPRIO", config_parse_rlimit, RLIMIT_RTPRIO, arg_default_rlimit },
                { "Manager", "DefaultLimitRTTIME", config_parse_rlimit, RLIMIT_RTTIME, arg_default_rlimit },
                { "Manager", "DefaultCPUAccounting", config_parse_tristate, 0, &arg_default_cpu_accounting },
                { "Manager", "DefaultIOAccounting", config_parse_bool, 0, &arg_default_io_accounting },
                { "Manager", "DefaultIPAccounting", config_parse_bool, 0, &arg_default_ip_accounting },
                { "Manager", "DefaultBlockIOAccounting", config_parse_bool, 0, &arg_default_blockio_accounting },
                { "Manager", "DefaultMemoryAccounting", config_parse_bool, 0, &arg_default_memory_accounting },
                { "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
                { "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
                { "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action },
                { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
                {}
        };

        _cleanup_strv_free_ char **files = NULL, **dirs = NULL;
        const char *suffix;
        int r;

        if (arg_system)
                suffix = "system.conf.d";
        else {
                r = manager_find_user_config_paths(&files, &dirs);
                if (r < 0)
                        return log_error_errno(r, "Failed to determine config file paths: %m");

                suffix = "user.conf.d";
        }

        /* Best effort: the return value is intentionally ignored. */
        (void) config_parse_many(
                        (const char* const*) (files ?: STRV_MAKE(PKGSYSCONFDIR "/system.conf")),
                        (const char* const*) (dirs ?: CONF_PATHS_STRV("systemd")),
                        suffix,
                        "Manager\0",
                        config_item_table_lookup, items,
                        CONFIG_PARSE_WARN,
                        NULL,
                        NULL);

        /* Traditionally "0" was used to turn off the default unit timeouts. Fix this up so that we use
         * USEC_INFINITY like everywhere else. */
        if (arg_default_timeout_start_usec <= 0)
                arg_default_timeout_start_usec = USEC_INFINITY;
        if (arg_default_timeout_stop_usec <= 0)
                arg_default_timeout_stop_usec = USEC_INFINITY;

        return 0;
}
735
736 static void set_manager_defaults(Manager *m) {
737
738 assert(m);
739
740 /* Propagates the various default unit property settings into the manager object, i.e. properties that do not
741 * affect the manager itself, but are just what newly allocated units will have set if they haven't set
742 * anything else. (Also see set_manager_settings() for the settings that affect the manager's own behaviour) */
743
744 m->default_timer_accuracy_usec = arg_default_timer_accuracy_usec;
745 m->default_std_output = arg_default_std_output;
746 m->default_std_error = arg_default_std_error;
747 m->default_timeout_start_usec = arg_default_timeout_start_usec;
748 m->default_timeout_stop_usec = arg_default_timeout_stop_usec;
749 m->default_timeout_abort_usec = arg_default_timeout_abort_usec;
750 m->default_timeout_abort_set = arg_default_timeout_abort_set;
751 m->default_restart_usec = arg_default_restart_usec;
752 m->default_start_limit_interval = arg_default_start_limit_interval;
753 m->default_start_limit_burst = arg_default_start_limit_burst;
754
755 /* On 4.15+ with unified hierarchy, CPU accounting is essentially free as it doesn't require the CPU
756 * controller to be enabled, so the default is to enable it unless we got told otherwise. */
757 if (arg_default_cpu_accounting >= 0)
758 m->default_cpu_accounting = arg_default_cpu_accounting;
759 else
760 m->default_cpu_accounting = cpu_accounting_is_cheap();
761
762 m->default_io_accounting = arg_default_io_accounting;
763 m->default_ip_accounting = arg_default_ip_accounting;
764 m->default_blockio_accounting = arg_default_blockio_accounting;
765 m->default_memory_accounting = arg_default_memory_accounting;
766 m->default_tasks_accounting = arg_default_tasks_accounting;
767 m->default_tasks_max = arg_default_tasks_max;
768 m->default_oom_policy = arg_default_oom_policy;
769
770 (void) manager_set_default_rlimits(m, arg_default_rlimit);
771
772 (void) manager_default_environment(m);
773 (void) manager_transient_environment_add(m, arg_default_environment);
774 }
775
776 static void set_manager_settings(Manager *m) {
777
778 assert(m);
779
780 /* Propagates the various manager settings into the manager object, i.e. properties that
781 * effect the manager itself (as opposed to just being inherited into newly allocated
782 * units, see set_manager_defaults() above). */
783
784 m->confirm_spawn = arg_confirm_spawn;
785 m->service_watchdogs = arg_service_watchdogs;
786 m->cad_burst_action = arg_cad_burst_action;
787
788 manager_set_watchdog(m, WATCHDOG_RUNTIME, arg_runtime_watchdog);
789 manager_set_watchdog(m, WATCHDOG_REBOOT, arg_reboot_watchdog);
790 manager_set_watchdog(m, WATCHDOG_KEXEC, arg_kexec_watchdog);
791
792 manager_set_show_status(m, arg_show_status, "commandline");
793 m->status_unit_format = arg_status_unit_format;
794 }
795
796 static int parse_argv(int argc, char *argv[]) {
797 enum {
798 ARG_LOG_LEVEL = 0x100,
799 ARG_LOG_TARGET,
800 ARG_LOG_COLOR,
801 ARG_LOG_LOCATION,
802 ARG_LOG_TIME,
803 ARG_UNIT,
804 ARG_SYSTEM,
805 ARG_USER,
806 ARG_TEST,
807 ARG_NO_PAGER,
808 ARG_VERSION,
809 ARG_DUMP_CONFIGURATION_ITEMS,
810 ARG_DUMP_BUS_PROPERTIES,
811 ARG_BUS_INTROSPECT,
812 ARG_DUMP_CORE,
813 ARG_CRASH_CHVT,
814 ARG_CRASH_SHELL,
815 ARG_CRASH_REBOOT,
816 ARG_CONFIRM_SPAWN,
817 ARG_SHOW_STATUS,
818 ARG_DESERIALIZE,
819 ARG_SWITCHED_ROOT,
820 ARG_DEFAULT_STD_OUTPUT,
821 ARG_DEFAULT_STD_ERROR,
822 ARG_MACHINE_ID,
823 ARG_SERVICE_WATCHDOGS,
824 };
825
826 static const struct option options[] = {
827 { "log-level", required_argument, NULL, ARG_LOG_LEVEL },
828 { "log-target", required_argument, NULL, ARG_LOG_TARGET },
829 { "log-color", optional_argument, NULL, ARG_LOG_COLOR },
830 { "log-location", optional_argument, NULL, ARG_LOG_LOCATION },
831 { "log-time", optional_argument, NULL, ARG_LOG_TIME },
832 { "unit", required_argument, NULL, ARG_UNIT },
833 { "system", no_argument, NULL, ARG_SYSTEM },
834 { "user", no_argument, NULL, ARG_USER },
835 { "test", no_argument, NULL, ARG_TEST },
836 { "no-pager", no_argument, NULL, ARG_NO_PAGER },
837 { "help", no_argument, NULL, 'h' },
838 { "version", no_argument, NULL, ARG_VERSION },
839 { "dump-configuration-items", no_argument, NULL, ARG_DUMP_CONFIGURATION_ITEMS },
840 { "dump-bus-properties", no_argument, NULL, ARG_DUMP_BUS_PROPERTIES },
841 { "bus-introspect", required_argument, NULL, ARG_BUS_INTROSPECT },
842 { "dump-core", optional_argument, NULL, ARG_DUMP_CORE },
843 { "crash-chvt", required_argument, NULL, ARG_CRASH_CHVT },
844 { "crash-shell", optional_argument, NULL, ARG_CRASH_SHELL },
845 { "crash-reboot", optional_argument, NULL, ARG_CRASH_REBOOT },
846 { "confirm-spawn", optional_argument, NULL, ARG_CONFIRM_SPAWN },
847 { "show-status", optional_argument, NULL, ARG_SHOW_STATUS },
848 { "deserialize", required_argument, NULL, ARG_DESERIALIZE },
849 { "switched-root", no_argument, NULL, ARG_SWITCHED_ROOT },
850 { "default-standard-output", required_argument, NULL, ARG_DEFAULT_STD_OUTPUT, },
851 { "default-standard-error", required_argument, NULL, ARG_DEFAULT_STD_ERROR, },
852 { "machine-id", required_argument, NULL, ARG_MACHINE_ID },
853 { "service-watchdogs", required_argument, NULL, ARG_SERVICE_WATCHDOGS },
854 {}
855 };
856
857 int c, r;
858 bool user_arg_seen = false;
859
860 assert(argc >= 1);
861 assert(argv);
862
863 if (getpid_cached() == 1)
864 opterr = 0;
865
866 while ((c = getopt_long(argc, argv, "hDbsz:", options, NULL)) >= 0)
867
868 switch (c) {
869
870 case ARG_LOG_LEVEL:
871 r = log_set_max_level_from_string(optarg);
872 if (r < 0)
873 return log_error_errno(r, "Failed to parse log level \"%s\": %m", optarg);
874
875 break;
876
877 case ARG_LOG_TARGET:
878 r = log_set_target_from_string(optarg);
879 if (r < 0)
880 return log_error_errno(r, "Failed to parse log target \"%s\": %m", optarg);
881
882 break;
883
884 case ARG_LOG_COLOR:
885
886 if (optarg) {
887 r = log_show_color_from_string(optarg);
888 if (r < 0)
889 return log_error_errno(r, "Failed to parse log color setting \"%s\": %m",
890 optarg);
891 } else
892 log_show_color(true);
893
894 break;
895
896 case ARG_LOG_LOCATION:
897 if (optarg) {
898 r = log_show_location_from_string(optarg);
899 if (r < 0)
900 return log_error_errno(r, "Failed to parse log location setting \"%s\": %m",
901 optarg);
902 } else
903 log_show_location(true);
904
905 break;
906
907 case ARG_LOG_TIME:
908
909 if (optarg) {
910 r = log_show_time_from_string(optarg);
911 if (r < 0)
912 return log_error_errno(r, "Failed to parse log time setting \"%s\": %m",
913 optarg);
914 } else
915 log_show_time(true);
916
917 break;
918
919 case ARG_DEFAULT_STD_OUTPUT:
920 r = exec_output_from_string(optarg);
921 if (r < 0)
922 return log_error_errno(r, "Failed to parse default standard output setting \"%s\": %m",
923 optarg);
924 arg_default_std_output = r;
925 break;
926
927 case ARG_DEFAULT_STD_ERROR:
928 r = exec_output_from_string(optarg);
929 if (r < 0)
930 return log_error_errno(r, "Failed to parse default standard error output setting \"%s\": %m",
931 optarg);
932 arg_default_std_error = r;
933 break;
934
935 case ARG_UNIT:
936 r = free_and_strdup(&arg_default_unit, optarg);
937 if (r < 0)
938 return log_error_errno(r, "Failed to set default unit \"%s\": %m", optarg);
939
940 break;
941
942 case ARG_SYSTEM:
943 arg_system = true;
944 break;
945
946 case ARG_USER:
947 arg_system = false;
948 user_arg_seen = true;
949 break;
950
951 case ARG_TEST:
952 arg_action = ACTION_TEST;
953 break;
954
955 case ARG_NO_PAGER:
956 arg_pager_flags |= PAGER_DISABLE;
957 break;
958
959 case ARG_VERSION:
960 arg_action = ACTION_VERSION;
961 break;
962
963 case ARG_DUMP_CONFIGURATION_ITEMS:
964 arg_action = ACTION_DUMP_CONFIGURATION_ITEMS;
965 break;
966
967 case ARG_DUMP_BUS_PROPERTIES:
968 arg_action = ACTION_DUMP_BUS_PROPERTIES;
969 break;
970
971 case ARG_BUS_INTROSPECT:
972 arg_bus_introspect = optarg;
973 arg_action = ACTION_BUS_INTROSPECT;
974 break;
975
976 case ARG_DUMP_CORE:
977 r = parse_boolean_argument("--dump-core", optarg, &arg_dump_core);
978 if (r < 0)
979 return r;
980 break;
981
982 case ARG_CRASH_CHVT:
983 r = parse_crash_chvt(optarg, &arg_crash_chvt);
984 if (r < 0)
985 return log_error_errno(r, "Failed to parse crash virtual terminal index: \"%s\": %m",
986 optarg);
987 break;
988
989 case ARG_CRASH_SHELL:
990 r = parse_boolean_argument("--crash-shell", optarg, &arg_crash_shell);
991 if (r < 0)
992 return r;
993 break;
994
995 case ARG_CRASH_REBOOT:
996 r = parse_boolean_argument("--crash-reboot", optarg, &arg_crash_reboot);
997 if (r < 0)
998 return r;
999 break;
1000
1001 case ARG_CONFIRM_SPAWN:
1002 arg_confirm_spawn = mfree(arg_confirm_spawn);
1003
1004 r = parse_confirm_spawn(optarg, &arg_confirm_spawn);
1005 if (r < 0)
1006 return log_error_errno(r, "Failed to parse confirm spawn option: \"%s\": %m",
1007 optarg);
1008 break;
1009
1010 case ARG_SERVICE_WATCHDOGS:
1011 r = parse_boolean_argument("--service-watchdogs=", optarg, &arg_service_watchdogs);
1012 if (r < 0)
1013 return r;
1014 break;
1015
1016 case ARG_SHOW_STATUS:
1017 if (optarg) {
1018 r = parse_show_status(optarg, &arg_show_status);
1019 if (r < 0)
1020 return log_error_errno(r, "Failed to parse show status boolean: \"%s\": %m",
1021 optarg);
1022 } else
1023 arg_show_status = SHOW_STATUS_YES;
1024 break;
1025
1026 case ARG_DESERIALIZE: {
1027 int fd;
1028 FILE *f;
1029
1030 r = safe_atoi(optarg, &fd);
1031 if (r < 0)
1032 log_error_errno(r, "Failed to parse deserialize option \"%s\": %m", optarg);
1033 if (fd < 0)
1034 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1035 "Invalid deserialize fd: %d",
1036 fd);
1037
1038 (void) fd_cloexec(fd, true);
1039
1040 f = fdopen(fd, "r");
1041 if (!f)
1042 return log_error_errno(errno, "Failed to open serialization fd %d: %m", fd);
1043
1044 safe_fclose(arg_serialization);
1045 arg_serialization = f;
1046
1047 break;
1048 }
1049
1050 case ARG_SWITCHED_ROOT:
1051 arg_switched_root = true;
1052 break;
1053
1054 case ARG_MACHINE_ID:
1055 r = set_machine_id(optarg);
1056 if (r < 0)
1057 return log_error_errno(r, "MachineID '%s' is not valid: %m", optarg);
1058 break;
1059
1060 case 'h':
1061 arg_action = ACTION_HELP;
1062 break;
1063
1064 case 'D':
1065 log_set_max_level(LOG_DEBUG);
1066 break;
1067
1068 case 'b':
1069 case 's':
1070 case 'z':
1071 /* Just to eat away the sysvinit kernel cmdline args that we'll parse in
1072 * parse_proc_cmdline_item() or ignore, without any getopt() error messages.
1073 */
1074 case '?':
1075 if (getpid_cached() != 1)
1076 return -EINVAL;
1077 else
1078 return 0;
1079
1080 default:
1081 assert_not_reached("Unhandled option code.");
1082 }
1083
1084 if (optind < argc && getpid_cached() != 1)
1085 /* Hmm, when we aren't run as init system let's complain about excess arguments */
1086 return log_error_errno(SYNTHETIC_ERRNO(EINVAL), "Excess arguments.");
1087
1088 if (arg_action == ACTION_RUN && !arg_system && !user_arg_seen)
1089 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1090 "Explicit --user argument required to run as user manager.");
1091
1092 return 0;
1093 }
1094
1095 static int help(void) {
1096 _cleanup_free_ char *link = NULL;
1097 int r;
1098
1099 r = terminal_urlify_man("systemd", "1", &link);
1100 if (r < 0)
1101 return log_oom();
1102
1103 printf("%s [OPTIONS...]\n\n"
1104 "%sStarts and monitors system and user services.%s\n\n"
1105 "This program takes no positional arguments.\n\n"
1106 "%sOptions%s:\n"
1107 " -h --help Show this help\n"
1108 " --version Show version\n"
1109 " --test Determine initial transaction, dump it and exit\n"
1110 " --system In combination with --test: operate as system service manager\n"
1111 " --user In combination with --test: operate as per-user service manager\n"
1112 " --no-pager Do not pipe output into a pager\n"
1113 " --dump-configuration-items Dump understood unit configuration items\n"
1114 " --dump-bus-properties Dump exposed bus properties\n"
1115 " --bus-introspect=PATH Write XML introspection data\n"
1116 " --unit=UNIT Set default unit\n"
1117 " --dump-core[=BOOL] Dump core on crash\n"
1118 " --crash-vt=NR Change to specified VT on crash\n"
1119 " --crash-reboot[=BOOL] Reboot on crash\n"
1120 " --crash-shell[=BOOL] Run shell on crash\n"
1121 " --confirm-spawn[=BOOL] Ask for confirmation when spawning processes\n"
1122 " --show-status[=BOOL] Show status updates on the console during bootup\n"
1123 " --log-target=TARGET Set log target (console, journal, kmsg, journal-or-kmsg, null)\n"
1124 " --log-level=LEVEL Set log level (debug, info, notice, warning, err, crit, alert, emerg)\n"
1125 " --log-color[=BOOL] Highlight important log messages\n"
1126 " --log-location[=BOOL] Include code location in log messages\n"
1127 " --log-time[=BOOL] Prefix log messages with current time\n"
1128 " --default-standard-output= Set default standard output for services\n"
1129 " --default-standard-error= Set default standard error output for services\n"
1130 "\nSee the %s for details.\n",
1131 program_invocation_short_name,
1132 ansi_highlight(),
1133 ansi_normal(),
1134 ansi_underline(),
1135 ansi_normal(),
1136 link);
1137
1138 return 0;
1139 }
1140
1141 static int prepare_reexecute(
1142 Manager *m,
1143 FILE **ret_f,
1144 FDSet **ret_fds,
1145 bool switching_root) {
1146
1147 _cleanup_fdset_free_ FDSet *fds = NULL;
1148 _cleanup_fclose_ FILE *f = NULL;
1149 int r;
1150
1151 assert(m);
1152 assert(ret_f);
1153 assert(ret_fds);
1154
1155 r = manager_open_serialization(m, &f);
1156 if (r < 0)
1157 return log_error_errno(r, "Failed to create serialization file: %m");
1158
1159 /* Make sure nothing is really destructed when we shut down */
1160 m->n_reloading++;
1161 bus_manager_send_reloading(m, true);
1162
1163 fds = fdset_new();
1164 if (!fds)
1165 return log_oom();
1166
1167 r = manager_serialize(m, f, fds, switching_root);
1168 if (r < 0)
1169 return r;
1170
1171 if (fseeko(f, 0, SEEK_SET) == (off_t) -1)
1172 return log_error_errno(errno, "Failed to rewind serialization fd: %m");
1173
1174 r = fd_cloexec(fileno(f), false);
1175 if (r < 0)
1176 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization: %m");
1177
1178 r = fdset_cloexec(fds, false);
1179 if (r < 0)
1180 return log_error_errno(r, "Failed to disable O_CLOEXEC for serialization fds: %m");
1181
1182 *ret_f = TAKE_PTR(f);
1183 *ret_fds = TAKE_PTR(fds);
1184
1185 return 0;
1186 }
1187
/* Raises the fs.file-max and fs.nr_open sysctls as far as the kernel permits. Each half is only
 * compiled in if the matching BUMP_PROC_SYS_FS_* build option is set. All failures are logged and
 * otherwise ignored. */
static void bump_file_max_and_nr_open(void) {

        /* Let's bump fs.file-max and fs.nr_open to their respective maximums. On current kernels large
         * numbers of file descriptors are no longer a performance problem and their memory is properly
         * tracked by memcg, thus counting them and limiting them in another two layers of limits is
         * unnecessary and just complicates things. This function hence turns off 2 of the 4 levels of
         * limits on file descriptors, and makes RLIMIT_NOFILE (soft + hard) the only ones that really
         * matter. */

#if BUMP_PROC_SYS_FS_FILE_MAX || BUMP_PROC_SYS_FS_NR_OPEN
        int r;
#endif

#if BUMP_PROC_SYS_FS_FILE_MAX
        /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously things
         * were different but the operation would fail silently.) */
        r = sysctl_writef("fs/file-max", "%li\n", LONG_MAX);
        if (r < 0)
                log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.file-max, ignoring: %m");
#endif

#if BUMP_PROC_SYS_FS_NR_OPEN
        /* The kernel enforces maximum and minimum values on fs.nr_open, but we don't really know what
         * they are. The expression by which the maximum is determined depends on the architecture, and
         * is something we don't really want to copy to userspace, as it depends on implementation
         * details of the kernel. Since the kernel doesn't expose the maximum value to us, we can only
         * try and hope. Hence, start with INT_MAX, and keep halving the value (only the -EINVAL path
         * reaches the loop's step expression) until we find one that works. */

        for (int candidate = INT_MAX;; candidate /= 2) {
                int current;

                /* Round down to next multiple of the pointer size */
                candidate &= ~(__SIZEOF_POINTER__ - 1);
                if (candidate < 1024) {
                        log_warning("Can't bump fs.nr_open, value too small.");
                        break;
                }

                current = read_nr_open();
                if (current < 0) {
                        log_error_errno(current, "Failed to read fs.nr_open: %m");
                        break;
                }
                if (current >= candidate) { /* Already larger */
                        log_debug("Skipping bump, value is already larger.");
                        break;
                }

                r = sysctl_writef("fs/nr_open", "%i\n", candidate);
                if (r == -EINVAL) {
                        /* Value rejected as out of range: retry with half of it */
                        log_debug("Couldn't write fs.nr_open as %i, halving it.", candidate);
                        continue;
                }
                if (r < 0) {
                        log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r, "Failed to bump fs.nr_open, ignoring: %m");
                        break;
                }

                log_debug("Successfully bumped fs.nr_open to %i", candidate);
                break;
        }
#endif
}
1253
/* Raises our own RLIMIT_NOFILE (soft and hard) to the value reported by read_nr_open(), never going
 * below the limits in *saved_rlimit.
 *
 * Returns 0 if nothing needed changing or the limit was set, otherwise the result of
 * log_warning_errno() for the failed setrlimit_closest() call. */
static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
        struct rlimit wanted;
        bool needs_bump;
        rlim_t kernel_max;
        int r;

        /* Get the underlying absolute limit the kernel enforces */
        kernel_max = (rlim_t) read_nr_open();

        /* Calculate the new limits to use for us. Never lower from what we inherited. */
        wanted = (struct rlimit) {
                .rlim_cur = MAX(kernel_max, saved_rlimit->rlim_cur),
                .rlim_max = MAX(kernel_max, saved_rlimit->rlim_max),
        };

        /* Shortcut if nothing changes. */
        needs_bump = saved_rlimit->rlim_max < wanted.rlim_max ||
                     saved_rlimit->rlim_cur < wanted.rlim_cur;
        if (!needs_bump) {
                log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
                return 0;
        }

        /* Bump up the resource limit for ourselves substantially, all the way to the maximum the
         * kernel allows, for both hard and soft. */
        r = setrlimit_closest(RLIMIT_NOFILE, &wanted);
        if (r < 0)
                return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");

        return 0;
}
1282
1283 static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
1284 struct rlimit new_rlimit;
1285 uint64_t mm;
1286 int r;
1287
1288 /* BPF_MAP_TYPE_LPM_TRIE bpf maps are charged against RLIMIT_MEMLOCK, even if we have CAP_IPC_LOCK which should
1289 * normally disable such checks. We need them to implement IPAddressAllow= and IPAddressDeny=, hence let's bump
1290 * the value high enough for our user. */
1291
1292 /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
1293 * must be unsigned, hence this is a given, but let's make this clear here. */
1294 assert_cc(RLIM_INFINITY > 0);
1295
1296 mm = physical_memory_scale(1, 8); /* Let's scale how much we allow to be locked by the amount of physical
1297 * RAM. We allow an eighth to be locked by us, just to pick a value. */
1298
1299 new_rlimit = (struct rlimit) {
1300 .rlim_cur = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur, mm),
1301 .rlim_max = MAX3(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max, mm),
1302 };
1303
1304 if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
1305 saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
1306 log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
1307 return 0;
1308 }
1309
1310 r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
1311 if (r < 0)
1312 return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
1313
1314 return 0;
1315 }
1316
/* Logs a warning if /usr looks like it is neither part of the root file system nor mounted yet,
 * which is detected by /usr being an empty directory. */
static void test_usr(void) {

        /* Check that /usr is either on the same file system as / or mounted already. A non-empty /usr,
         * or a failure to check, means there is nothing to complain about. */
        if (dir_is_empty("/usr") > 0)
                log_warning("/usr appears to be on its own filesystem and is not already mounted. This is not a supported setup. "
                            "Some things will probably break (sometimes even silently) in mysterious ways. "
                            "Consult http://freedesktop.org/wiki/Software/systemd/separate-usr-is-broken for more information.");
}
1328
1329 static int enforce_syscall_archs(Set *archs) {
1330 #if HAVE_SECCOMP
1331 int r;
1332
1333 if (!is_seccomp_available())
1334 return 0;
1335
1336 r = seccomp_restrict_archs(arg_syscall_archs);
1337 if (r < 0)
1338 return log_error_errno(r, "Failed to enforce system call architecture restrication: %m");
1339 #endif
1340 return 0;
1341 }
1342
1343 static int status_welcome(void) {
1344 _cleanup_free_ char *pretty_name = NULL, *ansi_color = NULL;
1345 int r;
1346
1347 if (!show_status_on(arg_show_status))
1348 return 0;
1349
1350 r = parse_os_release(NULL,
1351 "PRETTY_NAME", &pretty_name,
1352 "ANSI_COLOR", &ansi_color);
1353 if (r < 0)
1354 log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r,
1355 "Failed to read os-release file, ignoring: %m");
1356
1357 if (log_get_show_color())
1358 return status_printf(NULL, 0,
1359 "\nWelcome to \x1B[%sm%s\x1B[0m!\n",
1360 isempty(ansi_color) ? "1" : ansi_color,
1361 isempty(pretty_name) ? "Linux" : pretty_name);
1362 else
1363 return status_printf(NULL, 0,
1364 "\nWelcome to %s!\n",
1365 isempty(pretty_name) ? "Linux" : pretty_name);
1366 }
1367
1368 static int write_container_id(void) {
1369 const char *c;
1370 int r;
1371
1372 c = getenv("container");
1373 if (isempty(c))
1374 return 0;
1375
1376 RUN_WITH_UMASK(0022)
1377 r = write_string_file("/run/systemd/container", c, WRITE_STRING_FILE_CREATE);
1378 if (r < 0)
1379 return log_warning_errno(r, "Failed to write /run/systemd/container, ignoring: %m");
1380
1381 return 1;
1382 }
1383
1384 static int bump_unix_max_dgram_qlen(void) {
1385 _cleanup_free_ char *qlen = NULL;
1386 unsigned long v;
1387 int r;
1388
1389 /* Let's bump the net.unix.max_dgram_qlen sysctl. The kernel default of 16 is simply too low. We set the value
1390 * really really early during boot, so that it is actually applied to all our sockets, including the
1391 * $NOTIFY_SOCKET one. */
1392
1393 r = read_one_line_file("/proc/sys/net/unix/max_dgram_qlen", &qlen);
1394 if (r < 0)
1395 return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_WARNING, r, "Failed to read AF_UNIX datagram queue length, ignoring: %m");
1396
1397 r = safe_atolu(qlen, &v);
1398 if (r < 0)
1399 return log_warning_errno(r, "Failed to parse AF_UNIX datagram queue length '%s', ignoring: %m", qlen);
1400
1401 if (v >= DEFAULT_UNIX_MAX_DGRAM_QLEN)
1402 return 0;
1403
1404 r = write_string_filef("/proc/sys/net/unix/max_dgram_qlen", WRITE_STRING_FILE_DISABLE_BUFFER, "%lu", DEFAULT_UNIX_MAX_DGRAM_QLEN);
1405 if (r < 0)
1406 return log_full_errno(IN_SET(r, -EROFS, -EPERM, -EACCES) ? LOG_DEBUG : LOG_WARNING, r,
1407 "Failed to bump AF_UNIX datagram queue length, ignoring: %m");
1408
1409 return 1;
1410 }
1411
1412 static int fixup_environment(void) {
1413 _cleanup_free_ char *term = NULL;
1414 const char *t;
1415 int r;
1416
1417 /* Only fix up the environment when we are started as PID 1 */
1418 if (getpid_cached() != 1)
1419 return 0;
1420
1421 /* We expect the environment to be set correctly if run inside a container. */
1422 if (detect_container() > 0)
1423 return 0;
1424
1425 /* When started as PID1, the kernel uses /dev/console for our stdios and uses TERM=linux whatever the backend
1426 * device used by the console. We try to make a better guess here since some consoles might not have support
1427 * for color mode for example.
1428 *
1429 * However if TERM was configured through the kernel command line then leave it alone. */
1430 r = proc_cmdline_get_key("TERM", 0, &term);
1431 if (r < 0)
1432 return r;
1433
1434 t = term ?: default_term_for_tty("/dev/console");
1435
1436 if (setenv("TERM", t, 1) < 0)
1437 return -errno;
1438
1439 /* The kernels sets HOME=/ for init. Let's undo this. */
1440 if (path_equal_ptr(getenv("HOME"), "/"))
1441 assert_se(unsetenv("HOME") == 0);
1442
1443 return 0;
1444 }
1445
/* SysV compatibility: if we are not PID 1 but were invoked under the name "init", replace ourselves
 * with the systemctl binary, passing argv through unchanged. Exits with EXIT_FAILURE if that exec
 * fails. Compiles to a no-op without HAVE_SYSV_COMPAT. */
static void redirect_telinit(int argc, char *argv[]) {

        /* This is compatibility support for SysV, where calling init as a user is identical to
         * telinit. */

#if HAVE_SYSV_COMPAT
        /* Nothing to redirect if we are the real init, or if we weren't called as "init" */
        if (getpid_cached() == 1 || !invoked_as(argv, "init"))
                return;

        execv(SYSTEMCTL_BINARY_PATH, argv);

        /* execv() only returns on failure */
        log_error_errno(errno, "Failed to exec " SYSTEMCTL_BINARY_PATH ": %m");
        exit(EXIT_FAILURE);
#endif
}
1462
1463 static int become_shutdown(
1464 const char *shutdown_verb,
1465 int retval) {
1466
1467 char log_level[DECIMAL_STR_MAX(int) + 1],
1468 exit_code[DECIMAL_STR_MAX(uint8_t) + 1],
1469 timeout[DECIMAL_STR_MAX(usec_t) + 1];
1470
1471 const char* command_line[13] = {
1472 SYSTEMD_SHUTDOWN_BINARY_PATH,
1473 shutdown_verb,
1474 "--timeout", timeout,
1475 "--log-level", log_level,
1476 "--log-target",
1477 };
1478
1479 _cleanup_strv_free_ char **env_block = NULL;
1480 size_t pos = 7;
1481 int r;
1482 usec_t watchdog_timer = 0;
1483
1484 assert(shutdown_verb);
1485 assert(!command_line[pos]);
1486 env_block = strv_copy(environ);
1487
1488 xsprintf(log_level, "%d", log_get_max_level());
1489 xsprintf(timeout, "%" PRI_USEC "us", arg_default_timeout_stop_usec);
1490
1491 switch (log_get_target()) {
1492
1493 case LOG_TARGET_KMSG:
1494 case LOG_TARGET_JOURNAL_OR_KMSG:
1495 case LOG_TARGET_SYSLOG_OR_KMSG:
1496 command_line[pos++] = "kmsg";
1497 break;
1498
1499 case LOG_TARGET_NULL:
1500 command_line[pos++] = "null";
1501 break;
1502
1503 case LOG_TARGET_CONSOLE:
1504 default:
1505 command_line[pos++] = "console";
1506 break;
1507 };
1508
1509 if (log_get_show_color())
1510 command_line[pos++] = "--log-color";
1511
1512 if (log_get_show_location())
1513 command_line[pos++] = "--log-location";
1514
1515 if (log_get_show_time())
1516 command_line[pos++] = "--log-time";
1517
1518 if (streq(shutdown_verb, "exit")) {
1519 command_line[pos++] = "--exit-code";
1520 command_line[pos++] = exit_code;
1521 xsprintf(exit_code, "%d", retval);
1522 }
1523
1524 assert(pos < ELEMENTSOF(command_line));
1525
1526 if (streq(shutdown_verb, "reboot"))
1527 watchdog_timer = arg_reboot_watchdog;
1528 else if (streq(shutdown_verb, "kexec"))
1529 watchdog_timer = arg_kexec_watchdog;
1530
1531 if (watchdog_timer > 0 && watchdog_timer != USEC_INFINITY) {
1532
1533 char *e;
1534
1535 /* If we reboot or kexec let's set the shutdown
1536 * watchdog and tell the shutdown binary to
1537 * repeatedly ping it */
1538 r = watchdog_set_timeout(&watchdog_timer);
1539 watchdog_close(r < 0);
1540
1541 /* Tell the binary how often to ping, ignore failure */
1542 if (asprintf(&e, "WATCHDOG_USEC="USEC_FMT, watchdog_timer) > 0)
1543 (void) strv_consume(&env_block, e);
1544
1545 if (arg_watchdog_device &&
1546 asprintf(&e, "WATCHDOG_DEVICE=%s", arg_watchdog_device) > 0)
1547 (void) strv_consume(&env_block, e);
1548 } else
1549 watchdog_close(true);
1550
1551 /* Avoid the creation of new processes forked by the
1552 * kernel; at this point, we will not listen to the
1553 * signals anyway */
1554 if (detect_container() <= 0)
1555 (void) cg_uninstall_release_agent(SYSTEMD_CGROUP_CONTROLLER);
1556
1557 execve(SYSTEMD_SHUTDOWN_BINARY_PATH, (char **) command_line, env_block);
1558 return -errno;
1559 }
1560
1561 static void initialize_clock(void) {
1562 int r;
1563
1564 /* This is called very early on, before we parse the kernel command line or otherwise figure out why
1565 * we are running, but only once. */
1566
1567 if (clock_is_localtime(NULL) > 0) {
1568 int min;
1569
1570 /*
1571 * The very first call of settimeofday() also does a time warp in the kernel.
1572 *
1573 * In the rtc-in-local time mode, we set the kernel's timezone, and rely on external tools to take care
1574 * of maintaining the RTC and do all adjustments. This matches the behavior of Windows, which leaves
1575 * the RTC alone if the registry tells that the RTC runs in UTC.
1576 */
1577 r = clock_set_timezone(&min);
1578 if (r < 0)
1579 log_error_errno(r, "Failed to apply local time delta, ignoring: %m");
1580 else
1581 log_info("RTC configured in localtime, applying delta of %i minutes to system time.", min);
1582
1583 } else if (!in_initrd())
1584 /*
1585 * Do a dummy very first call to seal the kernel's time warp magic.
1586 *
1587 * Do not call this from inside the initrd. The initrd might not carry /etc/adjtime with LOCAL, but the
1588 * real system could be set up that way. In such case, we need to delay the time-warp or the sealing
1589 * until we reach the real system.
1590 *
1591 * Do no set the kernel's timezone. The concept of local time cannot be supported reliably, the time
1592 * will jump or be incorrect at every daylight saving time change. All kernel local time concepts will
1593 * be treated as UTC that way.
1594 */
1595 (void) clock_reset_timewarp();
1596
1597 r = clock_apply_epoch();
1598 if (r < 0)
1599 log_error_errno(r, "Current system time is before build time, but cannot correct: %m");
1600 else if (r > 0)
1601 log_info("System time before build time, advancing clock.");
1602 }
1603
1604 static void apply_clock_update(void) {
1605 struct timespec ts;
1606
1607 /* This is called later than initialize_clock(), i.e. after we parsed configuration files/kernel
1608 * command line and such. */
1609
1610 if (arg_clock_usec == 0)
1611 return;
1612
1613 if (getpid_cached() != 1)
1614 return;
1615
1616 if (clock_settime(CLOCK_REALTIME, timespec_store(&ts, arg_clock_usec)) < 0)
1617 log_error_errno(errno, "Failed to set system clock to time specified on kernel command line: %m");
1618 else {
1619 char buf[FORMAT_TIMESTAMP_MAX];
1620
1621 log_info("Set system clock to %s, as specified on the kernel command line.",
1622 format_timestamp(buf, sizeof(buf), arg_clock_usec));
1623 }
1624 }
1625
1626 static void cmdline_take_random_seed(void) {
1627 size_t suggested;
1628 int r;
1629
1630 if (arg_random_seed_size == 0)
1631 return;
1632
1633 if (getpid_cached() != 1)
1634 return;
1635
1636 assert(arg_random_seed);
1637 suggested = random_pool_size();
1638
1639 if (arg_random_seed_size < suggested)
1640 log_warning("Random seed specified on kernel command line has size %zu, but %zu bytes required to fill entropy pool.",
1641 arg_random_seed_size, suggested);
1642
1643 r = random_write_entropy(-1, arg_random_seed, arg_random_seed_size, true);
1644 if (r < 0) {
1645 log_warning_errno(r, "Failed to credit entropy specified on kernel command line, ignoring: %m");
1646 return;
1647 }
1648
1649 log_notice("Successfully credited entropy passed on kernel command line.\n"
1650 "Note that the seed provided this way is accessible to unprivileged programs. This functionality should not be used outside of testing environments.");
1651 }
1652
/* When running as PID 1 with coredump support compiled in: lifts RLIMIT_CORE to infinity and, unless
 * skip_setup is set, calls disable_coredumps(). Compiles to a no-op without ENABLE_COREDUMP. */
static void initialize_coredump(bool skip_setup) {
#if ENABLE_COREDUMP
        if (getpid_cached() != 1)
                return;

        /* Don't limit the core dump size, so that coredump handlers such as systemd-coredump (which
         * honour the limit) will process core dumps for system services by default. */
        if (setrlimit(RLIMIT_CORE, &RLIMIT_MAKE_CONST(RLIM_INFINITY)) < 0)
                log_warning_errno(errno, "Failed to set RLIMIT_CORE: %m");

        if (skip_setup)
                return;

        /* But at the same time, turn off the core_pattern logic by default, so that no coredumps are
         * stored until the systemd-coredump tool is enabled via sysctl. However it can be changed via
         * the kernel command line later so core dumps can still be generated during early startup and
         * in initramfs. */
        disable_coredumps();
#endif
}
1671
1672 static void initialize_core_pattern(bool skip_setup) {
1673 int r;
1674
1675 if (skip_setup || !arg_early_core_pattern)
1676 return;
1677
1678 if (getpid_cached() != 1)
1679 return;
1680
1681 r = write_string_file("/proc/sys/kernel/core_pattern", arg_early_core_pattern, WRITE_STRING_FILE_DISABLE_BUFFER);
1682 if (r < 0)
1683 log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern);
1684 }
1685
1686 static void update_cpu_affinity(bool skip_setup) {
1687 _cleanup_free_ char *mask = NULL;
1688
1689 if (skip_setup || !arg_cpu_affinity.set)
1690 return;
1691
1692 assert(arg_cpu_affinity.allocated > 0);
1693
1694 mask = cpu_set_to_string(&arg_cpu_affinity);
1695 log_debug("Setting CPU affinity to %s.", strnull(mask));
1696
1697 if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
1698 log_warning_errno(errno, "Failed to set CPU affinity: %m");
1699 }
1700
1701 static void update_numa_policy(bool skip_setup) {
1702 int r;
1703 _cleanup_free_ char *nodes = NULL;
1704 const char * policy = NULL;
1705
1706 if (skip_setup || !mpol_is_valid(numa_policy_get_type(&arg_numa_policy)))
1707 return;
1708
1709 if (DEBUG_LOGGING) {
1710 policy = mpol_to_string(numa_policy_get_type(&arg_numa_policy));
1711 nodes = cpu_set_to_range_string(&arg_numa_policy.nodes);
1712 log_debug("Setting NUMA policy to %s, with nodes %s.", strnull(policy), strnull(nodes));
1713 }
1714
1715 r = apply_numa_policy(&arg_numa_policy);
1716 if (r == -EOPNOTSUPP)
1717 log_debug_errno(r, "NUMA support not available, ignoring.");
1718 else if (r < 0)
1719 log_warning_errno(r, "Failed to set NUMA memory policy: %m");
1720 }
1721
1722 static void do_reexecute(
1723 int argc,
1724 char *argv[],
1725 const struct rlimit *saved_rlimit_nofile,
1726 const struct rlimit *saved_rlimit_memlock,
1727 FDSet *fds,
1728 const char *switch_root_dir,
1729 const char *switch_root_init,
1730 const char **ret_error_message) {
1731
1732 unsigned i, j, args_size;
1733 const char **args;
1734 int r;
1735
1736 assert(saved_rlimit_nofile);
1737 assert(saved_rlimit_memlock);
1738 assert(ret_error_message);
1739
1740 /* Close and disarm the watchdog, so that the new instance can reinitialize it, but doesn't get rebooted while
1741 * we do that */
1742 watchdog_close(true);
1743
1744 /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
1745 * the kernel default to its child processes */
1746 if (saved_rlimit_nofile->rlim_cur != 0)
1747 (void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
1748 if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
1749 (void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
1750
1751 if (switch_root_dir) {
1752 /* Kill all remaining processes from the initrd, but don't wait for them, so that we can handle the
1753 * SIGCHLD for them after deserializing. */
1754 broadcast_signal(SIGTERM, false, true, arg_default_timeout_stop_usec);
1755
1756 /* And switch root with MS_MOVE, because we remove the old directory afterwards and detach it. */
1757 r = switch_root(switch_root_dir, "/mnt", true, MS_MOVE);
1758 if (r < 0)
1759 log_error_errno(r, "Failed to switch root, trying to continue: %m");
1760 }
1761
1762 args_size = MAX(6, argc+1);
1763 args = newa(const char*, args_size);
1764
1765 if (!switch_root_init) {
1766 char sfd[DECIMAL_STR_MAX(int) + 1];
1767
1768 /* First try to spawn ourselves with the right path, and with full serialization. We do this only if
1769 * the user didn't specify an explicit init to spawn. */
1770
1771 assert(arg_serialization);
1772 assert(fds);
1773
1774 xsprintf(sfd, "%i", fileno(arg_serialization));
1775
1776 i = 0;
1777 args[i++] = SYSTEMD_BINARY_PATH;
1778 if (switch_root_dir)
1779 args[i++] = "--switched-root";
1780 args[i++] = arg_system ? "--system" : "--user";
1781 args[i++] = "--deserialize";
1782 args[i++] = sfd;
1783 args[i++] = NULL;
1784
1785 assert(i <= args_size);
1786
1787 /*
1788 * We want valgrind to print its memory usage summary before reexecution. Valgrind won't do this is on
1789 * its own on exec(), but it will do it on exit(). Hence, to ensure we get a summary here, fork() off
1790 * a child, let it exit() cleanly, so that it prints the summary, and wait() for it in the parent,
1791 * before proceeding into the exec().
1792 */
1793 valgrind_summary_hack();
1794
1795 (void) execv(args[0], (char* const*) args);
1796 log_debug_errno(errno, "Failed to execute our own binary, trying fallback: %m");
1797 }
1798
1799 /* Try the fallback, if there is any, without any serialization. We pass the original argv[] and envp[]. (Well,
1800 * modulo the ordering changes due to getopt() in argv[], and some cleanups in envp[], but let's hope that
1801 * doesn't matter.) */
1802
1803 arg_serialization = safe_fclose(arg_serialization);
1804 fds = fdset_free(fds);
1805
1806 /* Reopen the console */
1807 (void) make_console_stdio();
1808
1809 for (j = 1, i = 1; j < (unsigned) argc; j++)
1810 args[i++] = argv[j];
1811 args[i++] = NULL;
1812 assert(i <= args_size);
1813
1814 /* Re-enable any blocked signals, especially important if we switch from initial ramdisk to init=... */
1815 (void) reset_all_signal_handlers();
1816 (void) reset_signal_mask();
1817 (void) rlimit_nofile_safe();
1818
1819 if (switch_root_init) {
1820 args[0] = switch_root_init;
1821 (void) execve(args[0], (char* const*) args, saved_env);
1822 log_warning_errno(errno, "Failed to execute configured init, trying fallback: %m");
1823 }
1824
1825 args[0] = "/sbin/init";
1826 (void) execv(args[0], (char* const*) args);
1827 r = -errno;
1828
1829 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
1830 ANSI_HIGHLIGHT_RED " !! " ANSI_NORMAL,
1831 "Failed to execute /sbin/init");
1832
1833 if (r == -ENOENT) {
1834 log_warning("No /sbin/init, trying fallback");
1835
1836 args[0] = "/bin/sh";
1837 args[1] = NULL;
1838 (void) execve(args[0], (char* const*) args, saved_env);
1839 log_error_errno(errno, "Failed to execute /bin/sh, giving up: %m");
1840 } else
1841 log_warning_errno(r, "Failed to execute /sbin/init, giving up: %m");
1842
1843 *ret_error_message = "Failed to execute fallback shell";
1844 }
1845
1846 static int invoke_main_loop(
1847 Manager *m,
1848 const struct rlimit *saved_rlimit_nofile,
1849 const struct rlimit *saved_rlimit_memlock,
1850 bool *ret_reexecute,
1851 int *ret_retval, /* Return parameters relevant for shutting down */
1852 const char **ret_shutdown_verb, /* … */
1853 FDSet **ret_fds, /* Return parameters for reexecuting */
1854 char **ret_switch_root_dir, /* … */
1855 char **ret_switch_root_init, /* … */
1856 const char **ret_error_message) {
1857
1858 int r;
1859
1860 assert(m);
1861 assert(saved_rlimit_nofile);
1862 assert(saved_rlimit_memlock);
1863 assert(ret_reexecute);
1864 assert(ret_retval);
1865 assert(ret_shutdown_verb);
1866 assert(ret_fds);
1867 assert(ret_switch_root_dir);
1868 assert(ret_switch_root_init);
1869 assert(ret_error_message);
1870
1871 for (;;) {
1872 r = manager_loop(m);
1873 if (r < 0) {
1874 *ret_error_message = "Failed to run main loop";
1875 return log_emergency_errno(r, "Failed to run main loop: %m");
1876 }
1877
1878 switch ((ManagerObjective) r) {
1879
1880 case MANAGER_RELOAD: {
1881 LogTarget saved_log_target;
1882 int saved_log_level;
1883
1884 log_info("Reloading.");
1885
1886 /* First, save any overridden log level/target, then parse the configuration file, which might
1887 * change the log level to new settings. */
1888
1889 saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
1890 saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
1891
1892 (void) parse_configuration(saved_rlimit_nofile, saved_rlimit_memlock);
1893
1894 set_manager_defaults(m);
1895 set_manager_settings(m);
1896
1897 update_cpu_affinity(false);
1898 update_numa_policy(false);
1899
1900 if (saved_log_level >= 0)
1901 manager_override_log_level(m, saved_log_level);
1902 if (saved_log_target >= 0)
1903 manager_override_log_target(m, saved_log_target);
1904
1905 r = manager_reload(m);
1906 if (r < 0)
1907 /* Reloading failed before the point of no return. Let's continue running as if nothing happened. */
1908 m->objective = MANAGER_OK;
1909
1910 break;
1911 }
1912
1913 case MANAGER_REEXECUTE:
1914
1915 r = prepare_reexecute(m, &arg_serialization, ret_fds, false);
1916 if (r < 0) {
1917 *ret_error_message = "Failed to prepare for reexecution";
1918 return r;
1919 }
1920
1921 log_notice("Reexecuting.");
1922
1923 *ret_reexecute = true;
1924 *ret_retval = EXIT_SUCCESS;
1925 *ret_shutdown_verb = NULL;
1926 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1927
1928 return 0;
1929
1930 case MANAGER_SWITCH_ROOT:
1931 if (!m->switch_root_init) {
1932 r = prepare_reexecute(m, &arg_serialization, ret_fds, true);
1933 if (r < 0) {
1934 *ret_error_message = "Failed to prepare for reexecution";
1935 return r;
1936 }
1937 } else
1938 *ret_fds = NULL;
1939
1940 log_notice("Switching root.");
1941
1942 *ret_reexecute = true;
1943 *ret_retval = EXIT_SUCCESS;
1944 *ret_shutdown_verb = NULL;
1945
1946 /* Steal the switch root parameters */
1947 *ret_switch_root_dir = TAKE_PTR(m->switch_root);
1948 *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
1949
1950 return 0;
1951
1952 case MANAGER_EXIT:
1953
1954 if (MANAGER_IS_USER(m)) {
1955 log_debug("Exit.");
1956
1957 *ret_reexecute = false;
1958 *ret_retval = m->return_value;
1959 *ret_shutdown_verb = NULL;
1960 *ret_fds = NULL;
1961 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1962
1963 return 0;
1964 }
1965
1966 _fallthrough_;
1967 case MANAGER_REBOOT:
1968 case MANAGER_POWEROFF:
1969 case MANAGER_HALT:
1970 case MANAGER_KEXEC: {
1971 static const char * const table[_MANAGER_OBJECTIVE_MAX] = {
1972 [MANAGER_EXIT] = "exit",
1973 [MANAGER_REBOOT] = "reboot",
1974 [MANAGER_POWEROFF] = "poweroff",
1975 [MANAGER_HALT] = "halt",
1976 [MANAGER_KEXEC] = "kexec",
1977 };
1978
1979 log_notice("Shutting down.");
1980
1981 *ret_reexecute = false;
1982 *ret_retval = m->return_value;
1983 assert_se(*ret_shutdown_verb = table[m->objective]);
1984 *ret_fds = NULL;
1985 *ret_switch_root_dir = *ret_switch_root_init = NULL;
1986
1987 return 0;
1988 }
1989
1990 default:
1991 assert_not_reached("Unknown or unexpected manager objective.");
1992 }
1993 }
1994 }
1995
1996 static void log_execution_mode(bool *ret_first_boot) {
1997 assert(ret_first_boot);
1998
1999 if (arg_system) {
2000 int v;
2001
2002 log_info("systemd " GIT_VERSION " running in %ssystem mode. (%s)",
2003 arg_action == ACTION_TEST ? "test " : "",
2004 systemd_features);
2005
2006 v = detect_virtualization();
2007 if (v > 0)
2008 log_info("Detected virtualization %s.", virtualization_to_string(v));
2009
2010 log_info("Detected architecture %s.", architecture_to_string(uname_architecture()));
2011
2012 if (in_initrd()) {
2013 *ret_first_boot = false;
2014 log_info("Running in initial RAM disk.");
2015 } else {
2016 int r;
2017 _cleanup_free_ char *id_text = NULL;
2018
2019 /* Let's check whether we are in first boot. We use /etc/machine-id as flag file
2020 * for this: If it is missing or contains the value "uninitialized", this is the
2021 * first boot. In any other case, it is not. This allows container managers and
2022 * installers to provision a couple of files already. If the container manager
2023 * wants to provision the machine ID itself it should pass $container_uuid to PID 1. */
2024
2025 r = read_one_line_file("/etc/machine-id", &id_text);
2026 if (r < 0 || streq(id_text, "uninitialized")) {
2027 if (r < 0 && r != -ENOENT)
2028 log_warning_errno(r, "Unexpected error while reading /etc/machine-id, ignoring: %m");
2029
2030 *ret_first_boot = true;
2031 log_info("Detected first boot.");
2032 } else {
2033 *ret_first_boot = false;
2034 log_debug("Detected initialized system, this is not the first boot.");
2035 }
2036 }
2037 } else {
2038 if (DEBUG_LOGGING) {
2039 _cleanup_free_ char *t;
2040
2041 t = uid_to_name(getuid());
2042 log_debug("systemd " GIT_VERSION " running in %suser mode for user " UID_FMT "/%s. (%s)",
2043 arg_action == ACTION_TEST ? " test" : "",
2044 getuid(), strna(t), systemd_features);
2045 }
2046
2047 *ret_first_boot = false;
2048 }
2049 }
2050
2051 static int initialize_runtime(
2052 bool skip_setup,
2053 bool first_boot,
2054 struct rlimit *saved_rlimit_nofile,
2055 struct rlimit *saved_rlimit_memlock,
2056 const char **ret_error_message) {
2057 int r;
2058
2059 assert(ret_error_message);
2060
2061 /* Sets up various runtime parameters. Many of these initializations are conditionalized:
2062 *
2063 * - Some only apply to --system instances
2064 * - Some only apply to --user instances
2065 * - Some only apply when we first start up, but not when we reexecute
2066 */
2067
2068 if (arg_action != ACTION_RUN)
2069 return 0;
2070
2071 update_cpu_affinity(skip_setup);
2072 update_numa_policy(skip_setup);
2073
2074 if (arg_system) {
2075 /* Make sure we leave a core dump without panicking the kernel. */
2076 install_crash_handler();
2077
2078 if (!skip_setup) {
2079 r = mount_cgroup_controllers();
2080 if (r < 0) {
2081 *ret_error_message = "Failed to mount cgroup hierarchies";
2082 return r;
2083 }
2084
2085 status_welcome();
2086 (void) hostname_setup(true);
2087 /* Force transient machine-id on first boot. */
2088 machine_id_setup(NULL, first_boot, arg_machine_id, NULL);
2089 (void) loopback_setup();
2090 bump_unix_max_dgram_qlen();
2091 bump_file_max_and_nr_open();
2092 test_usr();
2093 write_container_id();
2094 }
2095
2096 if (arg_watchdog_device) {
2097 r = watchdog_set_device(arg_watchdog_device);
2098 if (r < 0)
2099 log_warning_errno(r, "Failed to set watchdog device to %s, ignoring: %m", arg_watchdog_device);
2100 }
2101 } else {
2102 _cleanup_free_ char *p = NULL;
2103
2104 /* Create the runtime directory and place the inaccessible device nodes there, if we run in
2105 * user mode. In system mode mount_setup() already did that. */
2106
2107 r = xdg_user_runtime_dir(&p, "/systemd");
2108 if (r < 0) {
2109 *ret_error_message = "$XDG_RUNTIME_DIR is not set";
2110 return log_emergency_errno(r, "Failed to determine $XDG_RUNTIME_DIR path: %m");
2111 }
2112
2113 (void) mkdir_p_label(p, 0755);
2114 (void) make_inaccessible_nodes(p, UID_INVALID, GID_INVALID);
2115 }
2116
2117 if (arg_timer_slack_nsec != NSEC_INFINITY)
2118 if (prctl(PR_SET_TIMERSLACK, arg_timer_slack_nsec) < 0)
2119 log_warning_errno(errno, "Failed to adjust timer slack, ignoring: %m");
2120
2121 if (arg_system && !cap_test_all(arg_capability_bounding_set)) {
2122 r = capability_bounding_set_drop_usermode(arg_capability_bounding_set);
2123 if (r < 0) {
2124 *ret_error_message = "Failed to drop capability bounding set of usermode helpers";
2125 return log_emergency_errno(r, "Failed to drop capability bounding set of usermode helpers: %m");
2126 }
2127
2128 r = capability_bounding_set_drop(arg_capability_bounding_set, true);
2129 if (r < 0) {
2130 *ret_error_message = "Failed to drop capability bounding set";
2131 return log_emergency_errno(r, "Failed to drop capability bounding set: %m");
2132 }
2133 }
2134
2135 if (arg_system && arg_no_new_privs) {
2136 if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
2137 *ret_error_message = "Failed to disable new privileges";
2138 return log_emergency_errno(errno, "Failed to disable new privileges: %m");
2139 }
2140 }
2141
2142 if (arg_syscall_archs) {
2143 r = enforce_syscall_archs(arg_syscall_archs);
2144 if (r < 0) {
2145 *ret_error_message = "Failed to set syscall architectures";
2146 return r;
2147 }
2148 }
2149
2150 if (!arg_system)
2151 /* Become reaper of our children */
2152 if (prctl(PR_SET_CHILD_SUBREAPER, 1) < 0)
2153 log_warning_errno(errno, "Failed to make us a subreaper: %m");
2154
2155 /* Bump up RLIMIT_NOFILE for systemd itself */
2156 (void) bump_rlimit_nofile(saved_rlimit_nofile);
2157 (void) bump_rlimit_memlock(saved_rlimit_memlock);
2158
2159 return 0;
2160 }
2161
2162 static int do_queue_default_job(
2163 Manager *m,
2164 const char **ret_error_message) {
2165
2166 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
2167 const char *unit;
2168 Job *job;
2169 Unit *target;
2170 int r;
2171
2172 if (arg_default_unit)
2173 unit = arg_default_unit;
2174 else if (in_initrd())
2175 unit = SPECIAL_INITRD_TARGET;
2176 else
2177 unit = SPECIAL_DEFAULT_TARGET;
2178
2179 log_debug("Activating default unit: %s", unit);
2180
2181 r = manager_load_startable_unit_or_warn(m, unit, NULL, &target);
2182 if (r < 0 && in_initrd() && !arg_default_unit) {
2183 /* Fall back to default.target, which we used to always use by default. Only do this if no
2184 * explicit configuration was given. */
2185
2186 log_info("Falling back to " SPECIAL_DEFAULT_TARGET ".");
2187
2188 r = manager_load_startable_unit_or_warn(m, SPECIAL_DEFAULT_TARGET, NULL, &target);
2189 }
2190 if (r < 0) {
2191 log_info("Falling back to " SPECIAL_RESCUE_TARGET ".");
2192
2193 r = manager_load_startable_unit_or_warn(m, SPECIAL_RESCUE_TARGET, NULL, &target);
2194 if (r < 0) {
2195 *ret_error_message = r == -ERFKILL ? SPECIAL_RESCUE_TARGET " masked"
2196 : "Failed to load " SPECIAL_RESCUE_TARGET;
2197 return r;
2198 }
2199 }
2200
2201 assert(target->load_state == UNIT_LOADED);
2202
2203 r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &job);
2204 if (r == -EPERM) {
2205 log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
2206
2207 sd_bus_error_free(&error);
2208
2209 r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &job);
2210 if (r < 0) {
2211 *ret_error_message = "Failed to start default target";
2212 return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r));
2213 }
2214
2215 } else if (r < 0) {
2216 *ret_error_message = "Failed to isolate default target";
2217 return log_emergency_errno(r, "Failed to isolate default target: %s", bus_error_message(&error, r));
2218 } else
2219 log_info("Queued %s job for default target %s.",
2220 job_type_to_string(job->type),
2221 unit_status_string(job->unit));
2222
2223 m->default_unit_job_id = job->id;
2224
2225 return 0;
2226 }
2227
/* Records the current RLIMIT_NOFILE and RLIMIT_MEMLOCK settings of this process in the two supplied
 * structures. A failure to query either limit is only logged as a warning. */
static void save_rlimits(struct rlimit *saved_rlimit_nofile,
                         struct rlimit *saved_rlimit_memlock) {

        int k;

        assert(saved_rlimit_nofile);
        assert(saved_rlimit_memlock);

        k = getrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_NOFILE failed, ignoring: %m");

        k = getrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
        if (k < 0)
                log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
}
2240
2241 static void fallback_rlimit_nofile(const struct rlimit *saved_rlimit_nofile) {
2242 struct rlimit *rl;
2243
2244 if (arg_default_rlimit[RLIMIT_NOFILE])
2245 return;
2246
2247 /* Make sure forked processes get limits based on the original kernel setting */
2248
2249 rl = newdup(struct rlimit, saved_rlimit_nofile, 1);
2250 if (!rl) {
2251 log_oom();
2252 return;
2253 }
2254
2255 /* Bump the hard limit for system services to a substantially higher value. The default
2256 * hard limit current kernels set is pretty low (4K), mostly for historical
2257 * reasons. According to kernel developers, the fd handling in recent kernels has been
2258 * optimized substantially enough, so that we can bump the limit now, without paying too
2259 * high a price in memory or performance. Note however that we only bump the hard limit,
2260 * not the soft limit. That's because select() works the way it works, and chokes on fds
2261 * >= 1024. If we'd bump the soft limit globally, it might accidentally happen to
2262 * unexpecting programs that they get fds higher than what they can process using
2263 * select(). By only bumping the hard limit but leaving the low limit as it is we avoid
2264 * this pitfall: programs that are written by folks aware of the select() problem in mind
2265 * (and thus use poll()/epoll instead of select(), the way everybody should) can
2266 * explicitly opt into high fds by bumping their soft limit beyond 1024, to the hard limit
2267 * we pass. */
2268 if (arg_system) {
2269 int nr;
2270
2271 /* Get the underlying absolute limit the kernel enforces */
2272 nr = read_nr_open();
2273
2274 rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
2275 }
2276
2277 /* If for some reason we were invoked with a soft limit above 1024 (which should never
2278 * happen!, but who knows what we get passed in from pam_limit when invoked as --user
2279 * instance), then lower what we pass on to not confuse our children */
2280 rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
2281
2282 arg_default_rlimit[RLIMIT_NOFILE] = rl;
2283 }
2284
2285 static void fallback_rlimit_memlock(const struct rlimit *saved_rlimit_memlock) {
2286 struct rlimit *rl;
2287
2288 /* Pass the original value down to invoked processes */
2289
2290 if (arg_default_rlimit[RLIMIT_MEMLOCK])
2291 return;
2292
2293 rl = newdup(struct rlimit, saved_rlimit_memlock, 1);
2294 if (!rl) {
2295 log_oom();
2296 return;
2297 }
2298
2299 arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
2300 }
2301
2302 static void setenv_manager_environment(void) {
2303 char **p;
2304 int r;
2305
2306 STRV_FOREACH(p, arg_manager_environment) {
2307 log_debug("Setting '%s' in our own environment.", *p);
2308
2309 r = putenv_dup(*p, true);
2310 if (r < 0)
2311 log_warning_errno(errno, "Failed to setenv \"%s\", ignoring: %m", *p);
2312 }
2313 }
2314
2315 static void reset_arguments(void) {
2316 /* Frees/resets arg_* variables, with a few exceptions commented below. */
2317
2318 arg_default_unit = mfree(arg_default_unit);
2319
2320 /* arg_system — ignore */
2321
2322 arg_dump_core = true;
2323 arg_crash_chvt = -1;
2324 arg_crash_shell = false;
2325 arg_crash_reboot = false;
2326 arg_confirm_spawn = mfree(arg_confirm_spawn);
2327 arg_show_status = _SHOW_STATUS_INVALID;
2328 arg_status_unit_format = STATUS_UNIT_FORMAT_DEFAULT;
2329 arg_switched_root = false;
2330 arg_pager_flags = 0;
2331 arg_service_watchdogs = true;
2332 arg_default_std_output = EXEC_OUTPUT_JOURNAL;
2333 arg_default_std_error = EXEC_OUTPUT_INHERIT;
2334 arg_default_restart_usec = DEFAULT_RESTART_USEC;
2335 arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
2336 arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
2337 arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC;
2338 arg_default_timeout_abort_set = false;
2339 arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
2340 arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
2341 arg_runtime_watchdog = 0;
2342 arg_reboot_watchdog = 10 * USEC_PER_MINUTE;
2343 arg_kexec_watchdog = 0;
2344 arg_early_core_pattern = NULL;
2345 arg_watchdog_device = NULL;
2346
2347 arg_default_environment = strv_free(arg_default_environment);
2348 arg_manager_environment = strv_free(arg_manager_environment);
2349 rlimit_free_all(arg_default_rlimit);
2350
2351 arg_capability_bounding_set = CAP_ALL;
2352 arg_no_new_privs = false;
2353 arg_timer_slack_nsec = NSEC_INFINITY;
2354 arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE;
2355
2356 arg_syscall_archs = set_free(arg_syscall_archs);
2357
2358 /* arg_serialization — ignore */
2359
2360 arg_default_cpu_accounting = -1;
2361 arg_default_io_accounting = false;
2362 arg_default_ip_accounting = false;
2363 arg_default_blockio_accounting = false;
2364 arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
2365 arg_default_tasks_accounting = true;
2366 arg_default_tasks_max = DEFAULT_TASKS_MAX;
2367 arg_machine_id = (sd_id128_t) {};
2368 arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
2369 arg_default_oom_policy = OOM_STOP;
2370
2371 cpu_set_reset(&arg_cpu_affinity);
2372 numa_policy_reset(&arg_numa_policy);
2373
2374 arg_random_seed = mfree(arg_random_seed);
2375 arg_random_seed_size = 0;
2376 arg_clock_usec = 0;
2377 }
2378
2379 static int parse_configuration(const struct rlimit *saved_rlimit_nofile,
2380 const struct rlimit *saved_rlimit_memlock) {
2381 int r;
2382
2383 assert(saved_rlimit_nofile);
2384 assert(saved_rlimit_memlock);
2385
2386 /* Assign configuration defaults */
2387 reset_arguments();
2388
2389 r = parse_config_file();
2390 if (r < 0)
2391 log_warning_errno(r, "Failed to parse config file, ignoring: %m");
2392
2393 if (arg_system) {
2394 r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
2395 if (r < 0)
2396 log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2397 }
2398
2399 /* Initialize some default rlimits for services if they haven't been configured */
2400 fallback_rlimit_nofile(saved_rlimit_nofile);
2401 fallback_rlimit_memlock(saved_rlimit_memlock);
2402
2403 /* Note that this also parses bits from the kernel command line, including "debug". */
2404 log_parse_environment();
2405
2406 /* Initialize the show status setting if it hasn't been set explicitly yet */
2407 if (arg_show_status == _SHOW_STATUS_INVALID)
2408 arg_show_status = SHOW_STATUS_YES;
2409
2410 /* Push variables into the manager environment block */
2411 setenv_manager_environment();
2412
2413 return 0;
2414 }
2415
2416 static int safety_checks(void) {
2417
2418 if (getpid_cached() == 1 &&
2419 arg_action != ACTION_RUN)
2420 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2421 "Unsupported execution mode while PID 1.");
2422
2423 if (getpid_cached() == 1 &&
2424 !arg_system)
2425 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2426 "Can't run --user mode as PID 1.");
2427
2428 if (arg_action == ACTION_RUN &&
2429 arg_system &&
2430 getpid_cached() != 1)
2431 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2432 "Can't run system mode unless PID 1.");
2433
2434 if (arg_action == ACTION_TEST &&
2435 geteuid() == 0)
2436 return log_error_errno(SYNTHETIC_ERRNO(EPERM),
2437 "Don't run test mode as root.");
2438
2439 if (!arg_system &&
2440 arg_action == ACTION_RUN &&
2441 sd_booted() <= 0)
2442 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2443 "Trying to run as user instance, but the system has not been booted with systemd.");
2444
2445 if (!arg_system &&
2446 arg_action == ACTION_RUN &&
2447 !getenv("XDG_RUNTIME_DIR"))
2448 return log_error_errno(SYNTHETIC_ERRNO(EUNATCH),
2449 "Trying to run as user instance, but $XDG_RUNTIME_DIR is not set.");
2450
2451 if (arg_system &&
2452 arg_action == ACTION_RUN &&
2453 running_in_chroot() > 0)
2454 return log_error_errno(SYNTHETIC_ERRNO(EOPNOTSUPP),
2455 "Cannot be run in a chroot() environment.");
2456
2457 return 0;
2458 }
2459
2460 static int initialize_security(
2461 bool *loaded_policy,
2462 dual_timestamp *security_start_timestamp,
2463 dual_timestamp *security_finish_timestamp,
2464 const char **ret_error_message) {
2465
2466 int r;
2467
2468 assert(loaded_policy);
2469 assert(security_start_timestamp);
2470 assert(security_finish_timestamp);
2471 assert(ret_error_message);
2472
2473 dual_timestamp_get(security_start_timestamp);
2474
2475 r = mac_selinux_setup(loaded_policy);
2476 if (r < 0) {
2477 *ret_error_message = "Failed to load SELinux policy";
2478 return r;
2479 }
2480
2481 r = mac_smack_setup(loaded_policy);
2482 if (r < 0) {
2483 *ret_error_message = "Failed to load SMACK policy";
2484 return r;
2485 }
2486
2487 r = mac_apparmor_setup();
2488 if (r < 0) {
2489 *ret_error_message = "Failed to load AppArmor policy";
2490 return r;
2491 }
2492
2493 r = ima_setup();
2494 if (r < 0) {
2495 *ret_error_message = "Failed to load IMA policy";
2496 return r;
2497 }
2498
2499 dual_timestamp_get(security_finish_timestamp);
2500 return 0;
2501 }
2502
2503 static void test_summary(Manager *m) {
2504 assert(m);
2505
2506 printf("-> By units:\n");
2507 manager_dump_units(m, stdout, "\t");
2508
2509 printf("-> By jobs:\n");
2510 manager_dump_jobs(m, stdout, "\t");
2511 }
2512
2513 static int collect_fds(FDSet **ret_fds, const char **ret_error_message) {
2514 int r;
2515
2516 assert(ret_fds);
2517 assert(ret_error_message);
2518
2519 r = fdset_new_fill(ret_fds);
2520 if (r < 0) {
2521 *ret_error_message = "Failed to allocate fd set";
2522 return log_emergency_errno(r, "Failed to allocate fd set: %m");
2523 }
2524
2525 fdset_cloexec(*ret_fds, true);
2526
2527 if (arg_serialization)
2528 assert_se(fdset_remove(*ret_fds, fileno(arg_serialization)) >= 0);
2529
2530 return 0;
2531 }
2532
2533 static void setup_console_terminal(bool skip_setup) {
2534
2535 if (!arg_system)
2536 return;
2537
2538 /* Become a session leader if we aren't one yet. */
2539 (void) setsid();
2540
2541 /* If we are init, we connect stdin/stdout/stderr to /dev/null and make sure we don't have a controlling
2542 * tty. */
2543 (void) release_terminal();
2544
2545 /* Reset the console, but only if this is really init and we are freshly booted */
2546 if (getpid_cached() == 1 && !skip_setup)
2547 (void) console_setup();
2548 }
2549
/* Takes a quick look at the command line (the full parsing happens much later) to tell a reexecution
 * apart from a regular boot-up.
 *
 * Returns true if "--deserialize" was passed, since that means we are reexecuting and the extensive setup
 * may be skipped. "--switched-root" anywhere on the command line makes this return false regardless:
 * after switching root all the special setup is done again, even though we deserialize in that case
 * too. */
static bool early_skip_setup_check(int argc, char *argv[]) {
        bool deserializing = false;
        int k = 1;

        while (k < argc) {
                const char *arg = argv[k++];

                if (streq(arg, "--switched-root"))
                        return false; /* We switched root, hence do not skip the setup. */

                if (streq(arg, "--deserialize"))
                        deserializing = true;
        }

        return deserializing;
}
2567
2568 static int save_env(void) {
2569 char **l;
2570
2571 l = strv_copy(environ);
2572 if (!l)
2573 return -ENOMEM;
2574
2575 strv_free_and_replace(saved_env, l);
2576 return 0;
2577 }
2578
2579 int main(int argc, char *argv[]) {
2580
2581 dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL,
2582 security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL;
2583 struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
2584 saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
2585 * in. Note we use different values
2586 * for the two that indicate whether
2587 * these fields are initialized! */
2588 bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false;
2589 char *switch_root_dir = NULL, *switch_root_init = NULL;
2590 usec_t before_startup, after_startup;
2591 static char systemd[] = "systemd";
2592 char timespan[FORMAT_TIMESPAN_MAX];
2593 const char *shutdown_verb = NULL, *error_message = NULL;
2594 int r, retval = EXIT_FAILURE;
2595 Manager *m = NULL;
2596 FDSet *fds = NULL;
2597
2598 /* SysV compatibility: redirect init → telinit */
2599 redirect_telinit(argc, argv);
2600
2601 /* Take timestamps early on */
2602 dual_timestamp_from_monotonic(&kernel_timestamp, 0);
2603 dual_timestamp_get(&userspace_timestamp);
2604
2605 /* Figure out whether we need to do initialize the system, or if we already did that because we are
2606 * reexecuting */
2607 skip_setup = early_skip_setup_check(argc, argv);
2608
2609 /* If we get started via the /sbin/init symlink then we are called 'init'. After a subsequent reexecution we
2610 * are then called 'systemd'. That is confusing, hence let's call us systemd right-away. */
2611 program_invocation_short_name = systemd;
2612 (void) prctl(PR_SET_NAME, systemd);
2613
2614 /* Save the original command line */
2615 save_argc_argv(argc, argv);
2616
2617 /* Save the original environment as we might need to restore it if we're requested to execute another
2618 * system manager later. */
2619 r = save_env();
2620 if (r < 0) {
2621 error_message = "Failed to copy environment block";
2622 goto finish;
2623 }
2624
2625 /* Make sure that if the user says "syslog" we actually log to the journal. */
2626 log_set_upgrade_syslog_to_journal(true);
2627
2628 if (getpid_cached() == 1) {
2629 /* When we run as PID 1 force system mode */
2630 arg_system = true;
2631
2632 /* Disable the umask logic */
2633 umask(0);
2634
2635 /* Make sure that at least initially we do not ever log to journald/syslogd, because it might not be
2636 * activated yet (even though the log socket for it exists). */
2637 log_set_prohibit_ipc(true);
2638
2639 /* Always reopen /dev/console when running as PID 1 or one of its pre-execve() children. This is
2640 * important so that we never end up logging to any foreign stderr, for example if we have to log in a
2641 * child process right before execve()'ing the actual binary, at a point in time where socket
2642 * activation stderr/stdout area already set up. */
2643 log_set_always_reopen_console(true);
2644
2645 if (detect_container() <= 0) {
2646
2647 /* Running outside of a container as PID 1 */
2648 log_set_target(LOG_TARGET_KMSG);
2649 log_open();
2650
2651 if (in_initrd())
2652 initrd_timestamp = userspace_timestamp;
2653
2654 if (!skip_setup) {
2655 r = mount_setup_early();
2656 if (r < 0) {
2657 error_message = "Failed to mount early API filesystems";
2658 goto finish;
2659 }
2660
2661 /* Let's open the log backend a second time, in case the first time didn't
2662 * work. Quite possibly we have mounted /dev just now, so /dev/kmsg became
2663 * available, and it previously wasn't. */
2664 log_open();
2665
2666 disable_printk_ratelimit();
2667
2668 r = initialize_security(
2669 &loaded_policy,
2670 &security_start_timestamp,
2671 &security_finish_timestamp,
2672 &error_message);
2673 if (r < 0)
2674 goto finish;
2675 }
2676
2677 if (mac_selinux_init() < 0) {
2678 error_message = "Failed to initialize SELinux support";
2679 goto finish;
2680 }
2681
2682 if (!skip_setup)
2683 initialize_clock();
2684
2685 /* Set the default for later on, but don't actually open the logs like this for now. Note that
2686 * if we are transitioning from the initrd there might still be journal fd open, and we
2687 * shouldn't attempt opening that before we parsed /proc/cmdline which might redirect output
2688 * elsewhere. */
2689 log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
2690
2691 } else {
2692 /* Running inside a container, as PID 1 */
2693 log_set_target(LOG_TARGET_CONSOLE);
2694 log_open();
2695
2696 /* For later on, see above... */
2697 log_set_target(LOG_TARGET_JOURNAL);
2698
2699 /* clear the kernel timestamp, because we are in a container */
2700 kernel_timestamp = DUAL_TIMESTAMP_NULL;
2701 }
2702
2703 initialize_coredump(skip_setup);
2704
2705 r = fixup_environment();
2706 if (r < 0) {
2707 log_emergency_errno(r, "Failed to fix up PID 1 environment: %m");
2708 error_message = "Failed to fix up PID1 environment";
2709 goto finish;
2710 }
2711
2712 /* Try to figure out if we can use colors with the console. No need to do that for user instances since
2713 * they never log into the console. */
2714 log_show_color(colors_enabled());
2715
2716 r = make_null_stdio();
2717 if (r < 0)
2718 log_warning_errno(r, "Failed to redirect standard streams to /dev/null, ignoring: %m");
2719
2720 /* Load the kernel modules early. */
2721 if (!skip_setup)
2722 kmod_setup();
2723
2724 /* Mount /proc, /sys and friends, so that /proc/cmdline and /proc/$PID/fd is available. */
2725 r = mount_setup(loaded_policy, skip_setup);
2726 if (r < 0) {
2727 error_message = "Failed to mount API filesystems";
2728 goto finish;
2729 }
2730
2731 /* The efivarfs is now mounted, let's read the random seed off it */
2732 (void) efi_take_random_seed();
2733
2734 /* Cache command-line options passed from EFI variables */
2735 if (!skip_setup)
2736 (void) cache_efi_options_variable();
2737 } else {
2738 /* Running as user instance */
2739 arg_system = false;
2740 log_set_target(LOG_TARGET_AUTO);
2741 log_open();
2742
2743 /* clear the kernel timestamp, because we are not PID 1 */
2744 kernel_timestamp = DUAL_TIMESTAMP_NULL;
2745
2746 if (mac_selinux_init() < 0) {
2747 error_message = "Failed to initialize SELinux support";
2748 goto finish;
2749 }
2750 }
2751
2752 /* Save the original RLIMIT_NOFILE/RLIMIT_MEMLOCK so that we can reset it later when
2753 * transitioning from the initrd to the main systemd or suchlike. */
2754 save_rlimits(&saved_rlimit_nofile, &saved_rlimit_memlock);
2755
2756 /* Reset all signal handlers. */
2757 (void) reset_all_signal_handlers();
2758 (void) ignore_signals(SIGNALS_IGNORE);
2759
2760 (void) parse_configuration(&saved_rlimit_nofile, &saved_rlimit_memlock);
2761
2762 r = parse_argv(argc, argv);
2763 if (r < 0) {
2764 error_message = "Failed to parse commandline arguments";
2765 goto finish;
2766 }
2767
2768 r = safety_checks();
2769 if (r < 0)
2770 goto finish;
2771
2772 if (IN_SET(arg_action, ACTION_TEST, ACTION_HELP, ACTION_DUMP_CONFIGURATION_ITEMS, ACTION_DUMP_BUS_PROPERTIES, ACTION_BUS_INTROSPECT))
2773 (void) pager_open(arg_pager_flags);
2774
2775 if (arg_action != ACTION_RUN)
2776 skip_setup = true;
2777
2778 if (arg_action == ACTION_HELP) {
2779 retval = help() < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
2780 goto finish;
2781 } else if (arg_action == ACTION_VERSION) {
2782 retval = version();
2783 goto finish;
2784 } else if (arg_action == ACTION_DUMP_CONFIGURATION_ITEMS) {
2785 unit_dump_config_items(stdout);
2786 retval = EXIT_SUCCESS;
2787 goto finish;
2788 } else if (arg_action == ACTION_DUMP_BUS_PROPERTIES) {
2789 dump_bus_properties(stdout);
2790 retval = EXIT_SUCCESS;
2791 goto finish;
2792 } else if (arg_action == ACTION_BUS_INTROSPECT) {
2793 r = bus_manager_introspect_implementations(stdout, arg_bus_introspect);
2794 retval = r >= 0 ? EXIT_SUCCESS : EXIT_FAILURE;
2795 goto finish;
2796 }
2797
2798 assert_se(IN_SET(arg_action, ACTION_RUN, ACTION_TEST));
2799
2800 /* Move out of the way, so that we won't block unmounts */
2801 assert_se(chdir("/") == 0);
2802
2803 if (arg_action == ACTION_RUN) {
2804 if (!skip_setup) {
2805 /* Apply the systemd.clock_usec= kernel command line switch */
2806 apply_clock_update();
2807
2808 /* Apply random seed from kernel command line */
2809 cmdline_take_random_seed();
2810 }
2811
2812 /* A core pattern might have been specified via the cmdline. */
2813 initialize_core_pattern(skip_setup);
2814
2815 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
2816 log_close();
2817
2818 /* Remember open file descriptors for later deserialization */
2819 r = collect_fds(&fds, &error_message);
2820 if (r < 0)
2821 goto finish;
2822
2823 /* Give up any control of the console, but make sure its initialized. */
2824 setup_console_terminal(skip_setup);
2825
2826 /* Open the logging devices, if possible and necessary */
2827 log_open();
2828 }
2829
2830 log_execution_mode(&first_boot);
2831
2832 r = initialize_runtime(skip_setup,
2833 first_boot,
2834 &saved_rlimit_nofile,
2835 &saved_rlimit_memlock,
2836 &error_message);
2837 if (r < 0)
2838 goto finish;
2839
2840 r = manager_new(arg_system ? UNIT_FILE_SYSTEM : UNIT_FILE_USER,
2841 arg_action == ACTION_TEST ? MANAGER_TEST_FULL : 0,
2842 &m);
2843 if (r < 0) {
2844 log_emergency_errno(r, "Failed to allocate manager object: %m");
2845 error_message = "Failed to allocate manager object";
2846 goto finish;
2847 }
2848
2849 m->timestamps[MANAGER_TIMESTAMP_KERNEL] = kernel_timestamp;
2850 m->timestamps[MANAGER_TIMESTAMP_INITRD] = initrd_timestamp;
2851 m->timestamps[MANAGER_TIMESTAMP_USERSPACE] = userspace_timestamp;
2852 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_START)] = security_start_timestamp;
2853 m->timestamps[manager_timestamp_initrd_mangle(MANAGER_TIMESTAMP_SECURITY_FINISH)] = security_finish_timestamp;
2854
2855 set_manager_defaults(m);
2856 set_manager_settings(m);
2857 manager_set_first_boot(m, first_boot);
2858
2859 /* Remember whether we should queue the default job */
2860 queue_default_job = !arg_serialization || arg_switched_root;
2861
2862 before_startup = now(CLOCK_MONOTONIC);
2863
2864 r = manager_startup(m, arg_serialization, fds);
2865 if (r < 0) {
2866 error_message = "Failed to start up manager";
2867 goto finish;
2868 }
2869
2870 /* This will close all file descriptors that were opened, but not claimed by any unit. */
2871 fds = fdset_free(fds);
2872 arg_serialization = safe_fclose(arg_serialization);
2873
2874 if (queue_default_job) {
2875 r = do_queue_default_job(m, &error_message);
2876 if (r < 0)
2877 goto finish;
2878 }
2879
2880 after_startup = now(CLOCK_MONOTONIC);
2881
2882 log_full(arg_action == ACTION_TEST ? LOG_INFO : LOG_DEBUG,
2883 "Loaded units and determined initial transaction in %s.",
2884 format_timespan(timespan, sizeof(timespan), after_startup - before_startup, 100 * USEC_PER_MSEC));
2885
2886 if (arg_action == ACTION_TEST) {
2887 test_summary(m);
2888 retval = EXIT_SUCCESS;
2889 goto finish;
2890 }
2891
2892 (void) invoke_main_loop(m,
2893 &saved_rlimit_nofile,
2894 &saved_rlimit_memlock,
2895 &reexecute,
2896 &retval,
2897 &shutdown_verb,
2898 &fds,
2899 &switch_root_dir,
2900 &switch_root_init,
2901 &error_message);
2902
2903 finish:
2904 pager_close();
2905
2906 if (m) {
2907 arg_reboot_watchdog = manager_get_watchdog(m, WATCHDOG_REBOOT);
2908 arg_kexec_watchdog = manager_get_watchdog(m, WATCHDOG_KEXEC);
2909 m = manager_free(m);
2910 }
2911
2912 mac_selinux_finish();
2913
2914 if (reexecute)
2915 do_reexecute(argc, argv,
2916 &saved_rlimit_nofile,
2917 &saved_rlimit_memlock,
2918 fds,
2919 switch_root_dir,
2920 switch_root_init,
2921 &error_message); /* This only returns if reexecution failed */
2922
2923 arg_serialization = safe_fclose(arg_serialization);
2924 fds = fdset_free(fds);
2925
2926 saved_env = strv_free(saved_env);
2927
2928 #if HAVE_VALGRIND_VALGRIND_H
2929 /* If we are PID 1 and running under valgrind, then let's exit
2930 * here explicitly. valgrind will only generate nice output on
2931 * exit(), not on exec(), hence let's do the former not the
2932 * latter here. */
2933 if (getpid_cached() == 1 && RUNNING_ON_VALGRIND) {
2934 /* Cleanup watchdog_device strings for valgrind. We need them
2935 * in become_shutdown() so normally we cannot free them yet. */
2936 watchdog_free_device();
2937 arg_watchdog_device = mfree(arg_watchdog_device);
2938 reset_arguments();
2939 return retval;
2940 }
2941 #endif
2942
2943 #if HAS_FEATURE_ADDRESS_SANITIZER
2944 __lsan_do_leak_check();
2945 #endif
2946
2947 if (shutdown_verb) {
2948 r = become_shutdown(shutdown_verb, retval);
2949 log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");
2950 error_message = "Failed to execute shutdown binary";
2951 }
2952
2953 watchdog_free_device();
2954 arg_watchdog_device = mfree(arg_watchdog_device);
2955
2956 if (getpid_cached() == 1) {
2957 if (error_message)
2958 manager_status_printf(NULL, STATUS_TYPE_EMERGENCY,
2959 ANSI_HIGHLIGHT_RED "!!!!!!" ANSI_NORMAL,
2960 "%s.", error_message);
2961 freeze_or_exit_or_reboot();
2962 }
2963
2964 reset_arguments();
2965 return retval;
2966 }