#include "ima-setup.h"
#include "killall.h"
#include "kmod-setup.h"
+#include "limits-util.h"
#include "load-fragment.h"
#include "log.h"
#include "loopback-setup.h"
#include "virt.h"
#include "watchdog.h"
+#if HAS_FEATURE_ADDRESS_SANITIZER
+#include <sanitizer/lsan_interface.h>
+#endif
+
static enum {
ACTION_RUN,
ACTION_HELP,
ACTION_DUMP_CONFIGURATION_ITEMS,
ACTION_DUMP_BUS_PROPERTIES,
} arg_action = ACTION_RUN;
-static char *arg_default_unit = NULL;
-static bool arg_system = false;
-static bool arg_dump_core = true;
-static int arg_crash_chvt = -1;
-static bool arg_crash_shell = false;
-static bool arg_crash_reboot = false;
-static char *arg_confirm_spawn = NULL;
-static ShowStatus arg_show_status = _SHOW_STATUS_INVALID;
-static bool arg_switched_root = false;
-static PagerFlags arg_pager_flags = 0;
-static bool arg_service_watchdogs = true;
-static ExecOutput arg_default_std_output = EXEC_OUTPUT_JOURNAL;
-static ExecOutput arg_default_std_error = EXEC_OUTPUT_INHERIT;
-static usec_t arg_default_restart_usec = DEFAULT_RESTART_USEC;
-static usec_t arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
-static usec_t arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
-static usec_t arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
-static unsigned arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
-static usec_t arg_runtime_watchdog = 0;
-static usec_t arg_shutdown_watchdog = 10 * USEC_PER_MINUTE;
-static char *arg_early_core_pattern = NULL;
-static char *arg_watchdog_device = NULL;
-static char **arg_default_environment = NULL;
-static struct rlimit *arg_default_rlimit[_RLIMIT_MAX] = {};
-static uint64_t arg_capability_bounding_set = CAP_ALL;
-static bool arg_no_new_privs = false;
-static nsec_t arg_timer_slack_nsec = NSEC_INFINITY;
-static usec_t arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE;
-static Set* arg_syscall_archs = NULL;
-static FILE* arg_serialization = NULL;
-static int arg_default_cpu_accounting = -1;
-static bool arg_default_io_accounting = false;
-static bool arg_default_ip_accounting = false;
-static bool arg_default_blockio_accounting = false;
-static bool arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
-static bool arg_default_tasks_accounting = true;
-static uint64_t arg_default_tasks_max = UINT64_MAX;
-static sd_id128_t arg_machine_id = {};
-static EmergencyAction arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
+
+/* Those variables are initalized to 0 automatically, so we avoid uninitialized memory access.
+ * Real defaults are assigned in reset_arguments() below. */
+static char *arg_default_unit;
+static bool arg_system;
+static bool arg_dump_core;
+static int arg_crash_chvt;
+static bool arg_crash_shell;
+static bool arg_crash_reboot;
+static char *arg_confirm_spawn;
+static ShowStatus arg_show_status;
+static bool arg_switched_root;
+static PagerFlags arg_pager_flags;
+static bool arg_service_watchdogs;
+static ExecOutput arg_default_std_output;
+static ExecOutput arg_default_std_error;
+static usec_t arg_default_restart_usec;
+static usec_t arg_default_timeout_start_usec;
+static usec_t arg_default_timeout_stop_usec;
+static usec_t arg_default_timeout_abort_usec;
+static bool arg_default_timeout_abort_set;
+static usec_t arg_default_start_limit_interval;
+static unsigned arg_default_start_limit_burst;
+static usec_t arg_runtime_watchdog;
+static usec_t arg_shutdown_watchdog;
+static char *arg_early_core_pattern;
+static char *arg_watchdog_device;
+static char **arg_default_environment;
+static struct rlimit *arg_default_rlimit[_RLIMIT_MAX];
+static uint64_t arg_capability_bounding_set;
+static bool arg_no_new_privs;
+static nsec_t arg_timer_slack_nsec;
+static usec_t arg_default_timer_accuracy_usec;
+static Set* arg_syscall_archs;
+static FILE* arg_serialization;
+static int arg_default_cpu_accounting;
+static bool arg_default_io_accounting;
+static bool arg_default_ip_accounting;
+static bool arg_default_blockio_accounting;
+static bool arg_default_memory_accounting;
+static bool arg_default_tasks_accounting;
+static uint64_t arg_default_tasks_max;
+static sd_id128_t arg_machine_id;
+static EmergencyAction arg_cad_burst_action;
+static OOMPolicy arg_default_oom_policy;
+static CPUSet arg_cpu_affinity;
+
+static int parse_configuration(void);
_noreturn_ static void freeze_or_exit_or_reboot(void) {
- /* If we are running in a contianer, let's prefer exiting, after all we can propagate an exit code to the
- * container manager, and thus inform it that something went wrong. */
+ /* If we are running in a container, let's prefer exiting, after all we can propagate an exit code to
+ * the container manager, and thus inform it that something went wrong. */
if (detect_container() > 0) {
log_emergency("Exiting PID 1...");
- exit(EXIT_EXCEPTION);
+ _exit(EXIT_EXCEPTION);
}
if (arg_crash_reboot) {
else if (is_path(value)) /* on with fully qualified path */
s = strdup(value);
else /* on with only a tty file name, not a fully qualified path */
- s = strjoin("/dev/", value);
+ s = path_join("/dev", value);
if (!s)
return -ENOMEM;
void *data,
void *userdata) {
- _cleanup_cpu_free_ cpu_set_t *c = NULL;
- int ncpus;
+ CPUSet *affinity = data;
- ncpus = parse_cpu_set_and_warn(rvalue, &c, unit, filename, line, lvalue);
- if (ncpus < 0)
- return ncpus;
+ assert(affinity);
- if (sched_setaffinity(0, CPU_ALLOC_SIZE(ncpus), c) < 0)
- log_warning_errno(errno, "Failed to set CPU affinity: %m");
+ (void) parse_cpu_set_extend(rvalue, affinity, true, unit, filename, line, lvalue);
return 0;
}
return 0;
}
+static int config_parse_timeout_abort(
+ const char* unit,
+ const char *filename,
+ unsigned line,
+ const char *section,
+ unsigned section_line,
+ const char *lvalue,
+ int ltype,
+ const char *rvalue,
+ void *data,
+ void *userdata) {
+
+ int r;
+
+ assert(filename);
+ assert(lvalue);
+ assert(rvalue);
+
+ rvalue += strspn(rvalue, WHITESPACE);
+ if (isempty(rvalue)) {
+ arg_default_timeout_abort_set = false;
+ return 0;
+ }
+
+ r = parse_sec(rvalue, &arg_default_timeout_abort_usec);
+ if (r < 0) {
+ log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse DefaultTimeoutAbortSec= setting, ignoring: %s", rvalue);
+ return 0;
+ }
+
+ arg_default_timeout_abort_set = true;
+ return 0;
+}
+
static int parse_config_file(void) {
const ConfigTableItem items[] = {
{ "Manager", "CrashShell", config_parse_bool, 0, &arg_crash_shell },
{ "Manager", "CrashReboot", config_parse_bool, 0, &arg_crash_reboot },
{ "Manager", "ShowStatus", config_parse_show_status, 0, &arg_show_status },
- { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, NULL },
+ { "Manager", "CPUAffinity", config_parse_cpu_affinity2, 0, &arg_cpu_affinity },
{ "Manager", "JoinControllers", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
{ "Manager", "RuntimeWatchdogSec", config_parse_sec, 0, &arg_runtime_watchdog },
{ "Manager", "ShutdownWatchdogSec", config_parse_sec, 0, &arg_shutdown_watchdog },
{ "Manager", "DefaultStandardError", config_parse_output_restricted,0, &arg_default_std_error },
{ "Manager", "DefaultTimeoutStartSec", config_parse_sec, 0, &arg_default_timeout_start_usec },
{ "Manager", "DefaultTimeoutStopSec", config_parse_sec, 0, &arg_default_timeout_stop_usec },
+ { "Manager", "DefaultTimeoutAbortSec", config_parse_timeout_abort, 0, NULL },
{ "Manager", "DefaultRestartSec", config_parse_sec, 0, &arg_default_restart_usec },
{ "Manager", "DefaultStartLimitInterval", config_parse_sec, 0, &arg_default_start_limit_interval }, /* obsolete alias */
{ "Manager", "DefaultStartLimitIntervalSec",config_parse_sec, 0, &arg_default_start_limit_interval },
{ "Manager", "DefaultTasksAccounting", config_parse_bool, 0, &arg_default_tasks_accounting },
{ "Manager", "DefaultTasksMax", config_parse_tasks_max, 0, &arg_default_tasks_max },
{ "Manager", "CtrlAltDelBurstAction", config_parse_emergency_action, 0, &arg_cad_burst_action },
+ { "Manager", "DefaultOOMPolicy", config_parse_oom_policy, 0, &arg_default_oom_policy },
{}
};
m->default_std_error = arg_default_std_error;
m->default_timeout_start_usec = arg_default_timeout_start_usec;
m->default_timeout_stop_usec = arg_default_timeout_stop_usec;
+ m->default_timeout_abort_usec = arg_default_timeout_abort_usec;
+ m->default_timeout_abort_set = arg_default_timeout_abort_set;
m->default_restart_usec = arg_default_restart_usec;
m->default_start_limit_interval = arg_default_start_limit_interval;
m->default_start_limit_burst = arg_default_start_limit_burst;
m->default_memory_accounting = arg_default_memory_accounting;
m->default_tasks_accounting = arg_default_tasks_accounting;
m->default_tasks_max = arg_default_tasks_max;
+ m->default_oom_policy = arg_default_oom_policy;
(void) manager_set_default_rlimits(m, arg_default_rlimit);
#endif
#if BUMP_PROC_SYS_FS_FILE_MAX
- /* I so wanted to use STRINGIFY(ULONG_MAX) here, but alas we can't as glibc/gcc define that as
- * "(0x7fffffffffffffffL * 2UL + 1UL)". Seriously. 😢 */
- if (asprintf(&t, "%lu\n", ULONG_MAX) < 0) {
+ /* The maximum the kernel allows for this since 5.2 is LONG_MAX, use that. (Previously thing where
+ * different but the operation would fail silently.) */
+ if (asprintf(&t, "%li\n", LONG_MAX) < 0) {
log_oom();
return;
}
}
static int bump_rlimit_nofile(struct rlimit *saved_rlimit) {
+ struct rlimit new_rlimit;
int r, nr;
assert(saved_rlimit);
if (arg_system)
rl->rlim_max = MIN((rlim_t) nr, MAX(rl->rlim_max, (rlim_t) HIGH_RLIMIT_NOFILE));
+ /* If for some reason we were invoked with a soft limit above 1024 (which should never
+ * happen!, but who knows what we get passed in from pam_limit when invoked as --user
+ * instance), then lower what we pass on to not confuse our children */
+ rl->rlim_cur = MIN(rl->rlim_cur, (rlim_t) FD_SETSIZE);
+
arg_default_rlimit[RLIMIT_NOFILE] = rl;
}
+ /* Calculate the new limits to use for us. Never lower from what we inherited. */
+ new_rlimit = (struct rlimit) {
+ .rlim_cur = MAX((rlim_t) nr, saved_rlimit->rlim_cur),
+ .rlim_max = MAX((rlim_t) nr, saved_rlimit->rlim_max),
+ };
+
+ /* Shortcut if nothing changes. */
+ if (saved_rlimit->rlim_max >= new_rlimit.rlim_max &&
+ saved_rlimit->rlim_cur >= new_rlimit.rlim_cur) {
+ log_debug("RLIMIT_NOFILE is already as high or higher than we need it, not bumping.");
+ return 0;
+ }
+
/* Bump up the resource limit for ourselves substantially, all the way to the maximum the kernel allows, for
* both hard and soft. */
- r = setrlimit_closest(RLIMIT_NOFILE, &RLIMIT_MAKE_CONST(nr));
+ r = setrlimit_closest(RLIMIT_NOFILE, &new_rlimit);
if (r < 0)
return log_warning_errno(r, "Setting RLIMIT_NOFILE failed, ignoring: %m");
}
static int bump_rlimit_memlock(struct rlimit *saved_rlimit) {
+ struct rlimit new_rlimit;
int r;
assert(saved_rlimit);
if (getrlimit(RLIMIT_MEMLOCK, saved_rlimit) < 0)
return log_warning_errno(errno, "Reading RLIMIT_MEMLOCK failed, ignoring: %m");
- r = setrlimit_closest(RLIMIT_MEMLOCK, &RLIMIT_MAKE_CONST(HIGH_RLIMIT_MEMLOCK));
+ /* Pass the original value down to invoked processes */
+ if (!arg_default_rlimit[RLIMIT_MEMLOCK]) {
+ struct rlimit *rl;
+
+ rl = newdup(struct rlimit, saved_rlimit, 1);
+ if (!rl)
+ return log_oom();
+
+ arg_default_rlimit[RLIMIT_MEMLOCK] = rl;
+ }
+
+ /* Using MAX() on resource limits only is safe if RLIM_INFINITY is > 0. POSIX declares that rlim_t
+ * must be unsigned, hence this is a given, but let's make this clear here. */
+ assert_cc(RLIM_INFINITY > 0);
+
+ new_rlimit = (struct rlimit) {
+ .rlim_cur = MAX(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_cur),
+ .rlim_max = MAX(HIGH_RLIMIT_MEMLOCK, saved_rlimit->rlim_max),
+ };
+
+ if (saved_rlimit->rlim_max >= new_rlimit.rlim_cur &&
+ saved_rlimit->rlim_cur >= new_rlimit.rlim_max) {
+ log_debug("RLIMIT_MEMLOCK is already as high or higher than we need it, not bumping.");
+ return 0;
+ }
+
+ r = setrlimit_closest(RLIMIT_MEMLOCK, &new_rlimit);
if (r < 0)
return log_warning_errno(r, "Setting RLIMIT_MEMLOCK failed, ignoring: %m");
static void test_usr(void) {
- /* Check that /usr is not a separate fs */
+ /* Check that /usr is either on the same file system as / or mounted already. */
if (dir_is_empty("/usr") <= 0)
return;
if (setenv("TERM", t, 1) < 0)
return -errno;
+ /* The kernels sets HOME=/ for init. Let's undo this. */
+ if (path_equal_ptr(getenv("HOME"), "/") &&
+ unsetenv("HOME") < 0)
+ log_warning_errno(errno, "Failed to unset $HOME: %m");
+
return 0;
}
log_warning_errno(r, "Failed to write '%s' to /proc/sys/kernel/core_pattern, ignoring: %m", arg_early_core_pattern);
}
+static void update_cpu_affinity(bool skip_setup) {
+ _cleanup_free_ char *mask = NULL;
+
+ if (skip_setup || !arg_cpu_affinity.set)
+ return;
+
+ assert(arg_cpu_affinity.allocated > 0);
+
+ mask = cpu_set_to_string(&arg_cpu_affinity);
+ log_debug("Setting CPU affinity to %s.", strnull(mask));
+
+ if (sched_setaffinity(0, arg_cpu_affinity.allocated, arg_cpu_affinity.set) < 0)
+ log_warning_errno(errno, "Failed to set CPU affinity: %m");
+}
+
static void do_reexecute(
int argc,
char *argv[],
* we do that */
watchdog_close(true);
- /* Reset the RLIMIT_NOFILE to the kernel default, so that the new systemd can pass the kernel default to its
- * child processes */
-
- if (saved_rlimit_nofile->rlim_cur > 0)
+ /* Reset RLIMIT_NOFILE + RLIMIT_MEMLOCK back to the kernel defaults, so that the new systemd can pass
+ * the kernel default to its child processes */
+ if (saved_rlimit_nofile->rlim_cur != 0)
(void) setrlimit(RLIMIT_NOFILE, saved_rlimit_nofile);
- if (saved_rlimit_memlock->rlim_cur != (rlim_t) -1)
+ if (saved_rlimit_memlock->rlim_cur != RLIM_INFINITY)
(void) setrlimit(RLIMIT_MEMLOCK, saved_rlimit_memlock);
if (switch_root_dir) {
args[i++] = NULL;
assert(i <= args_size);
- /* Reenable any blocked signals, especially important if we switch from initial ramdisk to init=... */
+ /* Re-enable any blocked signals, especially important if we switch from initial ramdisk to init=... */
(void) reset_all_signal_handlers();
(void) reset_signal_mask();
(void) rlimit_nofile_safe();
saved_log_level = m->log_level_overridden ? log_get_max_level() : -1;
saved_log_target = m->log_target_overridden ? log_get_target() : _LOG_TARGET_INVALID;
- r = parse_config_file();
- if (r < 0)
- log_warning_errno(r, "Failed to parse config file, ignoring: %m");
+ (void) parse_configuration();
set_manager_defaults(m);
+ update_cpu_affinity(false);
+
if (saved_log_level >= 0)
manager_override_log_level(m, saved_log_level);
if (saved_log_target >= 0)
*ret_shutdown_verb = NULL;
/* Steal the switch root parameters */
- *ret_switch_root_dir = m->switch_root;
- *ret_switch_root_init = m->switch_root_init;
- m->switch_root = m->switch_root_init = NULL;
+ *ret_switch_root_dir = TAKE_PTR(m->switch_root);
+ *ret_switch_root_init = TAKE_PTR(m->switch_root_init);
return 0;
if (arg_action != ACTION_RUN)
return 0;
+ update_cpu_affinity(skip_setup);
+
if (arg_system) {
- /* Make sure we leave a core dump without panicing the kernel. */
+ /* Make sure we leave a core dump without panicking the kernel. */
install_crash_handler();
if (!skip_setup) {
assert(target->load_state == UNIT_LOADED);
- r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, &error, &default_unit_job);
+ r = manager_add_job(m, JOB_START, target, JOB_ISOLATE, NULL, &error, &default_unit_job);
if (r == -EPERM) {
log_debug_errno(r, "Default target could not be isolated, starting instead: %s", bus_error_message(&error, r));
sd_bus_error_free(&error);
- r = manager_add_job(m, JOB_START, target, JOB_REPLACE, &error, &default_unit_job);
+ r = manager_add_job(m, JOB_START, target, JOB_REPLACE, NULL, &error, &default_unit_job);
if (r < 0) {
*ret_error_message = "Failed to start default target";
return log_emergency_errno(r, "Failed to start default target: %s", bus_error_message(&error, r));
return 0;
}
-static void free_arguments(void) {
-
- /* Frees all arg_* variables, with the exception of arg_serialization */
- rlimit_free_all(arg_default_rlimit);
+static void reset_arguments(void) {
+ /* Frees/resets arg_* variables, with a few exceptions commented below. */
arg_default_unit = mfree(arg_default_unit);
+
+ /* arg_system — ignore */
+
+ arg_dump_core = true;
+ arg_crash_chvt = -1;
+ arg_crash_shell = false;
+ arg_crash_reboot = false;
arg_confirm_spawn = mfree(arg_confirm_spawn);
+ arg_show_status = _SHOW_STATUS_INVALID;
+ arg_switched_root = false;
+ arg_pager_flags = 0;
+ arg_service_watchdogs = true;
+ arg_default_std_output = EXEC_OUTPUT_JOURNAL;
+ arg_default_std_error = EXEC_OUTPUT_INHERIT;
+ arg_default_restart_usec = DEFAULT_RESTART_USEC;
+ arg_default_timeout_start_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_timeout_stop_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_timeout_abort_usec = DEFAULT_TIMEOUT_USEC;
+ arg_default_timeout_abort_set = false;
+ arg_default_start_limit_interval = DEFAULT_START_LIMIT_INTERVAL;
+ arg_default_start_limit_burst = DEFAULT_START_LIMIT_BURST;
+ arg_runtime_watchdog = 0;
+ arg_shutdown_watchdog = 10 * USEC_PER_MINUTE;
+ arg_early_core_pattern = NULL;
+ arg_watchdog_device = NULL;
+
arg_default_environment = strv_free(arg_default_environment);
+ rlimit_free_all(arg_default_rlimit);
+
+ arg_capability_bounding_set = CAP_ALL;
+ arg_no_new_privs = false;
+ arg_timer_slack_nsec = NSEC_INFINITY;
+ arg_default_timer_accuracy_usec = 1 * USEC_PER_MINUTE;
+
arg_syscall_archs = set_free(arg_syscall_archs);
+
+ /* arg_serialization — ignore */
+
+ arg_default_cpu_accounting = -1;
+ arg_default_io_accounting = false;
+ arg_default_ip_accounting = false;
+ arg_default_blockio_accounting = false;
+ arg_default_memory_accounting = MEMORY_ACCOUNTING_DEFAULT;
+ arg_default_tasks_accounting = true;
+ arg_default_tasks_max = UINT64_MAX;
+ arg_machine_id = (sd_id128_t) {};
+ arg_cad_burst_action = EMERGENCY_ACTION_REBOOT_FORCE;
+ arg_default_oom_policy = OOM_STOP;
+
+ cpu_set_reset(&arg_cpu_affinity);
}
-static int load_configuration(int argc, char **argv, const char **ret_error_message) {
+static int parse_configuration(void) {
int r;
- assert(ret_error_message);
-
arg_default_tasks_max = system_tasks_max_scale(DEFAULT_TASKS_MAX_PERCENTAGE, 100U);
+ /* Assign configuration defaults */
+ reset_arguments();
+
r = parse_config_file();
- if (r < 0) {
- *ret_error_message = "Failed to parse config file";
- return r;
- }
+ if (r < 0)
+ log_warning_errno(r, "Failed to parse config file, ignoring: %m");
if (arg_system) {
r = proc_cmdline_parse(parse_proc_cmdline_item, NULL, 0);
/* Note that this also parses bits from the kernel command line, including "debug". */
log_parse_environment();
+ return 0;
+}
+
+static int load_configuration(int argc, char **argv, const char **ret_error_message) {
+ int r;
+
+ assert(ret_error_message);
+
+ (void) parse_configuration();
+
r = parse_argv(argc, argv);
if (r < 0) {
*ret_error_message = "Failed to parse commandline arguments";
dual_timestamp initrd_timestamp = DUAL_TIMESTAMP_NULL, userspace_timestamp = DUAL_TIMESTAMP_NULL, kernel_timestamp = DUAL_TIMESTAMP_NULL,
security_start_timestamp = DUAL_TIMESTAMP_NULL, security_finish_timestamp = DUAL_TIMESTAMP_NULL;
- struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0), saved_rlimit_memlock = RLIMIT_MAKE_CONST((rlim_t) -1);
+ struct rlimit saved_rlimit_nofile = RLIMIT_MAKE_CONST(0),
+ saved_rlimit_memlock = RLIMIT_MAKE_CONST(RLIM_INFINITY); /* The original rlimits we passed
+ * in. Note we use different values
+ * for the two that indicate whether
+ * these fields are initialized! */
bool skip_setup, loaded_policy = false, queue_default_job = false, first_boot = false, reexecute = false;
char *switch_root_dir = NULL, *switch_root_init = NULL;
usec_t before_startup, after_startup;
(void) prctl(PR_SET_NAME, systemd);
/* Save the original command line */
- saved_argv = argv;
- saved_argc = argc;
+ save_argc_argv(argc, argv);
/* Make sure that if the user says "syslog" we actually log to the journal. */
log_set_upgrade_syslog_to_journal(true);
m = manager_free(m);
}
- free_arguments();
+ reset_arguments();
mac_selinux_finish();
if (reexecute)
}
#endif
+#if HAS_FEATURE_ADDRESS_SANITIZER
+ __lsan_do_leak_check();
+#endif
+
if (shutdown_verb) {
r = become_shutdown(shutdown_verb, retval);
log_error_errno(r, "Failed to execute shutdown binary, %s: %m", getpid_cached() == 1 ? "freezing" : "quitting");