]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
execute: add a new easy-to-use RestrictRealtime= option to units 3583/head
authorLennart Poettering <lennart@poettering.net>
Wed, 22 Jun 2016 23:45:45 +0000 (01:45 +0200)
committerLennart Poettering <lennart@poettering.net>
Wed, 22 Jun 2016 23:45:45 +0000 (01:45 +0200)
It takes a boolean value. If true, access to the SCHED_RR, SCHED_FIFO and
SCHED_DEADLINE scheduling policies, which may be used to lock up the system, is blocked.

man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4

index dbfc7692f78315604756aecf87527c1b5aaa79f9..ed02666daf8aaf26a210c259b1cca3f67b5483ff 100644 (file)
         </para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>RestrictRealtime=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If set, any attempts to enable realtime scheduling in a process of
+        the unit are refused. This restricts access to realtime task scheduling policies such as
+        <constant>SCHED_FIFO</constant>, <constant>SCHED_RR</constant> or <constant>SCHED_DEADLINE</constant>. See
+        <citerefentry><refentrytitle>sched</refentrytitle><manvolnum>7</manvolnum></citerefentry> for details about
+        these scheduling policies. Realtime scheduling policies may be used to monopolize CPU time for longer periods
+        of time, and may hence be used to lock up or otherwise trigger Denial-of-Service situations on the system. It
+        is hence recommended to restrict access to realtime scheduling to the few programs that actually require
+        it. Defaults to off.</para></listitem>
+      </varlistentry>
+
     </variablelist>
   </refsect1>
 
index 4c88c41127bfc13767988b2d056c9b2eb543f7f8..644b9561b570110e9214ee8d01bf5e9b23cc3487 100644 (file)
@@ -720,6 +720,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("RuntimeDirectoryMode", "u", bus_property_get_mode, offsetof(ExecContext, runtime_directory_mode), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("RuntimeDirectory", "as", NULL, offsetof(ExecContext, runtime_directory), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("MemoryDenyWriteExecute", "b", bus_property_get_bool, offsetof(ExecContext, memory_deny_write_execute), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("RestrictRealtime", "b", bus_property_get_bool, offsetof(ExecContext, restrict_realtime), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_VTABLE_END
 };
 
@@ -1057,7 +1058,7 @@ int bus_exec_context_set_transient_property(
         } else if (STR_IN_SET(name,
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork",
-                              "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute")) {
+                              "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute", "RestrictRealtime")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1083,6 +1084,8 @@ int bus_exec_context_set_transient_property(
                                 c->syslog_level_prefix = b;
                         else if (streq(name, "MemoryDenyWriteExecute"))
                                 c->memory_deny_write_execute = b;
+                        else if (streq(name, "RestrictRealtime"))
+                                c->restrict_realtime = b;
 
                         unit_write_drop_in_private_format(u, mode, name, "%s=%s", name, yes_no(b));
                 }
index cf52355fc4f1911fc2816bad840bb7d44e79118b..8cb18dbd5b5c4fca55cda809b6c8af2aab02e03b 100644 (file)
@@ -1264,6 +1264,76 @@ finish:
         return r;
 }
 
+static int apply_restrict_realtime(const ExecContext *c) {
+        static const int permitted_policies[] = {
+                SCHED_OTHER,
+                SCHED_BATCH,
+                SCHED_IDLE,
+        };
+        /* Loads a seccomp filter making sched_setscheduler() fail with EPERM for every policy not listed above. */
+        scmp_filter_ctx *seccomp;
+        unsigned i;
+        int r, p, max_policy = 0;
+
+        assert(c);
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        /* Determine the highest policy constant we want to allow */
+        for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+                if (permitted_policies[i] > max_policy)
+                        max_policy = permitted_policies[i];
+
+        /* Go through all policies with lower values than that, and block them -- unless they appear in the
+         * whitelist. */
+        for (p = 0; p < max_policy; p++) {
+                bool good = false;
+
+                /* Check if this is in the whitelist. */
+                for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
+                        if (permitted_policies[i] == p) {
+                                good = true;
+                                break;
+                        }
+
+                if (good)
+                        continue;
+
+                /* Deny this policy: match on the second syscall argument (A1), i.e. the requested policy value. */
+                r = seccomp_rule_add(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_SYS(sched_setscheduler),
+                                1,
+                                SCMP_A1(SCMP_CMP_EQ, p));
+                if (r < 0)
+                        goto finish;
+        }
+
+        /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are unsigned here,
+         * hence no need to check for < 0 values. */
+        r = seccomp_rule_add(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(sched_setscheduler),
+                        1,
+                        SCMP_A1(SCMP_CMP_GT, max_policy));
+        if (r < 0)
+                goto finish;
+        /* Keep libseccomp from setting no_new_privs itself when the filter is loaded (NNP attribute set to 0). */
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+        /* All paths end up here with r holding the last libseccomp result; the context is released either way. */
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 #endif
 
 static void do_idle_pipe_dance(int idle_pipe[4]) {
@@ -1962,6 +2032,14 @@ static int exec_child(
                         }
                 }
 
+                /* Set the RTPRIO resource limit to 0, but only if nothing else was explicitly requested. */
+                if (context->restrict_realtime && !context->rlimit[RLIMIT_RTPRIO]) {
+                        if (setrlimit(RLIMIT_RTPRIO, &RLIMIT_MAKE_CONST(0)) < 0) {
+                                *exit_status = EXIT_LIMITS;
+                                return -errno;
+                        }
+                }
+
                 if (!cap_test_all(context->capability_bounding_set)) {
                         r = capability_bounding_set_drop(context->capability_bounding_set, false);
                         if (r < 0) {
@@ -2017,7 +2095,7 @@ static int exec_child(
                         }
 
                 if (context->no_new_privileges ||
-                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || use_syscall_filter)))
+                    (!have_effective_cap(CAP_SYS_ADMIN) && (use_address_families || context->memory_deny_write_execute || context->restrict_realtime || use_syscall_filter)))
                         if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) < 0) {
                                 *exit_status = EXIT_NO_NEW_PRIVILEGES;
                                 return -errno;
@@ -2039,6 +2117,15 @@ static int exec_child(
                                 return r;
                         }
                 }
+
+                if (context->restrict_realtime) {
+                        r = apply_restrict_realtime(context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
                 if (use_syscall_filter) {
                         r = apply_seccomp(context);
                         if (r < 0) {
@@ -2474,7 +2561,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 "%sProtectHome: %s\n"
                 "%sProtectSystem: %s\n"
                 "%sIgnoreSIGPIPE: %s\n"
-                "%sMemoryDenyWriteExecute: %s\n",
+                "%sMemoryDenyWriteExecute: %s\n"
+                "%sRestrictRealtime: %s\n",
                 prefix, c->umask,
                 prefix, c->working_directory ? c->working_directory : "/",
                 prefix, c->root_directory ? c->root_directory : "/",
@@ -2485,7 +2573,8 @@ void exec_context_dump(ExecContext *c, FILE* f, const char *prefix) {
                 prefix, protect_home_to_string(c->protect_home),
                 prefix, protect_system_to_string(c->protect_system),
                 prefix, yes_no(c->ignore_sigpipe),
-                prefix, yes_no(c->memory_deny_write_execute));
+                prefix, yes_no(c->memory_deny_write_execute),
+                prefix, yes_no(c->restrict_realtime));
 
         STRV_FOREACH(e, c->environment)
                 fprintf(f, "%sEnvironment: %s\n", prefix, *e);
index cd1f7b36f647ec4746ec3d997d85ed80b46a2164..210eea0e8203d034d238c5d84a274672caba5313 100644 (file)
@@ -193,12 +193,14 @@ struct ExecContext {
         char **runtime_directory;
         mode_t runtime_directory_mode;
 
+        bool memory_deny_write_execute;
+        bool restrict_realtime;
+
         bool oom_score_adjust_set:1;
         bool nice_set:1;
         bool ioprio_set:1;
         bool cpu_sched_set:1;
         bool no_new_privileges_set:1;
-        bool memory_deny_write_execute;
 };
 
 #include "cgroup-util.h"
index eb58586523897be5a234de5260367b30735178ae..fe1006830bb82e94e52cf8741bde8478f3cdc24d 100644 (file)
@@ -56,11 +56,13 @@ m4_ifdef(`HAVE_SECCOMP',
 $1.SystemCallArchitectures,      config_parse_syscall_archs,         0,                             offsetof($1, exec_context.syscall_archs)
 $1.SystemCallErrorNumber,        config_parse_syscall_errno,         0,                             offsetof($1, exec_context)
 $1.MemoryDenyWriteExecute,       config_parse_bool,                  0,                             offsetof($1, exec_context.memory_deny_write_execute)
+$1.RestrictRealtime,             config_parse_bool,                  0,                             offsetof($1, exec_context.restrict_realtime)
 $1.RestrictAddressFamilies,      config_parse_address_families,      0,                             offsetof($1, exec_context)',
 `$1.SystemCallFilter,            config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.SystemCallArchitectures,      config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.SystemCallErrorNumber,        config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.MemoryDenyWriteExecute,       config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
+$1.RestrictRealtime,             config_parse_warn_compat,           DISABLED_CONFIGURATION,        0
 $1.RestrictAddressFamilies,      config_parse_warn_compat,           DISABLED_CONFIGURATION,        0')
 $1.LimitCPU,                     config_parse_limit,                 RLIMIT_CPU,                    offsetof($1, exec_context.rlimit)
 $1.LimitFSIZE,                   config_parse_limit,                 RLIMIT_FSIZE,                  offsetof($1, exec_context.rlimit)