]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core:sandbox: Add ProtectKernelModules= option
author Djalal Harouni <tixxdz@opendz.org>
Wed, 12 Oct 2016 11:31:21 +0000 (13:31 +0200)
committer Djalal Harouni <tixxdz@opendz.org>
Wed, 12 Oct 2016 11:31:21 +0000 (13:31 +0200)
This is useful to turn off explicit module load and unload operations on modular
kernels. This option removes CAP_SYS_MODULE from the capability bounding set for
the unit, and installs a system call filter to block module system calls.

This option will not prevent the kernel from loading modules using the module
auto-load feature, which is a system-wide operation.

man/systemd.exec.xml
src/core/dbus-execute.c
src/core/execute.c
src/core/execute.h
src/core/load-fragment-gperf.gperf.m4
src/core/unit.c
src/shared/bus-unit-util.c

index 986985ad35df70fe287405cac5133de4459bef89..3bea4976b379fce39c5fd32bcff387208889bcc1 100644 (file)
         logging. This does not affect commands prefixed with <literal>+</literal>.</para></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>ProtectKernelModules=</varname></term>
+
+        <listitem><para>Takes a boolean argument. If true, explicit module loading will
+        be denied. This allows module load and unload operations to be turned off on modular
+        kernels. It is recommended to turn this on for most services that do not need special
+        file systems or extra kernel modules to work. Defaults to off. Enabling this option
+        removes <constant>CAP_SYS_MODULE</constant> from the capability bounding set for
+        the unit, and installs a system call filter to block module system calls.
+        Note that limited automatic module loading due to user configuration or kernel
+        mapping tables might still happen as a side effect of requested user operations,
+        both privileged and unprivileged. To disable the module auto-load feature, please see
+        <citerefentry><refentrytitle>sysctl.d</refentrytitle><manvolnum>5</manvolnum></citerefentry>
+        <constant>kernel.modules_disabled</constant> mechanism and
+        <filename>/proc/sys/kernel/modules_disabled</filename> documentation.</para></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>Personality=</varname></term>
 
index eec4500c8ce4fbfeb14b02df36450f277be6bed8..b8720d7d3d495c959e934ad2e1efeaba88ab7917 100644 (file)
@@ -708,6 +708,7 @@ const sd_bus_vtable bus_exec_vtable[] = {
         SD_BUS_PROPERTY("PrivateTmp", "b", bus_property_get_bool, offsetof(ExecContext, private_tmp), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateDevices", "b", bus_property_get_bool, offsetof(ExecContext, private_devices), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectKernelTunables", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_tunables), SD_BUS_VTABLE_PROPERTY_CONST),
+        SD_BUS_PROPERTY("ProtectKernelModules", "b", bus_property_get_bool, offsetof(ExecContext, protect_kernel_modules), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("ProtectControlGroups", "b", bus_property_get_bool, offsetof(ExecContext, protect_control_groups), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateNetwork", "b", bus_property_get_bool, offsetof(ExecContext, private_network), SD_BUS_VTABLE_PROPERTY_CONST),
         SD_BUS_PROPERTY("PrivateUsers", "b", bus_property_get_bool, offsetof(ExecContext, private_users), SD_BUS_VTABLE_PROPERTY_CONST),
@@ -1075,7 +1076,7 @@ int bus_exec_context_set_transient_property(
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers",
                               "NoNewPrivileges", "SyslogLevelPrefix", "MemoryDenyWriteExecute",
                               "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
-                              "ProtectControlGroups")) {
+                              "ProtectKernelModules", "ProtectControlGroups")) {
                 int b;
 
                 r = sd_bus_message_read(message, "b", &b);
@@ -1111,6 +1112,8 @@ int bus_exec_context_set_transient_property(
                                 c->remove_ipc = b;
                         else if (streq(name, "ProtectKernelTunables"))
                                 c->protect_kernel_tunables = b;
+                        else if (streq(name, "ProtectKernelModules"))
+                                c->protect_kernel_modules = b;
                         else if (streq(name, "ProtectControlGroups"))
                                 c->protect_control_groups = b;
 
index 0c983f4953da183e11f7f5a9cf538c33bc26876d..7a278b7d3193792359f0b7622aa109897f4e7012 100644 (file)
@@ -1436,6 +1436,50 @@ finish:
         return r;
 }
 
+static int apply_protect_kernel_modules(Unit *u, const ExecContext *c) {
+        static const int module_syscalls[] = {
+                SCMP_SYS(delete_module),
+                SCMP_SYS(finit_module),
+                SCMP_SYS(init_module),
+        };
+
+        scmp_filter_ctx *seccomp;
+        unsigned i;
+        int r;
+
+        assert(c);
+
+        /* Turn off module syscalls on ProtectKernelModules=yes: each syscall listed above fails with EPERM */
+
+        if (skip_seccomp_unavailable(u, "ProtectKernelModules="))
+                return 0;
+
+        seccomp = seccomp_init(SCMP_ACT_ALLOW);
+        if (!seccomp)
+                return -ENOMEM;
+
+        r = seccomp_add_secondary_archs(seccomp);
+        if (r < 0)
+                goto finish;
+
+        for (i = 0; i < ELEMENTSOF(module_syscalls); i++) {
+                r = seccomp_rule_add(seccomp, SCMP_ACT_ERRNO(EPERM),
+                                     module_syscalls[i], 0);
+                if (r < 0)
+                        goto finish;
+        }
+
+        r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
+        if (r < 0)
+                goto finish;
+
+        r = seccomp_load(seccomp);
+
+finish:
+        seccomp_release(seccomp);
+        return r;
+}
+
 static int apply_private_devices(Unit *u, const ExecContext *c) {
         const SystemCallFilterSet *set;
         scmp_filter_ctx *seccomp;
@@ -2690,6 +2734,14 @@ static int exec_child(
                         }
                 }
 
+                if (context->protect_kernel_modules) {
+                        r = apply_protect_kernel_modules(unit, context);
+                        if (r < 0) {
+                                *exit_status = EXIT_SECCOMP;
+                                return r;
+                        }
+                }
+
                 if (context->private_devices) {
                         r = apply_private_devices(unit, context);
                         if (r < 0) {
index 449180c9037e956f00ab5cbd9551a6efb7ee564f..1de439c3ad49fcf34d0e77ca3dccf6e47e50d30b 100644 (file)
@@ -175,6 +175,7 @@ struct ExecContext {
         ProtectSystem protect_system;
         ProtectHome protect_home;
         bool protect_kernel_tunables;
+        bool protect_kernel_modules;
         bool protect_control_groups;
 
         bool no_new_privileges;
index c49c1d6732137b555807dd777c81dbbfa08fe24e..a700d853cc5bcf26450acfb8a2b551707b8210c5 100644 (file)
@@ -90,6 +90,7 @@ $1.InaccessiblePaths,            config_parse_namespace_path_strv,   0,
 $1.PrivateTmp,                   config_parse_bool,                  0,                             offsetof($1, exec_context.private_tmp)
 $1.PrivateDevices,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_devices)
 $1.ProtectKernelTunables,        config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_tunables)
+$1.ProtectKernelModules,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_kernel_modules)
 $1.ProtectControlGroups,         config_parse_bool,                  0,                             offsetof($1, exec_context.protect_control_groups)
 $1.PrivateNetwork,               config_parse_bool,                  0,                             offsetof($1, exec_context.private_network)
 $1.PrivateUsers,                 config_parse_bool,                  0,                             offsetof($1, exec_context.private_users)
index 690f7f7dd9ca0813f242609f52e2e9e9b0ec9411..71f95c0b96aa639c42fc33c6af65931411673e7e 100644 (file)
@@ -3401,6 +3401,9 @@ int unit_patch_contexts(Unit *u) {
                 if (ec->private_devices)
                         ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_MKNOD);
 
+                if (ec->protect_kernel_modules)
+                        ec->capability_bounding_set &= ~(UINT64_C(1) << CAP_SYS_MODULE);
+
                 if (ec->dynamic_user) {
                         if (!ec->user) {
                                 r = user_from_unit_name(u, &ec->user);
index a550a370b58c19ad2d7c16d37b2c8117f5c5d34b..f639e0e83288b337310ace480d06ed70fed77ac1 100644 (file)
@@ -204,7 +204,8 @@ int bus_append_unit_property_assignment(sd_bus_message *m, const char *assignmen
                               "IgnoreSIGPIPE", "TTYVHangup", "TTYReset", "RemainAfterExit",
                               "PrivateTmp", "PrivateDevices", "PrivateNetwork", "PrivateUsers", "NoNewPrivileges",
                               "SyslogLevelPrefix", "Delegate", "RemainAfterElapse", "MemoryDenyWriteExecute",
-                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables", "ProtectControlGroups")) {
+                              "RestrictRealtime", "DynamicUser", "RemoveIPC", "ProtectKernelTunables",
+                              "ProtectKernelModules", "ProtectControlGroups")) {
 
                 r = parse_boolean(eq);
                 if (r < 0)