]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
core: introduce ProtectSystem=strict
author: Lennart Poettering <lennart@poettering.net>
Thu, 25 Aug 2016 13:57:21 +0000 (15:57 +0200)
committer: Djalal Harouni <tixxdz@opendz.org>
Sun, 25 Sep 2016 08:42:18 +0000 (10:42 +0200)
Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a
new setting "strict". If set, the entire directory tree of the system is
mounted read-only, but the API file systems /proc, /dev, /sys are excluded
(they may be managed with PrivateDevices= and ProtectKernelTunables=). Also,
/home and /root are excluded as those are left for ProtectHome= to manage.

In this mode, all "real" file systems (i.e. non-API file systems) are mounted
read-only, and specific directories may only be excluded via
ReadWriteDirectories=, thus implementing an effective whitelist instead of
blacklist of writable directories.

While we are at it, also add /efi to the list of paths always affected by
ProtectSystem=. This is a follow-up for
b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for
/boot. Our namespacing logic should respect that too.

man/systemd.exec.xml
src/core/namespace.c
src/core/namespace.h

index 07128b489eaf191bb0108f4ce1628ed6a0feb563..1b672fe0c916cf7a38452f8536f54a1841144c42 100644 (file)
       <varlistentry>
         <term><varname>ProtectSystem=</varname></term>
 
-        <listitem><para>Takes a boolean argument or
-        <literal>full</literal>. If true, mounts the
-        <filename>/usr</filename> and <filename>/boot</filename>
-        directories read-only for processes invoked by this unit. If
-        set to <literal>full</literal>, the <filename>/etc</filename>
-        directory is mounted read-only, too. This setting ensures that
-        any modification of the vendor-supplied operating system (and
-        optionally its configuration) is prohibited for the service.
-        It is recommended to enable this setting for all long-running
-        services, unless they are involved with system updates or need
-        to modify the operating system in other ways. Note however
-        that processes retaining the CAP_SYS_ADMIN capability can undo
-        the effect of this setting. This setting is hence particularly
-        useful for daemons which have this capability removed, for
-        example with <varname>CapabilityBoundingSet=</varname>.
-        Defaults to off.</para></listitem>
+        <listitem><para>Takes a boolean argument or the special values <literal>full</literal> or
+        <literal>strict</literal>. If true, mounts the <filename>/usr</filename> and <filename>/boot</filename>
+        directories read-only for processes invoked by this unit. If set to <literal>full</literal>, the
+        <filename>/etc</filename> directory is mounted read-only, too. If set to <literal>strict</literal>, the entire
+        file system hierarchy is mounted read-only, except for the API file system subtrees <filename>/dev</filename>,
+        <filename>/proc</filename> and <filename>/sys</filename> (protect these directories using
+        <varname>PrivateDevices=</varname>, <varname>ProtectKernelTunables=</varname>,
+        <varname>ProtectControlGroups=</varname>). This setting ensures that any modification of the vendor-supplied
+        operating system (and optionally its configuration, and local mounts) is prohibited for the service.  It is
+        recommended to enable this setting for all long-running services, unless they are involved with system updates
+        or need to modify the operating system in other ways. If this option is used,
+        <varname>ReadWritePaths=</varname> may be used to exclude specific directories from being made read-only. Note
+        that processes retaining the <constant>CAP_SYS_ADMIN</constant> capability (and with no system call filter that
+        prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence
+        particularly useful for daemons which either have the <literal>@mount</literal> set filtered using
+        <varname>SystemCallFilter=</varname>, or have the <constant>CAP_SYS_ADMIN</constant> capability removed, for
+        example with <varname>CapabilityBoundingSet=</varname>.  Defaults to off.</para></listitem>
       </varlistentry>
 
       <varlistentry>
index e08d7459c5d6969cca595a71c15026b98a5abbeb..498cd139bf85bde5eece2e442979382e7a29f757 100644 (file)
@@ -472,9 +472,11 @@ int setup_namespace(
                 private_dev +
                 (protect_sysctl ? 3 : 0) +
                 (protect_cgroups != protect_sysctl) +
-                (protect_home != PROTECT_HOME_NO ? 3 : 0) +
-                (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) +
-                (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0);
+                (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) +
+                (protect_system == PROTECT_SYSTEM_STRICT ?
+                 (2 + !private_dev + !protect_sysctl) :
+                 ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) +
+                  (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0)));
 
         if (n > 0) {
                 m = mounts = (BindMount *) alloca0(n * sizeof(BindMount));
@@ -529,9 +531,13 @@ int setup_namespace(
                         m++;
                 }
 
-                if (protect_home != PROTECT_HOME_NO) {
+                if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) {
                         const char *home_dir, *run_user_dir, *root_dir;
 
+                        /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in
+                         * strict system protection mode, then also add entries for these directories, but mark them
+                         * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */
+
                         home_dir = prefix_roota(root_directory, "/home");
                         home_dir = strjoina("-", home_dir);
                         run_user_dir = prefix_roota(root_directory, "/run/user");
@@ -540,22 +546,53 @@ int setup_namespace(
                         root_dir = strjoina("-", root_dir);
 
                         r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir),
-                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE);
+                                protect_home == PROTECT_HOME_READ_ONLY ? READONLY :
+                                protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE);
                         if (r < 0)
                                 return r;
                 }
 
-                if (protect_system != PROTECT_SYSTEM_NO) {
-                        const char *usr_dir, *boot_dir, *etc_dir;
+                if (protect_system == PROTECT_SYSTEM_STRICT) {
+                        /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the
+                         * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables=
+                         * protect those, and these options should be fully orthogonal. (And of course /home and
+                         * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see
+                         * above). */
+
+                        m->path = prefix_roota(root_directory, "/");
+                        m->mode = READONLY;
+                        m++;
+
+                        m->path = prefix_roota(root_directory, "/proc");
+                        m->mode = READWRITE;
+                        m++;
+
+                        if (!private_dev) {
+                                m->path = prefix_roota(root_directory, "/dev");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+                        if (!protect_sysctl) {
+                                m->path = prefix_roota(root_directory, "/sys");
+                                m->mode = READWRITE;
+                                m++;
+                        }
+
+                } else if (protect_system != PROTECT_SYSTEM_NO) {
+                        const char *usr_dir, *boot_dir, *efi_dir, *etc_dir;
+
+                        /* In any other mode we simply mark the relevant three directories read-only. */
 
                         usr_dir = prefix_roota(root_directory, "/usr");
                         boot_dir = prefix_roota(root_directory, "/boot");
                         boot_dir = strjoina("-", boot_dir);
+                        efi_dir = prefix_roota(root_directory, "/efi");
+                        efi_dir = strjoina("-", efi_dir);
                         etc_dir = prefix_roota(root_directory, "/etc");
 
                         r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL
-                                ? STRV_MAKE(usr_dir, boot_dir, etc_dir)
-                                : STRV_MAKE(usr_dir, boot_dir), READONLY);
+                                          ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir)
+                                          : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY);
                         if (r < 0)
                                 return r;
                 }
@@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = {
         [PROTECT_SYSTEM_NO] = "no",
         [PROTECT_SYSTEM_YES] = "yes",
         [PROTECT_SYSTEM_FULL] = "full",
+        [PROTECT_SYSTEM_STRICT] = "strict",
 };
 
 DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem);
index 38453362870e076e5caebc5d494488259bb83fb6..6505bcc499eaee44901c3e26fa92f9449324f714 100644 (file)
@@ -35,6 +35,7 @@ typedef enum ProtectSystem {
         PROTECT_SYSTEM_NO,
         PROTECT_SYSTEM_YES,
         PROTECT_SYSTEM_FULL,
+        PROTECT_SYSTEM_STRICT,
         _PROTECT_SYSTEM_MAX,
         _PROTECT_SYSTEM_INVALID = -1
 } ProtectSystem;