From 3f815163ff8fdcdbd329680580df36f94e15325d Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 25 Aug 2016 15:57:21 +0200 Subject: [PATCH] core: introduce ProtectSystem=strict Let's tighten our sandbox a bit more: with this change ProtectSystem= gains a new setting "strict". If set, the entire directory tree of the system is mounted read-only, but the API file systems /proc, /dev, /sys are excluded (they may be managed with PrivateDevices= and ProtectKernelTunables=). Also, /home and /root are excluded as those are left for ProtectHome= to manage. In this mode, all "real" file systems (i.e. non-API file systems) are mounted read-only, and specific directories may only be excluded via ReadWriteDirectories=, thus implementing an effective whitelist instead of blacklist of writable directories. While we are at, also add /efi to the list of paths always affected by ProtectSystem=. This is a follow-up for b52a109ad38cd37b660ccd5394ff5c171a5e5355 which added /efi as alternative for /boot. Our namespacing logic should respect that too. --- man/systemd.exec.xml | 33 +++++++++++++------------- src/core/namespace.c | 56 +++++++++++++++++++++++++++++++++++++------- src/core/namespace.h | 1 + 3 files changed, 65 insertions(+), 25 deletions(-) diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 07128b489ea..1b672fe0c91 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -1020,22 +1020,23 @@ ProtectSystem= - Takes a boolean argument or - full. If true, mounts the - /usr and /boot - directories read-only for processes invoked by this unit. If - set to full, the /etc - directory is mounted read-only, too. This setting ensures that - any modification of the vendor-supplied operating system (and - optionally its configuration) is prohibited for the service. - It is recommended to enable this setting for all long-running - services, unless they are involved with system updates or need - to modify the operating system in other ways. Note however - that processes retaining the CAP_SYS_ADMIN capability can undo - the effect of this setting. This setting is hence particularly - useful for daemons which have this capability removed, for - example with CapabilityBoundingSet=. - Defaults to off. + Takes a boolean argument or the special values full or + strict. If true, mounts the /usr and /boot + directories read-only for processes invoked by this unit. If set to full, the + /etc directory is mounted read-only, too. If set to strict the entire + file system hierarchy is mounted read-only, except for the API file system subtrees /dev, + /proc and /sys (protect these directories using + PrivateDevices=, ProtectKernelTunables=, + ProtectControlGroups=). This setting ensures that any modification of the vendor-supplied + operating system (and optionally its configuration, and local mounts) is prohibited for the service. It is + recommended to enable this setting for all long-running services, unless they are involved with system updates + or need to modify the operating system in other ways. If this option is used, + ReadWritePaths= may be used to exclude specific directories from being made read-only. Note + that processes retaining the CAP_SYS_ADMIN capability (and with no system call filter that + prohibits mount-related system calls applied) can undo the effect of this setting. This setting is hence + particularly useful for daemons which have this either the @mount set filtered using + SystemCallFilter=, or have the CAP_SYS_ADMIN capability removed, for + example with CapabilityBoundingSet=. Defaults to off. diff --git a/src/core/namespace.c b/src/core/namespace.c index e08d7459c5d..498cd139bf8 100644 --- a/src/core/namespace.c +++ b/src/core/namespace.c @@ -472,9 +472,11 @@ int setup_namespace( private_dev + (protect_sysctl ? 3 : 0) + (protect_cgroups != protect_sysctl) + - (protect_home != PROTECT_HOME_NO ? 3 : 0) + - (protect_system != PROTECT_SYSTEM_NO ? 2 : 0) + - (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0); + (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT ? 3 : 0) + + (protect_system == PROTECT_SYSTEM_STRICT ? + (2 + !private_dev + !protect_sysctl) : + ((protect_system != PROTECT_SYSTEM_NO ? 3 : 0) + + (protect_system == PROTECT_SYSTEM_FULL ? 1 : 0))); if (n > 0) { m = mounts = (BindMount *) alloca0(n * sizeof(BindMount)); @@ -529,9 +531,13 @@ int setup_namespace( m++; } - if (protect_home != PROTECT_HOME_NO) { + if (protect_home != PROTECT_HOME_NO || protect_system == PROTECT_SYSTEM_STRICT) { const char *home_dir, *run_user_dir, *root_dir; + /* If protection of $HOME and $XDG_RUNTIME_DIR is requested, then go for it. If we are in + * strict system protection mode, then also add entries for these directories, but mark them + * writable. This is because we want ProtectHome= and ProtectSystem= to be fully orthogonal. */ + home_dir = prefix_roota(root_directory, "/home"); home_dir = strjoina("-", home_dir); run_user_dir = prefix_roota(root_directory, "/run/user"); @@ -540,22 +546,53 @@ int setup_namespace( root_dir = strjoina("-", root_dir); r = append_mounts(&m, STRV_MAKE(home_dir, run_user_dir, root_dir), - protect_home == PROTECT_HOME_READ_ONLY ? READONLY : INACCESSIBLE); + protect_home == PROTECT_HOME_READ_ONLY ? READONLY : + protect_home == PROTECT_HOME_YES ? INACCESSIBLE : READWRITE); if (r < 0) return r; } - if (protect_system != PROTECT_SYSTEM_NO) { - const char *usr_dir, *boot_dir, *etc_dir; + if (protect_system == PROTECT_SYSTEM_STRICT) { + /* In strict mode, we mount everything read-only, except for /proc, /dev, /sys which are the + * kernel API VFS, which are left writable, but PrivateDevices= + ProtectKernelTunables= + * protect those, and these options should be fully orthogonal. (And of course /home and + * friends are also left writable, as ProtectHome= shall manage those, orthogonally, see + * above). */ + + m->path = prefix_roota(root_directory, "/"); + m->mode = READONLY; + m++; + + m->path = prefix_roota(root_directory, "/proc"); + m->mode = READWRITE; + m++; + + if (!private_dev) { + m->path = prefix_roota(root_directory, "/dev"); + m->mode = READWRITE; + m++; + } + if (!protect_sysctl) { + m->path = prefix_roota(root_directory, "/sys"); + m->mode = READWRITE; + m++; + } + + } else if (protect_system != PROTECT_SYSTEM_NO) { + const char *usr_dir, *boot_dir, *efi_dir, *etc_dir; + + /* In any other mode we simply mark the relevant three directories ready-only. */ usr_dir = prefix_roota(root_directory, "/usr"); boot_dir = prefix_roota(root_directory, "/boot"); boot_dir = strjoina("-", boot_dir); + efi_dir = prefix_roota(root_directory, "/efi"); + efi_dir = strjoina("-", efi_dir); etc_dir = prefix_roota(root_directory, "/etc"); r = append_mounts(&m, protect_system == PROTECT_SYSTEM_FULL - ? STRV_MAKE(usr_dir, boot_dir, etc_dir) - : STRV_MAKE(usr_dir, boot_dir), READONLY); + ? STRV_MAKE(usr_dir, boot_dir, efi_dir, etc_dir) + : STRV_MAKE(usr_dir, boot_dir, efi_dir), READONLY); if (r < 0) return r; } @@ -780,6 +817,7 @@ static const char *const protect_system_table[_PROTECT_SYSTEM_MAX] = { [PROTECT_SYSTEM_NO] = "no", [PROTECT_SYSTEM_YES] = "yes", [PROTECT_SYSTEM_FULL] = "full", + [PROTECT_SYSTEM_STRICT] = "strict", }; DEFINE_STRING_TABLE_LOOKUP(protect_system, ProtectSystem); diff --git a/src/core/namespace.h b/src/core/namespace.h index 38453362870..6505bcc499e 100644 --- a/src/core/namespace.h +++ b/src/core/namespace.h @@ -35,6 +35,7 @@ typedef enum ProtectSystem { PROTECT_SYSTEM_NO, PROTECT_SYSTEM_YES, PROTECT_SYSTEM_FULL, + PROTECT_SYSTEM_STRICT, _PROTECT_SYSTEM_MAX, _PROTECT_SYSTEM_INVALID = -1 } ProtectSystem; -- 2.39.2