]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
pid1: add ProtectSystem= as system-wide configuration, and default it to true in...
author Lennart Poettering <lennart@poettering.net>
Wed, 29 Nov 2023 17:52:28 +0000 (18:52 +0100)
committer Lennart Poettering <lennart@poettering.net>
Wed, 6 Dec 2023 21:10:20 +0000 (22:10 +0100)
This adds a new ProtectSystem= setting that mirrors the option of the
same name for services, but in a more restrictive way. If enabled, it
will remount /usr/ read-only, very early at boot. It takes a special
value "auto" (which is the default), which is equivalent to true in the
initrd, and false otherwise.

Unlike the per-service option we don't support full/strict modes, but
the door is open to eventually support that too if it makes sense. It's
not entirely trivial though as we have very little mounted this early,
and hence the mechanism might not apply 1:1. Hence this PR is a
conservative first step.

My primary goal with this is to lock down initrds a bit, since they
conceptually are mostly immutable, but they are unpacked into a mutable
tmpfs. Let's tighten the screws a bit on that, and at least make /usr/
immutable.

This is particularly nice on USIs (i.e. Unified System Images, that pack
a whole OS into a UKI without transitioning out of it), such as
diskomator.

man/systemd-system.conf.xml
src/core/main.c
src/core/system.conf.in

index 3c06b65f9350e7d5146335b12cffc0b283629512..0546283b285a69ccffa3812c671664ad363543e1 100644 (file)
         <xi:include href="version-info.xml" xpointer="v239"/></listitem>
       </varlistentry>
 
+      <varlistentry>
+        <term><varname>ProtectSystem=</varname></term>
+
+        <listitem><para>Takes a boolean argument or the string <literal>auto</literal>. If set to true this
+        will remount <filename>/usr/</filename> read-only. If set to <literal>auto</literal> (the default),
+        this is equivalent to true when running in an initrd, and to false otherwise. This implements a
+        restricted subset of the per-unit setting of the same name, see
+        <citerefentry><refentrytitle>systemd.exec</refentrytitle><manvolnum>5</manvolnum></citerefentry> for
+        details: currently, the <literal>full</literal> or <literal>strict</literal> values are not
+        supported.</para>
+
+        <xi:include href="version-info.xml" xpointer="v256"/></listitem>
+      </varlistentry>
+
       <varlistentry>
         <term><varname>SystemCallArchitectures=</varname></term>
 
index 2ac59dabf5a23bd9490b5e48a2843ed9e23d25a1..dc166452a008b282c550a4928664187b247f43ba 100644 (file)
@@ -68,6 +68,7 @@
 #include "manager-serialize.h"
 #include "mkdir-label.h"
 #include "mount-setup.h"
+#include "mount-util.h"
 #include "os-util.h"
 #include "pager.h"
 #include "parse-argument.h"
@@ -140,6 +141,7 @@ static char **arg_default_environment;
 static char **arg_manager_environment;
 static uint64_t arg_capability_bounding_set;
 static bool arg_no_new_privs;
+static int arg_protect_system;
 static nsec_t arg_timer_slack_nsec;
 static Set* arg_syscall_archs;
 static FILE* arg_serialization;
@@ -610,6 +612,43 @@ static int config_parse_oom_score_adjust(
         return 0;
 }
 
+static int config_parse_protect_system_pid1(
+                const char *unit,
+                const char *filename,
+                unsigned line,
+                const char *section,
+                unsigned section_line,
+                const char *lvalue,
+                int ltype,
+                const char *rvalue,
+                void *data,
+                void *userdata) {
+
+        int *v = ASSERT_PTR(data), r;
+
+        /* Parses the manager-wide ProtectSystem= setting into the int that 'data' points to: an empty
+         * value or "auto" stores -1, a boolean stores the parse_boolean() result, and anything else is
+         * logged and ignored. This is modelled after the per-service ProtectSystem= setting, but is more
+         * restricted on the one hand (only yes/no, not "strict" or "full") and more automatic on the other
+         * (it is enabled automatically for the initrd unless configured otherwise). We might extend this
+         * later to match the per-service setting more closely, but this is not trivial, due to ordering
+         * constraints: besides /usr/ we don't really have much mounted at the moment we enable this logic. */
+
+        if (isempty(rvalue) || streq(rvalue, "auto")) {
+                *v = -1;
+                return 0;
+        }
+
+        r = parse_boolean(rvalue);
+        if (r < 0) {
+                log_syntax(unit, LOG_WARNING, filename, line, r, "Failed to parse ProtectSystem= argument '%s', ignoring: %m", rvalue);
+                return 0;
+        }
+
+        *v = r;
+        return 0;
+}
+
 static int parse_config_file(void) {
         const ConfigTableItem items[] = {
                 { "Manager", "LogLevel",                     config_parse_level2,                0,                        NULL                              },
@@ -637,6 +676,7 @@ static int parse_config_file(void) {
                 { "Manager", "RuntimeWatchdogPreGovernor",   config_parse_string,                CONFIG_PARSE_STRING_SAFE, &arg_watchdog_pretimeout_governor },
                 { "Manager", "CapabilityBoundingSet",        config_parse_capability_set,        0,                        &arg_capability_bounding_set      },
                 { "Manager", "NoNewPrivileges",              config_parse_bool,                  0,                        &arg_no_new_privs                 },
+                { "Manager", "ProtectSystem",                config_parse_protect_system_pid1,   0,                        &arg_protect_system               },
 #if HAVE_SECCOMP
                 { "Manager", "SystemCallArchitectures",      config_parse_syscall_archs,         0,                        &arg_syscall_archs                },
 #else
@@ -1684,6 +1724,35 @@ static void initialize_core_pattern(bool skip_setup) {
                                   arg_early_core_pattern);
 }
 
+static void apply_protect_system(bool skip_setup) {
+        int r;
+        /* Best-effort: remounts /usr/ read-only if ProtectSystem= is true, or is "auto" (< 0) in an initrd. */
+        if (skip_setup || getpid_cached() != 1 || arg_protect_system == 0)
+                return;
+
+        if (arg_protect_system < 0 && !in_initrd()) {
+                log_debug("ProtectSystem=auto selected, but not running in an initrd, skipping.");
+                return;
+        }
+
+        r = make_mount_point("/usr");
+        if (r < 0) {
+                log_warning_errno(r, "Failed to make /usr/ a mount point, ignoring: %m");
+                return;
+        }
+
+        if (mount_nofollow_verbose(
+                        LOG_WARNING,
+                        /* what= */ NULL,
+                        "/usr",
+                        /* fstype= */ NULL,
+                        MS_BIND|MS_REMOUNT|MS_RDONLY, /* bind-remount: sets read-only on this mount only */
+                        /* options= */ NULL) < 0)
+                return;
+
+        log_info("Successfully made /usr/ read-only.");
+}
+
 static void update_cpu_affinity(bool skip_setup) {
         _cleanup_free_ char *mask = NULL;
 
@@ -2531,6 +2600,7 @@ static void reset_arguments(void) {
 
         arg_capability_bounding_set = CAP_MASK_UNSET;
         arg_no_new_privs = false;
+        arg_protect_system = -1;
         arg_timer_slack_nsec = NSEC_INFINITY;
 
         arg_syscall_archs = set_free(arg_syscall_archs);
@@ -3040,9 +3110,12 @@ int main(int argc, char *argv[]) {
                         cmdline_take_random_seed();
                 }
 
-                /* A core pattern might have been specified via the cmdline.  */
+                /* A core pattern might have been specified via the cmdline. */
                 initialize_core_pattern(skip_setup);
 
+                /* Make /usr/ read-only */
+                apply_protect_system(skip_setup);
+
                 /* Close logging fds, in order not to confuse collecting passed fds and terminal logic below */
                 log_close();
 
index 05eb6812700640605553482cbb50a6a8a4aee2d3..9b89a6aa77d7ce030a5614cf3c4a6853c263dd67 100644 (file)
@@ -39,6 +39,7 @@
 #WatchdogDevice=
 #CapabilityBoundingSet=
 #NoNewPrivileges=no
+#ProtectSystem=auto
 #SystemCallArchitectures=
 #TimerSlackNSec=
 #StatusUnitFormat={{STATUS_UNIT_FORMAT_DEFAULT_STR}}