git.ipfire.org Git - thirdparty/systemd.git/commitdiff
execute: use prctl(PR_SET_MDWE) for MemoryDenyWriteExecute=yes
author Topi Miettinen <toiwoton@gmail.com>
Sun, 6 Nov 2022 19:12:45 +0000 (21:12 +0200)
committer Topi Miettinen <topimiettinen@users.noreply.github.com>
Mon, 13 Mar 2023 18:44:36 +0000 (18:44 +0000)
On some ARM platforms, the dynamic linker could use the PROT_BTI memory protection
flag with `mprotect(..., PROT_BTI | PROT_EXEC)` to enable additional memory
protection for executable pages. But `MemoryDenyWriteExecute=yes` blocks this
with a seccomp filter denying all `mprotect(..., x | PROT_EXEC)`.

The newly preferred method is to use prctl(PR_SET_MDWE) on supported kernels. Then
the in-kernel implementation can allow PROT_BTI as necessary, without weakening
MDWE. The in-kernel version may also be extended to more sophisticated protections
in the future.

man/systemd.exec.xml
src/basic/missing_prctl.h
src/core/execute.c

index 795e2f0671cc091844c98078bd6e917e6d3ac444..653aa0d7921ce240d19976da8d594131e8ac225c 100644 (file)
@@ -2080,9 +2080,11 @@ RestrictNamespaces=~cgroup net</programlisting>
 
         <listitem><para>Takes a boolean argument. If set, attempts to create memory mappings that are writable and
         executable at the same time, or to change existing memory mappings to become executable, or mapping shared
-        memory segments as executable, are prohibited. Specifically, a system call filter is added that rejects
-        <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry> system calls with both
-        <constant>PROT_EXEC</constant> and <constant>PROT_WRITE</constant> set,
+        memory segments as executable, are prohibited. Specifically, a system call filter is added (or
+        preferably, an equivalent kernel check is enabled with
+        <citerefentry><refentrytitle>prctl</refentrytitle><manvolnum>2</manvolnum></citerefentry>) that
+        rejects <citerefentry><refentrytitle>mmap</refentrytitle><manvolnum>2</manvolnum></citerefentry>
+        system calls with both <constant>PROT_EXEC</constant> and <constant>PROT_WRITE</constant> set,
         <citerefentry><refentrytitle>mprotect</refentrytitle><manvolnum>2</manvolnum></citerefentry> or
         <citerefentry><refentrytitle>pkey_mprotect</refentrytitle><manvolnum>2</manvolnum></citerefentry> system calls
         with <constant>PROT_EXEC</constant> set and
index ab851306bac60710747183febc867e59656f3a1a..016085bb02d33f01fd124738c9dd5b343c65cc3d 100644 (file)
 #define PR_CAP_AMBIENT_LOWER     3
 #define PR_CAP_AMBIENT_CLEAR_ALL 4
 #endif
+
+/* Fallback definitions for older headers; kernel support since 6.3 (b507808ebce23561d4ff8c2aa1fb949fe402bc61) */
+#ifndef PR_SET_MDWE
+#define PR_SET_MDWE 65
+#endif
+#ifndef PR_MDWE_REFUSE_EXEC_GAIN
+#define PR_MDWE_REFUSE_EXEC_GAIN 1
+#endif
index 3800c7a38bf8951f185f81886f4465a83bb4eb6f..857b0b0070b61cdcc08ec0f5e71455122caf5997 100644 (file)
@@ -73,6 +73,7 @@
 #include "memory-util.h"
 #include "missing_fs.h"
 #include "missing_ioprio.h"
+#include "missing_prctl.h"
 #include "mkdir-label.h"
 #include "mount-util.h"
 #include "mountpoint-util.h"
@@ -1571,12 +1572,25 @@ static int apply_address_families(const Unit* u, const ExecContext *c) {
 }
 
 static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) {
+        int r;
+
         assert(u);
         assert(c);
 
         if (!c->memory_deny_write_execute)
                 return 0;
 
+        /* use prctl() if kernel supports it (6.3) */
+        r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0);
+        if (r == 0) {
+                log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE");
+                return 0;
+        }
+        if (r < 0 && errno != EINVAL)
+                return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m");
+        /* else use seccomp */
+        log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp");
+
         if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute="))
                 return 0;