From: Topi Miettinen Date: Sun, 6 Nov 2022 19:12:45 +0000 (+0200) Subject: execute: use prctl(PR_SET_MDWE) for MemoryDenyWriteExecute=yes X-Git-Tag: v254-rc1~1045 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=7a114ed4b39e9670f6a511f3eecb6fd58274d27b;p=thirdparty%2Fsystemd.git execute: use prctl(PR_SET_MDWE) for MemoryDenyWriteExecute=yes On some ARM platforms, the dynamic linker could use the PROT_BTI memory protection flag with `mprotect(..., PROT_BTI | PROT_EXEC)` to enable additional memory protection for executable pages. But `MemoryDenyWriteExecute=yes` blocks this with a seccomp filter denying all `mprotect(..., x | PROT_EXEC)`. The newly preferred method is to use prctl(PR_SET_MDWE) on supported kernels. Then the in-kernel implementation can allow PROT_BTI as necessary, without weakening MDWE. The in-kernel version may also be extended to more sophisticated protections in the future. --- diff --git a/man/systemd.exec.xml b/man/systemd.exec.xml index 795e2f0671c..653aa0d7921 100644 --- a/man/systemd.exec.xml +++ b/man/systemd.exec.xml @@ -2080,9 +2080,11 @@ RestrictNamespaces=~cgroup net Takes a boolean argument. If set, attempts to create memory mappings that are writable and executable at the same time, or to change existing memory mappings to become executable, or mapping shared - memory segments as executable, are prohibited. Specifically, a system call filter is added that rejects - mmap(2) system calls with both - PROT_EXEC and PROT_WRITE set, + memory segments as executable, are prohibited. 
Specifically, a system call filter is added (or + preferably, an equivalent kernel check is enabled with + prctl(2)) that + rejects mmap(2) + system calls with both PROT_EXEC and PROT_WRITE set, mprotect(2) or pkey_mprotect(2) system calls with PROT_EXEC set and diff --git a/src/basic/missing_prctl.h b/src/basic/missing_prctl.h index ab851306bac..016085bb02d 100644 --- a/src/basic/missing_prctl.h +++ b/src/basic/missing_prctl.h @@ -12,3 +12,11 @@ #define PR_CAP_AMBIENT_LOWER 3 #define PR_CAP_AMBIENT_CLEAR_ALL 4 #endif + +/* b507808ebce23561d4ff8c2aa1fb949fe402bc61 (6.3) */ +#ifndef PR_SET_MDWE +#define PR_SET_MDWE 65 +#endif +#ifndef PR_MDWE_REFUSE_EXEC_GAIN +#define PR_MDWE_REFUSE_EXEC_GAIN 1 +#endif diff --git a/src/core/execute.c b/src/core/execute.c index 3800c7a38bf..857b0b0070b 100644 --- a/src/core/execute.c +++ b/src/core/execute.c @@ -73,6 +73,7 @@ #include "memory-util.h" #include "missing_fs.h" #include "missing_ioprio.h" +#include "missing_prctl.h" #include "mkdir-label.h" #include "mount-util.h" #include "mountpoint-util.h" @@ -1571,12 +1572,25 @@ static int apply_address_families(const Unit* u, const ExecContext *c) { } static int apply_memory_deny_write_execute(const Unit* u, const ExecContext *c) { + int r; + assert(u); assert(c); if (!c->memory_deny_write_execute) return 0; + /* use prctl() if kernel supports it (6.3) */ + r = prctl(PR_SET_MDWE, PR_MDWE_REFUSE_EXEC_GAIN, 0, 0, 0); + if (r == 0) { + log_unit_debug(u, "Enabled MemoryDenyWriteExecute= with PR_SET_MDWE"); + return 0; + } + if (r < 0 && errno != EINVAL) + return log_unit_debug_errno(u, errno, "Failed to enable MemoryDenyWriteExecute= with PR_SET_MDWE: %m"); + /* else use seccomp */ + log_unit_debug(u, "Kernel doesn't support PR_SET_MDWE: falling back to seccomp"); + if (skip_seccomp_unavailable(u, "MemoryDenyWriteExecute=")) return 0;