]> git.ipfire.org Git - thirdparty/systemd.git/commitdiff
seccomp: don't install filters for archs that can't use syscalls
authorGreg Depoire--Ferrer <greg.depoire@gmail.com>
Wed, 28 Oct 2020 23:51:30 +0000 (00:51 +0100)
committerLennart Poettering <lennart@poettering.net>
Thu, 10 Dec 2020 15:13:02 +0000 (16:13 +0100)
When seccomp_restrict_archs is called, architectures that are blocked
are replaced by the SECCOMP_LOCAL_ARCH_BLOCKED marker so that they are
not disabled again and filters are not installed for them.

This can make some service that use SystemCallArchitecture= and
SystemCallFilter= start faster.

TODO
src/shared/seccomp-util.c
src/shared/seccomp-util.h

diff --git a/TODO b/TODO
index 4affa324dcba91219febc6b2b8d87497e1c143f7..c4f000e20faa68dbca42b3067a8cce26507ab419 100644 (file)
--- a/TODO
+++ b/TODO
@@ -135,10 +135,6 @@ Features:
   o move into separate libsystemd-shared-iptables.so .so
     - iptables-libs (only used by nspawn + networkd)
 
-* seccomp: when SystemCallArchitectures=native is set then don't install any
-  other seccomp filters for any of the other archs, in order to reduce the
-  number of seccomp filters we install needlessly.
-
 * seccomp: maybe use seccomp_merge() to merge our filters per-arch if we can.
   Apparently kernel performance is much better with fewer larger seccomp
   filters than with more smaller seccomp filters.
index ccae9d4f143bb409dbd8af580212974e48b8ee1e..b5f9ad9adc2c67192e3eff8a104c5a85e655d572 100644 (file)
@@ -23,7 +23,8 @@
 #include "string-util.h"
 #include "strv.h"
 
-const uint32_t seccomp_local_archs[] = {
+/* This array will be modified at runtime as seccomp_restrict_archs is called. */
+uint32_t seccomp_local_archs[] = {
 
         /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
 
@@ -94,7 +95,7 @@ const uint32_t seccomp_local_archs[] = {
 #elif defined(__s390__)
                 SCMP_ARCH_S390,
 #endif
-                (uint32_t) -1
+                SECCOMP_LOCAL_ARCH_END
         };
 
 const char* seccomp_arch_to_string(uint32_t c) {
@@ -1758,8 +1759,8 @@ int seccomp_memory_deny_write_execute(void) {
 
 int seccomp_restrict_archs(Set *archs) {
         _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
-        void *id;
         int r;
+        bool blocked_new = false;
 
         /* This installs a filter with no rules, but that restricts the system call architectures to the specified
          * list.
@@ -1775,25 +1776,37 @@ int seccomp_restrict_archs(Set *archs) {
         if (!seccomp)
                 return -ENOMEM;
 
-        SET_FOREACH(id, archs) {
-                r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
-                if (r < 0 && r != -EEXIST)
-                        return r;
-        }
+        for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
+                uint32_t arch = seccomp_local_archs[i];
 
-        /* The vdso for x32 assumes that x86-64 syscalls are available.  Let's allow them, since x32
-         * x32 syscalls should basically match x86-64 for everything except the pointer type.
-         * The important thing is that you can block the old 32-bit x86 syscalls.
-         * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
+                /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
+                if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
+                        continue;
 
-        if (seccomp_arch_native() == SCMP_ARCH_X32 ||
-            set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
+                bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
 
-                r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
-                if (r < 0 && r != -EEXIST)
-                        return r;
+                /* The vdso for x32 assumes that x86-64 syscalls are available.  Let's allow them, since x32
+                 * x32 syscalls should basically match x86-64 for everything except the pointer type.
+                 * The important thing is that you can block the old 32-bit x86 syscalls.
+                 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
+                if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
+                        block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
+
+                if (block) {
+                        seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
+                        blocked_new = true;
+                } else {
+                        r = seccomp_arch_add(seccomp, arch);
+                        if (r < 0 && r != -EEXIST)
+                                return r;
+                }
         }
 
+        /* All architectures that will be blocked by the seccomp program were
+         * already blocked. */
+        if (!blocked_new)
+                return 0;
+
         r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
         if (r < 0)
                 return r;
index 610597127e4e4810b0d7a2fcf46d271378e2e982..53b74bdc3424f585b961ef2e320faf4cf73dc74a 100644 (file)
@@ -100,12 +100,20 @@ int seccomp_lock_personality(unsigned long personality);
 int seccomp_protect_hostname(void);
 int seccomp_restrict_suid_sgid(void);
 
-extern const uint32_t seccomp_local_archs[];
+extern uint32_t seccomp_local_archs[];
+
+#define SECCOMP_LOCAL_ARCH_END UINT32_MAX
+
+/* Note: 0 is safe to use here because although SCMP_ARCH_NATIVE is 0, it would
+ * never be in the seccomp_local_archs array anyway so we can use it as a
+ * marker. */
+#define SECCOMP_LOCAL_ARCH_BLOCKED 0
 
 #define SECCOMP_FOREACH_LOCAL_ARCH(arch) \
         for (unsigned _i = ({ (arch) = seccomp_local_archs[0]; 0; });   \
-             seccomp_local_archs[_i] != (uint32_t) -1;                  \
-             (arch) = seccomp_local_archs[++_i])
+             (arch) != SECCOMP_LOCAL_ARCH_END;                          \
+             (arch) = seccomp_local_archs[++_i])                        \
+                if ((arch) != SECCOMP_LOCAL_ARCH_BLOCKED)
 
 /* EACCES: does not have the CAP_SYS_ADMIN or no_new_privs == 1
  * ENOMEM: out of memory, failed to allocate space for a libseccomp structure, or would exceed a defined constant