]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/shared/seccomp-util.c
nulstr-util: Declare NULSTR_FOREACH() iterator inline
[thirdparty/systemd.git] / src / shared / seccomp-util.c
index ff90af538b390a144b44f48233de98164c5fa97c..8ece7f163f1be53370957aff2e2cfc61bce90814 100644 (file)
@@ -3,18 +3,22 @@
 #include <errno.h>
 #include <fcntl.h>
 #include <linux/seccomp.h>
-#include <seccomp.h>
 #include <stddef.h>
 #include <sys/mman.h>
 #include <sys/prctl.h>
 #include <sys/shm.h>
 #include <sys/stat.h>
 
+/* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
+#include "missing_syscall_def.h"
+#include <seccomp.h>
+
 #include "af-list.h"
 #include "alloc-util.h"
 #include "env-util.h"
 #include "errno-list.h"
 #include "macro.h"
+#include "namespace-util.h"
 #include "nsflags.h"
 #include "nulstr-util.h"
 #include "process-util.h"
@@ -77,6 +81,11 @@ uint32_t seccomp_local_archs[] = {
                 SCMP_ARCH_MIPSEL64,
                 SCMP_ARCH_MIPS64N32,
                 SCMP_ARCH_MIPSEL64N32, /* native */
+#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
+                SCMP_ARCH_PARISC,
+                SCMP_ARCH_PARISC64,    /* native */
+#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
+                SCMP_ARCH_PARISC,
 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                 SCMP_ARCH_PPC,
                 SCMP_ARCH_PPC64LE,
@@ -104,7 +113,7 @@ const char* seccomp_arch_to_string(uint32_t c) {
          * Names used here should be the same as those used for ConditionArchitecture=,
          * except for "subarchitectures" like x32. */
 
-        switch(c) {
+        switch (c) {
         case SCMP_ARCH_NATIVE:
                 return "native";
         case SCMP_ARCH_X86:
@@ -129,6 +138,14 @@ const char* seccomp_arch_to_string(uint32_t c) {
                 return "mips64-le";
         case SCMP_ARCH_MIPSEL64N32:
                 return "mips64-le-n32";
+#ifdef SCMP_ARCH_PARISC
+        case SCMP_ARCH_PARISC:
+                return "parisc";
+#endif
+#ifdef SCMP_ARCH_PARISC64
+        case SCMP_ARCH_PARISC64:
+                return "parisc64";
+#endif
         case SCMP_ARCH_PPC:
                 return "ppc";
         case SCMP_ARCH_PPC64:
@@ -178,6 +195,14 @@ int seccomp_arch_from_string(const char *n, uint32_t *ret) {
                 *ret = SCMP_ARCH_MIPSEL64;
         else if (streq(n, "mips64-le-n32"))
                 *ret = SCMP_ARCH_MIPSEL64N32;
+#ifdef SCMP_ARCH_PARISC
+        else if (streq(n, "parisc"))
+                *ret = SCMP_ARCH_PARISC;
+#endif
+#ifdef SCMP_ARCH_PARISC64
+        else if (streq(n, "parisc64"))
+                *ret = SCMP_ARCH_PARISC64;
+#endif
         else if (streq(n, "ppc"))
                 *ret = SCMP_ARCH_PPC;
         else if (streq(n, "ppc64"))
@@ -283,6 +308,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .name = "@default",
                 .help = "System calls that are always permitted",
                 .value =
+                "arch_prctl\0"      /* Used during platform-specific initialization by ld-linux.so. */
                 "brk\0"
                 "cacheflush\0"
                 "clock_getres\0"
@@ -324,11 +350,13 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "membarrier\0"
                 "mmap\0"
                 "mmap2\0"
+                "mprotect\0"
                 "munmap\0"
                 "nanosleep\0"
                 "pause\0"
                 "prlimit64\0"
                 "restart_syscall\0"
+                "riscv_flush_icache\0"
                 "rseq\0"
                 "rt_sigreturn\0"
                 "sched_getaffinity\0"
@@ -420,9 +448,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "pidfd_getfd\0"
                 "ptrace\0"
                 "rtas\0"
-#if defined __s390__ || defined __s390x__
                 "s390_runtime_instr\0"
-#endif
                 "sys_debug_setcontext\0"
         },
         [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
@@ -711,9 +737,11 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .name = "@process",
                 .help = "Process control, execution, namespacing operations",
                 .value =
-                "arch_prctl\0"
                 "capget\0"      /* Able to query arbitrary processes */
                 "clone\0"
+                /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
+                 * implement seccomp, so we don't need to list it at all. C.f.
+                 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
                 "clone3\0"
                 "execveat\0"
                 "fork\0"
@@ -744,10 +772,8 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "pciconfig_iobase\0"
                 "pciconfig_read\0"
                 "pciconfig_write\0"
-#if defined __s390__ || defined __s390x__
                 "s390_pci_mmio_read\0"
                 "s390_pci_mmio_write\0"
-#endif
         },
         [SYSCALL_FILTER_SET_REBOOT] = {
                 .name = "@reboot",
@@ -864,7 +890,6 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "ioprio_get\0"
                 "kcmp\0"
                 "madvise\0"
-                "mprotect\0"
                 "mremap\0"
                 "name_to_handle_at\0"
                 "oldolduname\0"
@@ -922,6 +947,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .name = "@known",
                 .help = "All known syscalls declared in the kernel",
                 .value =
+                "@obsolete\0"
 #include "syscall-list.h"
         },
 };
@@ -1013,7 +1039,6 @@ static int add_syscall_filter_set(
                 bool log_missing,
                 char ***added) {
 
-        const char *sys;
         int r;
 
         /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
@@ -1117,7 +1142,7 @@ int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter
                 if (ERRNO_IS_SECCOMP_FATAL(r))
                         return r;
                 if (r < 0)
-                        log_debug_errno(r, "Failed to install systemc call filter for architecture %s, skipping: %m",
+                        log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
                                         seccomp_arch_to_string(arch));
         }
 
@@ -1143,7 +1168,6 @@ int seccomp_parse_syscall_filter(
 
         if (name[0] == '@') {
                 const SyscallFilterSet *set;
-                const char *i;
 
                 set = syscall_filter_set_find(name);
                 if (!set) {
@@ -1224,6 +1248,21 @@ int seccomp_restrict_namespaces(unsigned long retain) {
                 if (r < 0)
                         return r;
 
+                /* We cannot filter on individual flags to clone3(), and we need to disable the
+                 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
+                 * users shall fall back to clone(), as if on an older kernel.
+                 *
+                 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
+                 * https://github.com/moby/moby/issues/42680. */
+
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(ENOSYS),
+                                SCMP_SYS(clone3),
+                                0);
+                if (r < 0)
+                        log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
+
                 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
                         /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
                          * altogether. */
@@ -1246,16 +1285,16 @@ int seccomp_restrict_namespaces(unsigned long retain) {
                         continue;
                 }
 
-                for (unsigned i = 0; namespace_flag_map[i].name; i++) {
+                for (unsigned i = 0; namespace_info[i].proc_name; i++) {
                         unsigned long f;
 
-                        f = namespace_flag_map[i].flag;
+                        f = namespace_info[i].clone_flag;
                         if (FLAGS_SET(retain, f)) {
-                                log_debug("Permitting %s.", namespace_flag_map[i].name);
+                                log_debug("Permitting %s.", namespace_info[i].proc_name);
                                 continue;
                         }
 
-                        log_debug("Blocking %s.", namespace_flag_map[i].name);
+                        log_debug("Blocking %s.", namespace_info[i].proc_name);
 
                         r = seccomp_rule_add_exact(
                                         seccomp,
@@ -1421,6 +1460,12 @@ int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
                 case SCMP_ARCH_X86:
                 case SCMP_ARCH_MIPSEL:
                 case SCMP_ARCH_MIPS:
+#ifdef SCMP_ARCH_PARISC
+                case SCMP_ARCH_PARISC:
+#endif
+#ifdef SCMP_ARCH_PARISC64
+                case SCMP_ARCH_PARISC64:
+#endif
                 case SCMP_ARCH_PPC:
                 case SCMP_ARCH_PPC64:
                 case SCMP_ARCH_PPC64LE:
@@ -1553,7 +1598,7 @@ int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
         return 0;
 }
 
-int seccomp_restrict_realtime(void) {
+int seccomp_restrict_realtime_full(int error_code) {
         static const int permitted_policies[] = {
                 SCHED_OTHER,
                 SCHED_BATCH,
@@ -1564,6 +1609,8 @@ int seccomp_restrict_realtime(void) {
         uint32_t arch;
         unsigned i;
 
+        assert(error_code > 0);
+
         /* Determine the highest policy constant we want to allow */
         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
                 if (permitted_policies[i] > max_policy)
@@ -1597,7 +1644,7 @@ int seccomp_restrict_realtime(void) {
                         /* Deny this policy */
                         r = seccomp_rule_add_exact(
                                         seccomp,
-                                        SCMP_ACT_ERRNO(EPERM),
+                                        SCMP_ACT_ERRNO(error_code),
                                         SCMP_SYS(sched_setscheduler),
                                         1,
                                         SCMP_A1(SCMP_CMP_EQ, p));
@@ -1611,7 +1658,7 @@ int seccomp_restrict_realtime(void) {
                  * are unsigned here, hence no need no check for < 0 values. */
                 r = seccomp_rule_add_exact(
                                 seccomp,
-                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_ACT_ERRNO(error_code),
                                 SCMP_SYS(sched_setscheduler),
                                 1,
                                 SCMP_A1(SCMP_CMP_GT, max_policy));
@@ -1671,7 +1718,11 @@ int seccomp_memory_deny_write_execute(void) {
 
                 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
                  * We ignore that here, which means there's still a way to get writable/executable
-                 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
+                 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
+                 *
+                 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
+                 * on that front (kernel work done in 5.18).
+                 */
 
                 case SCMP_ARCH_X86:
                 case SCMP_ARCH_S390:
@@ -1705,7 +1756,7 @@ int seccomp_memory_deny_write_execute(void) {
 
                 /* Please add more definitions here, if you port systemd to other architectures! */
 
-#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
+#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
 #warning "Consider adding the right mmap() syscall definitions here!"
 #endif
                 }
@@ -1736,13 +1787,11 @@ int seccomp_memory_deny_write_execute(void) {
                 if (r < 0)
                         continue;
 
-#ifdef __NR_pkey_mprotect
                 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
                                                1,
                                                SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
                 if (r < 0)
                         continue;
-#endif
 
                 if (shmat_syscall > 0) {
                         r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
@@ -1836,7 +1885,6 @@ int seccomp_restrict_archs(Set *archs) {
 
 int parse_syscall_archs(char **l, Set **ret_archs) {
         _cleanup_set_free_ Set *archs = NULL;
-        char **s;
         int r;
 
         assert(l);
@@ -1859,7 +1907,6 @@ int parse_syscall_archs(char **l, Set **ret_archs) {
 }
 
 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
-        const char *i;
         int r;
 
         assert(set);
@@ -2063,7 +2110,6 @@ static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
         else
                 any = true;
 
-#if SCMP_SYS(open) > 0
         r = seccomp_rule_add_exact(
                         seccomp,
                         SCMP_ACT_ERRNO(EPERM),
@@ -2075,7 +2121,6 @@ static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
                 log_debug_errno(r, "Failed to add filter for open: %m");
         else
                 any = true;
-#endif
 
         r = seccomp_rule_add_exact(
                         seccomp,
@@ -2213,7 +2258,6 @@ static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
         /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
          * EINVAL, in the hope the client code will retry without O_SYNC then.  */
 
-#if SCMP_SYS(open) > 0
         r = seccomp_rule_add_exact(
                         seccomp,
                         SCMP_ACT_ERRNO(EINVAL),
@@ -2224,7 +2268,6 @@ static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
                 log_debug_errno(r, "Failed to add filter for open: %m");
         else
                 any = true;
-#endif
 
         r = seccomp_rule_add_exact(
                         seccomp,
@@ -2262,7 +2305,6 @@ int seccomp_suppress_sync(void) {
 
         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
-                const char *c;
 
                 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
                 if (r < 0)