]> git.ipfire.org Git - thirdparty/systemd.git/blobdiff - src/shared/seccomp-util.c
tree-wide: avoid some loaded terms
[thirdparty/systemd.git] / src / shared / seccomp-util.c
index acfe4352839799140718fd74d5df863e53779856..a8dd069a758dbb82edd9266bddb63e1cbadcee49 100644 (file)
@@ -24,7 +24,7 @@
 
 const uint32_t seccomp_local_archs[] = {
 
-        /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
+        /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
 
 #if defined(__x86_64__) && defined(__ILP32__)
                 SCMP_ARCH_X86,
@@ -258,12 +258,16 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .help = "System calls that are always permitted",
                 .value =
                 "clock_getres\0"
+                "clock_getres_time64\0"
                 "clock_gettime\0"
+                "clock_gettime64\0"
                 "clock_nanosleep\0"
+                "clock_nanosleep_time64\0"
                 "execve\0"
                 "exit\0"
                 "exit_group\0"
                 "futex\0"
+                "futex_time64\0"
                 "get_robust_list\0"
                 "get_thread_area\0"
                 "getegid\0"
@@ -312,8 +316,12 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "io_destroy\0"
                 "io_getevents\0"
                 "io_pgetevents\0"
+                "io_pgetevents_time64\0"
                 "io_setup\0"
                 "io_submit\0"
+                "io_uring_enter\0"
+                "io_uring_register\0"
+                "io_uring_setup\0"
         },
         [SYSCALL_FILTER_SET_BASIC_IO] = {
                 .name = "@basic-io",
@@ -354,7 +362,9 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .value =
                 "adjtimex\0"
                 "clock_adjtime\0"
+                "clock_adjtime64\0"
                 "clock_settime\0"
+                "clock_settime64\0"
                 "settimeofday\0"
                 "stime\0"
         },
@@ -374,6 +384,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .value =
                 "lookup_dcookie\0"
                 "perf_event_open\0"
+                "pidfd_getfd\0"
                 "ptrace\0"
                 "rtas\0"
 #ifdef __NR_s390_runtime_instr
@@ -439,6 +450,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "oldstat\0"
                 "open\0"
                 "openat\0"
+                "openat2\0"
                 "readlink\0"
                 "readlinkat\0"
                 "removexattr\0"
@@ -462,6 +474,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "unlinkat\0"
                 "utime\0"
                 "utimensat\0"
+                "utimensat_time64\0"
                 "utimes\0"
         },
         [SYSCALL_FILTER_SET_IO_EVENT] = {
@@ -480,7 +493,9 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "eventfd2\0"
                 "poll\0"
                 "ppoll\0"
+                "ppoll_time64\0"
                 "pselect6\0"
+                "pselect6_time64\0"
                 "select\0"
         },
         [SYSCALL_FILTER_SET_IPC] = {
@@ -493,7 +508,9 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "mq_notify\0"
                 "mq_open\0"
                 "mq_timedreceive\0"
+                "mq_timedreceive_time64\0"
                 "mq_timedsend\0"
+                "mq_timedsend_time64\0"
                 "mq_unlink\0"
                 "msgctl\0"
                 "msgget\0"
@@ -507,6 +524,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "semget\0"
                 "semop\0"
                 "semtimedop\0"
+                "semtimedop_time64\0"
                 "shmat\0"
                 "shmctl\0"
                 "shmdt\0"
@@ -543,7 +561,13 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 .help = "Mounting and unmounting of file systems",
                 .value =
                 "chroot\0"
+                "fsconfig\0"
+                "fsmount\0"
+                "fsopen\0"
+                "fspick\0"
                 "mount\0"
+                "move_mount\0"
+                "open_tree\0"
                 "pivot_root\0"
                 "umount\0"
                 "umount2\0"
@@ -563,6 +587,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "recv\0"
                 "recvfrom\0"
                 "recvmmsg\0"
+                "recvmmsg_time64\0"
                 "recvmsg\0"
                 "send\0"
                 "sendmmsg\0"
@@ -606,6 +631,14 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "ustat\0"
                 "vserver\0"
         },
+        [SYSCALL_FILTER_SET_PKEY] = {
+                .name = "@pkey",
+                .help = "System calls used for memory protection keys",
+                .value =
+                "pkey_alloc\0"
+                "pkey_free\0"
+                "pkey_mprotect\0"
+        },
         [SYSCALL_FILTER_SET_PRIVILEGED] = {
                 .name = "@privileged",
                 .help = "All system calls which need super-user capabilities",
@@ -622,6 +655,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "capset\0"
                 "chroot\0"
                 "fanotify_init\0"
+                "fanotify_mark\0"
                 "nfsservctl\0"
                 "open_by_handle_at\0"
                 "pivot_root\0"
@@ -647,10 +681,12 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "arch_prctl\0"
                 "capget\0"      /* Able to query arbitrary processes */
                 "clone\0"
+                "clone3\0"
                 "execveat\0"
                 "fork\0"
                 "getrusage\0"
                 "kill\0"
+                "pidfd_open\0"
                 "pidfd_send_signal\0"
                 "prctl\0"
                 "rt_sigqueueinfo\0"
@@ -735,6 +771,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "rt_sigprocmask\0"
                 "rt_sigsuspend\0"
                 "rt_sigtimedwait\0"
+                "rt_sigtimedwait_time64\0"
                 "sigaction\0"
                 "sigaltstack\0"
                 "signal\0"
@@ -814,6 +851,7 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "sched_getparam\0"
                 "sched_getscheduler\0"
                 "sched_rr_get_interval\0"
+                "sched_rr_get_interval_time64\0"
                 "sched_yield\0"
                 "sendfile\0"
                 "sendfile64\0"
@@ -842,10 +880,14 @@ const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
                 "timer_delete\0"
                 "timer_getoverrun\0"
                 "timer_gettime\0"
+                "timer_gettime64\0"
                 "timer_settime\0"
+                "timer_settime64\0"
                 "timerfd_create\0"
                 "timerfd_gettime\0"
+                "timerfd_gettime64\0"
                 "timerfd_settime\0"
+                "timerfd_settime64\0"
                 "times\0"
         },
 };
@@ -1070,7 +1112,7 @@ int seccomp_parse_syscall_filter(
 
                 /* If we previously wanted to forbid a syscall and now
                  * we want to allow it, then remove it from the list. */
-                if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
+                if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
                         r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
                         if (r < 0)
                                 switch (r) {
@@ -1241,7 +1283,39 @@ int seccomp_protect_sysctl(void) {
         return 0;
 }
 
-int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
+/* Makes syslog() fail with EPERM by loading, for each local architecture, a default-allow seccomp filter
+ * carrying one unconditional (zero argument comparators) syslog() rule. Returns 0 even if architectures
+ * had to be skipped; returns a negative error if filter setup fails or seccomp_load() fails fatally. */
+int seccomp_protect_syslog(void) {
+        uint32_t arch;
+        int r;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(syslog), 0);
+                if (r < 0) {
+                        log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
         uint32_t arch;
         int r;
 
@@ -1288,13 +1362,13 @@ int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
                 if (r < 0)
                         return r;
 
-                if (whitelist) {
+                if (allow_list) {
                         int af, first = 0, last = 0;
                         void *afp;
 
-                        /* If this is a whitelist, we first block the address families that are out of range and then
-                         * everything that is not in the set. First, we find the lowest and highest address family in
-                         * the set. */
+                        /* If this is an allow list, we first block the address families that are out of
+                         * range and then everything that is not in the set. First, we find the lowest and
+                         * highest address family in the set. */
 
                         SET_FOREACH(afp, address_families, i) {
                                 af = PTR_TO_INT(afp);
@@ -1374,9 +1448,8 @@ int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
                 } else {
                         void *af;
 
-                        /* If this is a blacklist, then generate one rule for
-                         * each address family that are then combined in OR
-                         * checks. */
+                        /* If this is a deny list, then generate one rule for each address family; the
+                         * rules are then combined in OR checks. */
 
                         SET_FOREACH(af, address_families, i) {
 
@@ -1432,11 +1505,11 @@ int seccomp_restrict_realtime(void) {
                         return r;
 
                 /* Go through all policies with lower values than that, and block them -- unless they appear in the
-                 * whitelist. */
+                 * allow list. */
                 for (p = 0; p < max_policy; p++) {
                         bool good = false;
 
-                        /* Check if this is in the whitelist. */
+                        /* Check if this is in the allow list. */
                         for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
                                 if (permitted_policies[i] == p) {
                                         good = true;
@@ -1459,8 +1532,8 @@ int seccomp_restrict_realtime(void) {
                         }
                 }
 
-                /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
-                 * unsigned here, hence no need no check for < 0 values. */
+                /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
+                 * are unsigned here, hence no need to check for < 0 values. */
                 r = seccomp_rule_add_exact(
                                 seccomp,
                                 SCMP_ACT_ERRNO(EPERM),
@@ -1511,32 +1584,33 @@ assert_cc(SCMP_SYS(shmdt) > 0);
 
 int seccomp_memory_deny_write_execute(void) {
         uint32_t arch;
-        int r;
+        unsigned loaded = 0;
 
         SECCOMP_FOREACH_LOCAL_ARCH(arch) {
                 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
-                int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
+                int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
 
                 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
 
                 switch (arch) {
 
+                /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
+                 * We ignore that here, which means there's still a way to get writable/executable
+                 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
+
                 case SCMP_ARCH_X86:
                 case SCMP_ARCH_S390:
                         filter_syscall = SCMP_SYS(mmap2);
                         block_syscall = SCMP_SYS(mmap);
-                        shmat_syscall = SCMP_SYS(shmat);
+                        /* shmat multiplexed, see above */
                         break;
 
                 case SCMP_ARCH_PPC:
                 case SCMP_ARCH_PPC64:
                 case SCMP_ARCH_PPC64LE:
+                case SCMP_ARCH_S390X:
                         filter_syscall = SCMP_SYS(mmap);
-
-                        /* Note that shmat() isn't available, and the call is multiplexed through ipc().
-                         * We ignore that here, which means there's still a way to get writable/executable
-                         * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
-
+                        /* shmat multiplexed, see above */
                         break;
 
                 case SCMP_ARCH_ARM:
@@ -1547,8 +1621,7 @@ int seccomp_memory_deny_write_execute(void) {
                 case SCMP_ARCH_X86_64:
                 case SCMP_ARCH_X32:
                 case SCMP_ARCH_AARCH64:
-                case SCMP_ARCH_S390X:
-                        filter_syscall = SCMP_SYS(mmap); /* amd64, x32, s390x, and arm64 have only mmap */
+                        filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
                         shmat_syscall = SCMP_SYS(shmat);
                         break;
 
@@ -1594,7 +1667,7 @@ int seccomp_memory_deny_write_execute(void) {
 #endif
 
                 if (shmat_syscall > 0) {
-                        r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
+                        r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
                                                        1,
                                                        SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
                         if (r < 0)
@@ -1605,10 +1678,15 @@ int seccomp_memory_deny_write_execute(void) {
                 if (ERRNO_IS_SECCOMP_FATAL(r))
                         return r;
                 if (r < 0)
-                        log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+                        log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
+                                        seccomp_arch_to_string(arch));
+                loaded++;
         }
 
-        return 0;
+        if (loaded == 0)
+                log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
+
+        return loaded;
 }
 
 int seccomp_restrict_archs(Set *archs) {
@@ -1663,17 +1741,13 @@ int seccomp_restrict_archs(Set *archs) {
         return 0;
 }
 
-int parse_syscall_archs(char **l, Set **archs) {
-        _cleanup_set_free_ Set *_archs;
+int parse_syscall_archs(char **l, Set **ret_archs) {
+        _cleanup_set_free_ Set *archs = NULL;
         char **s;
         int r;
 
         assert(l);
-        assert(archs);
-
-        r = set_ensure_allocated(&_archs, NULL);
-        if (r < 0)
-                return r;
+        assert(ret_archs);
 
         STRV_FOREACH(s, l) {
                 uint32_t a;
@@ -1682,13 +1756,12 @@ int parse_syscall_archs(char **l, Set **archs) {
                 if (r < 0)
                         return -EINVAL;
 
-                r = set_put(_archs, UINT32_TO_PTR(a + 1));
+                r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
                 if (r < 0)
                         return -ENOMEM;
         }
 
-        *archs = TAKE_PTR(_archs);
-
+        *ret_archs = TAKE_PTR(archs);
         return 0;
 }
 
@@ -1923,6 +1996,22 @@ static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
         else
                 any = true;
 
+#if defined(__SNR_openat2)
+        /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
+         * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
+         * for now, since openat2() is very new and code generally needs fallback logic anyway to be
+         * compatible with kernels that are not absolutely recent. */
+        r = seccomp_rule_add_exact(
+                        seccomp,
+                        SCMP_ACT_ERRNO(EPERM),
+                        SCMP_SYS(openat2),
+                        0);
+        if (r < 0)
+                log_debug_errno(r, "Failed to add filter for openat2: %m");
+        else
+                any = true;
+#endif
+
         r = seccomp_rule_add_exact(
                         seccomp,
                         SCMP_ACT_ERRNO(EPERM),