const uint32_t seccomp_local_archs[] = {
- /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
+ /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
#if defined(__x86_64__) && defined(__ILP32__)
SCMP_ARCH_X86,
.help = "System calls that are always permitted",
.value =
"clock_getres\0"
+ "clock_getres_time64\0"
"clock_gettime\0"
+ "clock_gettime64\0"
"clock_nanosleep\0"
+ "clock_nanosleep_time64\0"
"execve\0"
"exit\0"
"exit_group\0"
"futex\0"
+ "futex_time64\0"
"get_robust_list\0"
"get_thread_area\0"
"getegid\0"
"io_destroy\0"
"io_getevents\0"
"io_pgetevents\0"
+ "io_pgetevents_time64\0"
"io_setup\0"
"io_submit\0"
+ "io_uring_enter\0"
+ "io_uring_register\0"
+ "io_uring_setup\0"
},
[SYSCALL_FILTER_SET_BASIC_IO] = {
.name = "@basic-io",
.value =
"adjtimex\0"
"clock_adjtime\0"
+ "clock_adjtime64\0"
"clock_settime\0"
+ "clock_settime64\0"
"settimeofday\0"
"stime\0"
},
.value =
"lookup_dcookie\0"
"perf_event_open\0"
+ "pidfd_getfd\0"
"ptrace\0"
"rtas\0"
#ifdef __NR_s390_runtime_instr
"oldstat\0"
"open\0"
"openat\0"
+ "openat2\0"
"readlink\0"
"readlinkat\0"
"removexattr\0"
"unlinkat\0"
"utime\0"
"utimensat\0"
+ "utimensat_time64\0"
"utimes\0"
},
[SYSCALL_FILTER_SET_IO_EVENT] = {
"eventfd2\0"
"poll\0"
"ppoll\0"
+ "ppoll_time64\0"
"pselect6\0"
+ "pselect6_time64\0"
"select\0"
},
[SYSCALL_FILTER_SET_IPC] = {
"mq_notify\0"
"mq_open\0"
"mq_timedreceive\0"
+ "mq_timedreceive_time64\0"
"mq_timedsend\0"
+ "mq_timedsend_time64\0"
"mq_unlink\0"
"msgctl\0"
"msgget\0"
"semget\0"
"semop\0"
"semtimedop\0"
+ "semtimedop_time64\0"
"shmat\0"
"shmctl\0"
"shmdt\0"
.help = "Mounting and unmounting of file systems",
.value =
"chroot\0"
+ "fsconfig\0"
+ "fsmount\0"
+ "fsopen\0"
+ "fspick\0"
"mount\0"
+ "move_mount\0"
+ "open_tree\0"
"pivot_root\0"
"umount\0"
"umount2\0"
"recv\0"
"recvfrom\0"
"recvmmsg\0"
+ "recvmmsg_time64\0"
"recvmsg\0"
"send\0"
"sendmmsg\0"
"ustat\0"
"vserver\0"
},
+ [SYSCALL_FILTER_SET_PKEY] = {
+ .name = "@pkey",
+ .help = "System calls used for memory protection keys",
+ .value =
+ "pkey_alloc\0"
+ "pkey_free\0"
+ "pkey_mprotect\0"
+ },
[SYSCALL_FILTER_SET_PRIVILEGED] = {
.name = "@privileged",
.help = "All system calls which need super-user capabilities",
"capset\0"
"chroot\0"
"fanotify_init\0"
+ "fanotify_mark\0"
"nfsservctl\0"
"open_by_handle_at\0"
"pivot_root\0"
"arch_prctl\0"
"capget\0" /* Able to query arbitrary processes */
"clone\0"
+ "clone3\0"
"execveat\0"
"fork\0"
"getrusage\0"
"kill\0"
+ "pidfd_open\0"
"pidfd_send_signal\0"
"prctl\0"
"rt_sigqueueinfo\0"
"rt_sigprocmask\0"
"rt_sigsuspend\0"
"rt_sigtimedwait\0"
+ "rt_sigtimedwait_time64\0"
"sigaction\0"
"sigaltstack\0"
"signal\0"
"sched_getparam\0"
"sched_getscheduler\0"
"sched_rr_get_interval\0"
+ "sched_rr_get_interval_time64\0"
"sched_yield\0"
"sendfile\0"
"sendfile64\0"
"timer_delete\0"
"timer_getoverrun\0"
"timer_gettime\0"
+ "timer_gettime64\0"
"timer_settime\0"
+ "timer_settime64\0"
"timerfd_create\0"
"timerfd_gettime\0"
+ "timerfd_gettime64\0"
"timerfd_settime\0"
+ "timerfd_settime64\0"
"times\0"
},
};
/* If we previously wanted to forbid a syscall and now
* we want to allow it, then remove it from the list. */
- if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
+ if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
if (r < 0)
switch (r) {
return 0;
}
-int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
+/* Installs, for each local architecture, a seccomp filter that allows everything by default but makes
+ * syslog() fail with EPERM.
+ *
+ * Returns 0 once all architectures have been processed. Architectures for which the rule cannot be
+ * added, or for which loading fails with a non-fatal error, are logged at debug level and skipped.
+ * Returns a negative value if the filter context cannot be initialized, or if seccomp_load() fails
+ * with an error that ERRNO_IS_SECCOMP_FATAL() classifies as fatal. */
+int seccomp_protect_syslog(void) {
+        uint32_t arch;
+        int r;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                /* Default action is "allow": only the rule added below restricts anything. */
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                /* Zero argument comparisons are passed, i.e. the rule is not conditioned on arguments. */
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_SYS(syslog),
+                                0);
+
+                if (r < 0) {
+                        /* Best effort: skip this architecture rather than failing the whole operation. */
+                        log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
uint32_t arch;
int r;
if (r < 0)
return r;
- if (whitelist) {
+ if (allow_list) {
int af, first = 0, last = 0;
void *afp;
- /* If this is a whitelist, we first block the address families that are out of range and then
- * everything that is not in the set. First, we find the lowest and highest address family in
- * the set. */
+ /* If this is an allow list, we first block the address families that are out of
+ * range and then everything that is not in the set. First, we find the lowest and
+ * highest address family in the set. */
SET_FOREACH(afp, address_families, i) {
af = PTR_TO_INT(afp);
} else {
void *af;
- /* If this is a blacklist, then generate one rule for
- * each address family that are then combined in OR
- * checks. */
+ /* If this is a deny list, then generate one rule for each address family; the rules are
+ * then combined in OR checks. */
SET_FOREACH(af, address_families, i) {
return r;
/* Go through all policies with lower values than that, and block them -- unless they appear in the
- * whitelist. */
+ * allow list. */
for (p = 0; p < max_policy; p++) {
bool good = false;
- /* Check if this is in the whitelist. */
+ /* Check if this is in the allow list. */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] == p) {
good = true;
}
}
- /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
- * unsigned here, hence no need no check for < 0 values. */
+ /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
+ * are unsigned here, hence no need to check for < 0 values. */
r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
int seccomp_memory_deny_write_execute(void) {
uint32_t arch;
- int r;
+ unsigned loaded = 0;
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
- int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
+ int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
switch (arch) {
+ /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
+ * We ignore that here, which means there's still a way to get writable/executable
+ * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
+
case SCMP_ARCH_X86:
case SCMP_ARCH_S390:
filter_syscall = SCMP_SYS(mmap2);
block_syscall = SCMP_SYS(mmap);
- shmat_syscall = SCMP_SYS(shmat);
+ /* shmat multiplexed, see above */
break;
case SCMP_ARCH_PPC:
case SCMP_ARCH_PPC64:
case SCMP_ARCH_PPC64LE:
+ case SCMP_ARCH_S390X:
filter_syscall = SCMP_SYS(mmap);
-
- /* Note that shmat() isn't available, and the call is multiplexed through ipc().
- * We ignore that here, which means there's still a way to get writable/executable
- * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
-
+ /* shmat multiplexed, see above */
break;
case SCMP_ARCH_ARM:
case SCMP_ARCH_X86_64:
case SCMP_ARCH_X32:
case SCMP_ARCH_AARCH64:
- case SCMP_ARCH_S390X:
- filter_syscall = SCMP_SYS(mmap); /* amd64, x32, s390x, and arm64 have only mmap */
+ filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
shmat_syscall = SCMP_SYS(shmat);
break;
#endif
if (shmat_syscall > 0) {
- r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
+ r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1,
SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
if (r < 0)
if (ERRNO_IS_SECCOMP_FATAL(r))
return r;
if (r < 0)
- log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ loaded++;
}
- return 0;
+ if (loaded == 0)
+ log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
+
+ return loaded;
}
int seccomp_restrict_archs(Set *archs) {
return 0;
}
-int parse_syscall_archs(char **l, Set **archs) {
- _cleanup_set_free_ Set *_archs;
+int parse_syscall_archs(char **l, Set **ret_archs) {
+ _cleanup_set_free_ Set *archs = NULL;
char **s;
int r;
assert(l);
- assert(archs);
-
- r = set_ensure_allocated(&_archs, NULL);
- if (r < 0)
- return r;
+ assert(ret_archs);
STRV_FOREACH(s, l) {
uint32_t a;
if (r < 0)
return -EINVAL;
- r = set_put(_archs, UINT32_TO_PTR(a + 1));
+ r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
if (r < 0)
return -ENOMEM;
}
- *archs = TAKE_PTR(_archs);
-
+ *ret_archs = TAKE_PTR(archs);
return 0;
}
else
any = true;
+#if defined(__SNR_openat2)
+ /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
+ * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
+ * for now, since openat2() is very new and code generally needs fallback logic anyway to be
+ * compatible with kernels that are not absolutely recent. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(openat2),
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for openat2: %m");
+ else
+ any = true;
+#endif
+
r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),