const uint32_t seccomp_local_archs[] = {
- /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
+ /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
#if defined(__x86_64__) && defined(__ILP32__)
SCMP_ARCH_X86,
.value =
"lookup_dcookie\0"
"perf_event_open\0"
+ "pidfd_getfd\0"
"ptrace\0"
"rtas\0"
#ifdef __NR_s390_runtime_instr
"oldstat\0"
"open\0"
"openat\0"
+ "openat2\0"
"readlink\0"
"readlinkat\0"
"removexattr\0"
"ustat\0"
"vserver\0"
},
+ [SYSCALL_FILTER_SET_PKEY] = {
+ .name = "@pkey",
+ .help = "System calls used for memory protection keys",
+ .value =
+ "pkey_alloc\0"
+ "pkey_free\0"
+ "pkey_mprotect\0"
+ },
[SYSCALL_FILTER_SET_PRIVILEGED] = {
.name = "@privileged",
.help = "All system calls which need super-user capabilities",
/* If we previously wanted to forbid a syscall and now
* we want to allow it, then remove it from the list. */
- if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
+ if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
if (r < 0)
switch (r) {
return 0;
}
-int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
+/* Builds and loads, once per entry yielded by SECCOMP_FOREACH_LOCAL_ARCH(), a seccomp filter that is
+ * initialized with SCMP_ACT_ALLOW and carries a single rule answering syslog() with
+ * SCMP_ACT_ERRNO(EPERM).
+ *
+ * Returns 0 once all architectures have been processed. Returns a negative error early if
+ * seccomp_init_for_arch() fails, or if seccomp_load() fails with an error that
+ * ERRNO_IS_SECCOMP_FATAL() classifies as fatal. Any other per-architecture failure is only logged
+ * at debug level and that architecture is skipped. */
+int seccomp_protect_syslog(void) {
+        uint32_t arch;
+        int r;
+
+        SECCOMP_FOREACH_LOCAL_ARCH(arch) {
+                _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
+
+                r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
+                if (r < 0)
+                        return r;
+
+                /* No argument comparisons (count 0): the rule matches every syslog() invocation. */
+                r = seccomp_rule_add_exact(
+                                seccomp,
+                                SCMP_ACT_ERRNO(EPERM),
+                                SCMP_SYS(syslog),
+                                0);
+                if (r < 0) {
+                        /* Best effort: failing to add the rule for one architecture is not fatal. */
+                        log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+                        continue;
+                }
+
+                r = seccomp_load(seccomp);
+                if (ERRNO_IS_SECCOMP_FATAL(r))
+                        return r;
+                if (r < 0)
+                        log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+        }
+
+        return 0;
+}
+
+int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
uint32_t arch;
int r;
if (r < 0)
return r;
- if (whitelist) {
+ if (allow_list) {
int af, first = 0, last = 0;
void *afp;
- /* If this is a whitelist, we first block the address families that are out of range and then
- * everything that is not in the set. First, we find the lowest and highest address family in
- * the set. */
+ /* If this is an allow list, we first block the address families that are out of
+ * range and then everything that is not in the set. First, we find the lowest and
+ * highest address family in the set. */
SET_FOREACH(afp, address_families, i) {
af = PTR_TO_INT(afp);
} else {
void *af;
- /* If this is a blacklist, then generate one rule for
- * each address family that are then combined in OR
- * checks. */
+ /* If this is a deny list, then generate one rule for each address family; these rules
+ * are then combined in OR checks. */
SET_FOREACH(af, address_families, i) {
return r;
/* Go through all policies with lower values than that, and block them -- unless they appear in the
- * whitelist. */
+ * allow list. */
for (p = 0; p < max_policy; p++) {
bool good = false;
- /* Check if this is in the whitelist. */
+ /* Check if this is in the allow list. */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] == p) {
good = true;
}
}
- /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
- * unsigned here, hence no need no check for < 0 values. */
+ /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
+ * are unsigned here, hence no need to check for < 0 values. */
r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),
int seccomp_memory_deny_write_execute(void) {
uint32_t arch;
- int r;
+ unsigned loaded = 0;
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
- int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
+ int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
switch (arch) {
+ /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
+ * We ignore that here, which means there's still a way to get writable/executable
+ * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
+
case SCMP_ARCH_X86:
case SCMP_ARCH_S390:
filter_syscall = SCMP_SYS(mmap2);
block_syscall = SCMP_SYS(mmap);
- shmat_syscall = SCMP_SYS(shmat);
+ /* shmat multiplexed, see above */
break;
case SCMP_ARCH_PPC:
case SCMP_ARCH_PPC64:
case SCMP_ARCH_PPC64LE:
+ case SCMP_ARCH_S390X:
filter_syscall = SCMP_SYS(mmap);
-
- /* Note that shmat() isn't available, and the call is multiplexed through ipc().
- * We ignore that here, which means there's still a way to get writable/executable
- * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
-
+ /* shmat multiplexed, see above */
break;
case SCMP_ARCH_ARM:
case SCMP_ARCH_X86_64:
case SCMP_ARCH_X32:
case SCMP_ARCH_AARCH64:
- case SCMP_ARCH_S390X:
- filter_syscall = SCMP_SYS(mmap); /* amd64, x32, s390x, and arm64 have only mmap */
+ filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
shmat_syscall = SCMP_SYS(shmat);
break;
#endif
if (shmat_syscall > 0) {
- r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
+ r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1,
SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
if (r < 0)
if (ERRNO_IS_SECCOMP_FATAL(r))
return r;
if (r < 0)
- log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
+ log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
+ seccomp_arch_to_string(arch));
+ loaded++;
}
- return 0;
+ if (loaded == 0)
+ log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
+
+ return loaded;
}
int seccomp_restrict_archs(Set *archs) {
return 0;
}
-int parse_syscall_archs(char **l, Set **archs) {
- _cleanup_set_free_ Set *_archs;
+int parse_syscall_archs(char **l, Set **ret_archs) {
+ _cleanup_set_free_ Set *archs = NULL;
char **s;
int r;
assert(l);
- assert(archs);
-
- r = set_ensure_allocated(&_archs, NULL);
- if (r < 0)
- return r;
+ assert(ret_archs);
STRV_FOREACH(s, l) {
uint32_t a;
if (r < 0)
return -EINVAL;
- r = set_put(_archs, UINT32_TO_PTR(a + 1));
+ r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
if (r < 0)
return -ENOMEM;
}
- *archs = TAKE_PTR(_archs);
-
+ *ret_archs = TAKE_PTR(archs);
return 0;
}
else
any = true;
+#if defined(__SNR_openat2)
+ /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
+ * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
+ * for now, since openat2() is very new and code generally needs fallback logic anyway to be
+ * compatible with kernels that are not absolutely recent. */
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(EPERM),
+ SCMP_SYS(openat2),
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add filter for openat2: %m");
+ else
+ any = true;
+#endif
+
r = seccomp_rule_add_exact(
seccomp,
SCMP_ACT_ERRNO(EPERM),