#include "env-util.h"
#include "errno-list.h"
#include "macro.h"
+#include "namespace-util.h"
#include "nsflags.h"
#include "nulstr-util.h"
#include "process-util.h"
SCMP_ARCH_MIPSEL64,
SCMP_ARCH_MIPS64N32,
SCMP_ARCH_MIPSEL64N32, /* native */
+#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
+ SCMP_ARCH_PARISC,
+ SCMP_ARCH_PARISC64, /* native */
+#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
+ SCMP_ARCH_PARISC,
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
SCMP_ARCH_PPC,
SCMP_ARCH_PPC64LE,
* Names used here should be the same as those used for ConditionArchitecture=,
* except for "subarchitectures" like x32. */
- switch(c) {
+ switch (c) {
case SCMP_ARCH_NATIVE:
return "native";
case SCMP_ARCH_X86:
return "mips64-le";
case SCMP_ARCH_MIPSEL64N32:
return "mips64-le-n32";
+#ifdef SCMP_ARCH_PARISC
+ case SCMP_ARCH_PARISC:
+ return "parisc";
+#endif
+#ifdef SCMP_ARCH_PARISC64
+ case SCMP_ARCH_PARISC64:
+ return "parisc64";
+#endif
case SCMP_ARCH_PPC:
return "ppc";
case SCMP_ARCH_PPC64:
*ret = SCMP_ARCH_MIPSEL64;
else if (streq(n, "mips64-le-n32"))
*ret = SCMP_ARCH_MIPSEL64N32;
+#ifdef SCMP_ARCH_PARISC
+ else if (streq(n, "parisc"))
+ *ret = SCMP_ARCH_PARISC;
+#endif
+#ifdef SCMP_ARCH_PARISC64
+ else if (streq(n, "parisc64"))
+ *ret = SCMP_ARCH_PARISC64;
+#endif
else if (streq(n, "ppc"))
*ret = SCMP_ARCH_PPC;
else if (streq(n, "ppc64"))
.name = "@default",
.help = "System calls that are always permitted",
.value =
+ "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
"brk\0"
"cacheflush\0"
"clock_getres\0"
"pause\0"
"prlimit64\0"
"restart_syscall\0"
+ "riscv_flush_icache\0"
"rseq\0"
"rt_sigreturn\0"
"sched_getaffinity\0"
"pidfd_getfd\0"
"ptrace\0"
"rtas\0"
-#if defined __s390__ || defined __s390x__
"s390_runtime_instr\0"
-#endif
"sys_debug_setcontext\0"
},
[SYSCALL_FILTER_SET_FILE_SYSTEM] = {
.name = "@process",
.help = "Process control, execution, namespacing operations",
.value =
- "arch_prctl\0"
"capget\0" /* Able to query arbitrary processes */
"clone\0"
+ /* ia64 is the only architecture that has clone2, a replacement for clone, but ia64 doesn't
+ * implement seccomp, so we don't need to list it at all. C.f.
+ * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
"clone3\0"
"execveat\0"
"fork\0"
"pciconfig_iobase\0"
"pciconfig_read\0"
"pciconfig_write\0"
-#if defined __s390__ || defined __s390x__
"s390_pci_mmio_read\0"
"s390_pci_mmio_write\0"
-#endif
},
[SYSCALL_FILTER_SET_REBOOT] = {
.name = "@reboot",
.name = "@known",
.help = "All known syscalls declared in the kernel",
.value =
+ "@obsolete\0"
#include "syscall-list.h"
},
};
bool log_missing,
char ***added) {
- const char *sys;
int r;
/* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
if (ERRNO_IS_SECCOMP_FATAL(r))
return r;
if (r < 0)
- log_debug_errno(r, "Failed to install systemc call filter for architecture %s, skipping: %m",
+ log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
seccomp_arch_to_string(arch));
}
if (name[0] == '@') {
const SyscallFilterSet *set;
- const char *i;
set = syscall_filter_set_find(name);
if (!set) {
if (r < 0)
return r;
+ /* We cannot filter on individual flags to clone3(), and we need to disable the
+ * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
+ * users shall fall back to clone(), as if on an older kernel.
+ *
+ * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
+ * https://github.com/moby/moby/issues/42680. */
+
+ r = seccomp_rule_add_exact(
+ seccomp,
+ SCMP_ACT_ERRNO(ENOSYS),
+ SCMP_SYS(clone3),
+ 0);
+ if (r < 0)
+ log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
+
if ((retain & NAMESPACE_FLAGS_ALL) == 0)
/* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
* altogether. */
continue;
}
- for (unsigned i = 0; namespace_flag_map[i].name; i++) {
+ for (unsigned i = 0; namespace_info[i].proc_name; i++) {
unsigned long f;
- f = namespace_flag_map[i].flag;
+ f = namespace_info[i].clone_flag;
if (FLAGS_SET(retain, f)) {
- log_debug("Permitting %s.", namespace_flag_map[i].name);
+ log_debug("Permitting %s.", namespace_info[i].proc_name);
continue;
}
- log_debug("Blocking %s.", namespace_flag_map[i].name);
+ log_debug("Blocking %s.", namespace_info[i].proc_name);
r = seccomp_rule_add_exact(
seccomp,
case SCMP_ARCH_X86:
case SCMP_ARCH_MIPSEL:
case SCMP_ARCH_MIPS:
+#ifdef SCMP_ARCH_PARISC
+ case SCMP_ARCH_PARISC:
+#endif
+#ifdef SCMP_ARCH_PARISC64
+ case SCMP_ARCH_PARISC64:
+#endif
case SCMP_ARCH_PPC:
case SCMP_ARCH_PPC64:
case SCMP_ARCH_PPC64LE:
return 0;
}
-int seccomp_restrict_realtime(void) {
+int seccomp_restrict_realtime_full(int error_code) {
static const int permitted_policies[] = {
SCHED_OTHER,
SCHED_BATCH,
uint32_t arch;
unsigned i;
+ assert(error_code > 0);
+
/* Determine the highest policy constant we want to allow */
for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
if (permitted_policies[i] > max_policy)
/* Deny this policy */
r = seccomp_rule_add_exact(
seccomp,
- SCMP_ACT_ERRNO(EPERM),
+ SCMP_ACT_ERRNO(error_code),
SCMP_SYS(sched_setscheduler),
1,
SCMP_A1(SCMP_CMP_EQ, p));
* are unsigned here, hence no need to check for < 0 values. */
r = seccomp_rule_add_exact(
seccomp,
- SCMP_ACT_ERRNO(EPERM),
+ SCMP_ACT_ERRNO(error_code),
SCMP_SYS(sched_setscheduler),
1,
SCMP_A1(SCMP_CMP_GT, max_policy));
/* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
* We ignore that here, which means there's still a way to get writable/executable
- * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
+ * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
+ *
+ * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
+ * on that front (kernel work done in 5.18).
+ */
case SCMP_ARCH_X86:
case SCMP_ARCH_S390:
/* Please add more definitions here, if you port systemd to other architectures! */
-#if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
+#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
#warning "Consider adding the right mmap() syscall definitions here!"
#endif
}
int parse_syscall_archs(char **l, Set **ret_archs) {
_cleanup_set_free_ Set *archs = NULL;
- char **s;
int r;
assert(l);
}
int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
- const char *i;
int r;
assert(set);
SECCOMP_FOREACH_LOCAL_ARCH(arch) {
_cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
- const char *c;
r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
if (r < 0)