1 /* SPDX-License-Identifier: LGPL-2.1+ */
5 #include <linux/seccomp.h>
14 #include "alloc-util.h"
15 #include "errno-list.h"
18 #include "nulstr-util.h"
19 #include "process-util.h"
20 #include "seccomp-util.h"
22 #include "string-util.h"
25 const uint32_t seccomp_local_archs
[] = {
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
29 #if defined(__x86_64__) && defined(__ILP32__)
32 SCMP_ARCH_X32
, /* native */
33 #elif defined(__x86_64__) && !defined(__ILP32__)
36 SCMP_ARCH_X86_64
, /* native */
37 #elif defined(__i386__)
39 #elif defined(__aarch64__)
41 SCMP_ARCH_AARCH64
, /* native */
42 #elif defined(__arm__)
44 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPS
, /* native */
47 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
49 SCMP_ARCH_MIPSEL
, /* native */
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
53 SCMP_ARCH_MIPSEL64N32
,
56 SCMP_ARCH_MIPS64
, /* native */
57 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
61 SCMP_ARCH_MIPSEL64N32
,
63 SCMP_ARCH_MIPSEL64
, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
69 SCMP_ARCH_MIPSEL64N32
,
70 SCMP_ARCH_MIPS64N32
, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
77 SCMP_ARCH_MIPSEL64N32
, /* native */
78 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
81 SCMP_ARCH_PPC64
, /* native */
82 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
85 SCMP_ARCH_PPC64LE
, /* native */
86 #elif defined(__powerpc__)
88 #elif defined(__s390x__)
90 SCMP_ARCH_S390X
, /* native */
91 #elif defined(__s390__)
97 const char* seccomp_arch_to_string(uint32_t c
) {
98 /* Maintain order used in <seccomp.h>.
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
104 case SCMP_ARCH_NATIVE
:
108 case SCMP_ARCH_X86_64
:
114 case SCMP_ARCH_AARCH64
:
118 case SCMP_ARCH_MIPS64
:
120 case SCMP_ARCH_MIPS64N32
:
122 case SCMP_ARCH_MIPSEL
:
124 case SCMP_ARCH_MIPSEL64
:
126 case SCMP_ARCH_MIPSEL64N32
:
127 return "mips64-le-n32";
130 case SCMP_ARCH_PPC64
:
132 case SCMP_ARCH_PPC64LE
:
136 case SCMP_ARCH_S390X
:
143 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
149 if (streq(n
, "native"))
150 *ret
= SCMP_ARCH_NATIVE
;
151 else if (streq(n
, "x86"))
152 *ret
= SCMP_ARCH_X86
;
153 else if (streq(n
, "x86-64"))
154 *ret
= SCMP_ARCH_X86_64
;
155 else if (streq(n
, "x32"))
156 *ret
= SCMP_ARCH_X32
;
157 else if (streq(n
, "arm"))
158 *ret
= SCMP_ARCH_ARM
;
159 else if (streq(n
, "arm64"))
160 *ret
= SCMP_ARCH_AARCH64
;
161 else if (streq(n
, "mips"))
162 *ret
= SCMP_ARCH_MIPS
;
163 else if (streq(n
, "mips64"))
164 *ret
= SCMP_ARCH_MIPS64
;
165 else if (streq(n
, "mips64-n32"))
166 *ret
= SCMP_ARCH_MIPS64N32
;
167 else if (streq(n
, "mips-le"))
168 *ret
= SCMP_ARCH_MIPSEL
;
169 else if (streq(n
, "mips64-le"))
170 *ret
= SCMP_ARCH_MIPSEL64
;
171 else if (streq(n
, "mips64-le-n32"))
172 *ret
= SCMP_ARCH_MIPSEL64N32
;
173 else if (streq(n
, "ppc"))
174 *ret
= SCMP_ARCH_PPC
;
175 else if (streq(n
, "ppc64"))
176 *ret
= SCMP_ARCH_PPC64
;
177 else if (streq(n
, "ppc64-le"))
178 *ret
= SCMP_ARCH_PPC64LE
;
179 else if (streq(n
, "s390"))
180 *ret
= SCMP_ARCH_S390
;
181 else if (streq(n
, "s390x"))
182 *ret
= SCMP_ARCH_S390X
;
189 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
190 scmp_filter_ctx seccomp
;
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
196 seccomp
= seccomp_init(default_action
);
200 if (arch
!= SCMP_ARCH_NATIVE
&&
201 arch
!= seccomp_arch_native()) {
203 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
207 r
= seccomp_arch_add(seccomp
, arch
);
211 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
212 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
213 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
215 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
216 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
219 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
223 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
231 seccomp_release(seccomp
);
235 static bool is_basic_seccomp_available(void) {
236 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
239 static bool is_seccomp_filter_available(void) {
240 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
244 bool is_seccomp_available(void) {
245 static int cached_enabled
= -1;
247 if (cached_enabled
< 0)
249 is_basic_seccomp_available() &&
250 is_seccomp_filter_available();
252 return cached_enabled
;
255 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
256 [SYSCALL_FILTER_SET_DEFAULT
] = {
258 .help
= "System calls that are always permitted",
261 "clock_getres_time64\0"
265 "clock_nanosleep_time64\0"
289 "getrlimit\0" /* make sure processes can query stack size and such */
311 [SYSCALL_FILTER_SET_AIO
] = {
313 .help
= "Asynchronous IO",
319 "io_pgetevents_time64\0"
323 "io_uring_register\0"
326 [SYSCALL_FILTER_SET_BASIC_IO
] = {
347 [SYSCALL_FILTER_SET_CHOWN
] = {
349 .help
= "Change ownership of files and directories",
359 [SYSCALL_FILTER_SET_CLOCK
] = {
361 .help
= "Change the system time",
371 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
372 .name
= "@cpu-emulation",
373 .help
= "System calls for CPU emulation functionality",
381 [SYSCALL_FILTER_SET_DEBUG
] = {
383 .help
= "Debugging, performance monitoring and tracing functionality",
390 #ifdef __NR_s390_runtime_instr
391 "s390_runtime_instr\0"
393 "sys_debug_setcontext\0"
395 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
396 .name
= "@file-system",
397 .help
= "File system operations",
427 "inotify_add_watch\0"
480 [SYSCALL_FILTER_SET_IO_EVENT
] = {
482 .help
= "Event loop system calls",
501 [SYSCALL_FILTER_SET_IPC
] = {
503 .help
= "SysV IPC, POSIX Message Queues or other IPC",
511 "mq_timedreceive_time64\0"
513 "mq_timedsend_time64\0"
522 "process_vm_writev\0"
527 "semtimedop_time64\0"
533 [SYSCALL_FILTER_SET_KEYRING
] = {
535 .help
= "Kernel keyring access",
541 [SYSCALL_FILTER_SET_MEMLOCK
] = {
543 .help
= "Memory locking control",
551 [SYSCALL_FILTER_SET_MODULE
] = {
553 .help
= "Loading and unloading of kernel modules",
559 [SYSCALL_FILTER_SET_MOUNT
] = {
561 .help
= "Mounting and unmounting of file systems",
575 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
576 .name
= "@network-io",
577 .help
= "Network or Unix socket IO, should not be needed if not network facing",
602 [SYSCALL_FILTER_SET_OBSOLETE
] = {
603 /* some unknown even to libseccomp */
605 .help
= "Unusual, obsolete or unimplemented system calls",
634 [SYSCALL_FILTER_SET_PKEY
] = {
636 .help
= "System calls used for memory protection keys",
642 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
643 .name
= "@privileged",
644 .help
= "All system calls which need super-user capabilities",
660 "open_by_handle_at\0"
673 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
677 [SYSCALL_FILTER_SET_PROCESS
] = {
679 .help
= "Process control, execution, namespaceing operations",
682 "capget\0" /* Able to query arbitrary processes */
690 "pidfd_send_signal\0"
693 "rt_tgsigqueueinfo\0"
695 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
705 [SYSCALL_FILTER_SET_RAW_IO
] = {
707 .help
= "Raw I/O port access",
714 #ifdef __NR_s390_pci_mmio_read
715 "s390_pci_mmio_read\0"
717 #ifdef __NR_s390_pci_mmio_write
718 "s390_pci_mmio_write\0"
721 [SYSCALL_FILTER_SET_REBOOT
] = {
723 .help
= "Reboot and reboot preparation/kexec",
729 [SYSCALL_FILTER_SET_RESOURCES
] = {
730 .name
= "@resources",
731 .help
= "Alter resource settings",
738 "sched_setaffinity\0"
741 "sched_setscheduler\0"
746 [SYSCALL_FILTER_SET_SETUID
] = {
748 .help
= "Operations for changing user/group credentials",
765 [SYSCALL_FILTER_SET_SIGNAL
] = {
767 .help
= "Process signal handling",
774 "rt_sigtimedwait_time64\0"
784 [SYSCALL_FILTER_SET_SWAP
] = {
786 .help
= "Enable/disable swap devices",
791 [SYSCALL_FILTER_SET_SYNC
] = {
793 .help
= "Synchronize files and memory to storage",
803 [SYSCALL_FILTER_SET_SYSTEM_SERVICE
] = {
804 .name
= "@system-service",
805 .help
= "General system service operations",
840 "name_to_handle_at\0"
847 "sched_get_priority_max\0"
848 "sched_get_priority_min\0"
849 "sched_getaffinity\0"
852 "sched_getscheduler\0"
853 "sched_rr_get_interval\0"
854 "sched_rr_get_interval_time64\0"
872 [SYSCALL_FILTER_SET_TIMER
] = {
874 .help
= "Schedule operations by time",
888 "timerfd_gettime64\0"
890 "timerfd_settime64\0"
895 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
898 if (isempty(name
) || name
[0] != '@')
901 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
902 if (streq(syscall_filter_sets
[i
].name
, name
))
903 return syscall_filter_sets
+ i
;
908 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp
, const SyscallFilterSet
*set
, uint32_t action
, char **exclude
, bool log_missing
);
910 int seccomp_add_syscall_filter_item(scmp_filter_ctx
*seccomp
, const char *name
, uint32_t action
, char **exclude
, bool log_missing
) {
914 if (strv_contains(exclude
, name
))
917 if (name
[0] == '@') {
918 const SyscallFilterSet
*other
;
920 other
= syscall_filter_set_find(name
);
922 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
923 "Filter set %s is not known!",
926 return seccomp_add_syscall_filter_set(seccomp
, other
, action
, exclude
, log_missing
);
931 id
= seccomp_syscall_resolve_name(name
);
932 if (id
== __NR_SCMP_ERROR
) {
934 log_debug("System call %s is not known, ignoring.", name
);
938 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
940 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
941 bool ignore
= r
== -EDOM
;
943 if (!ignore
|| log_missing
)
944 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
945 name
, id
, ignore
? ", ignoring" : "");
954 static int seccomp_add_syscall_filter_set(
955 scmp_filter_ctx seccomp
,
956 const SyscallFilterSet
*set
,
967 NULSTR_FOREACH(sys
, set
->value
) {
968 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
, log_missing
);
976 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
, bool log_missing
) {
982 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
983 * each local arch. */
985 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
986 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
988 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
990 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
994 r
= seccomp_add_syscall_filter_set(seccomp
, set
, action
, NULL
, log_missing
);
996 return log_debug_errno(r
, "Failed to add filter set: %m");
998 r
= seccomp_load(seccomp
);
999 if (ERRNO_IS_SECCOMP_FATAL(r
))
1002 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1008 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* set
, uint32_t action
, bool log_missing
) {
1012 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1013 * SyscallFilterSet* table. */
1015 if (hashmap_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
1018 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1019 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1021 void *syscall_id
, *val
;
1023 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1025 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1029 HASHMAP_FOREACH_KEY(val
, syscall_id
, set
, i
) {
1030 uint32_t a
= action
;
1031 int id
= PTR_TO_INT(syscall_id
) - 1;
1032 int error
= PTR_TO_INT(val
);
1034 if (action
!= SCMP_ACT_ALLOW
&& error
>= 0)
1035 a
= SCMP_ACT_ERRNO(error
);
1037 r
= seccomp_rule_add_exact(seccomp
, a
, id
, 0);
1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1040 _cleanup_free_
char *n
= NULL
;
1043 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, id
);
1044 ignore
= r
== -EDOM
;
1045 if (!ignore
|| log_missing
)
1046 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1047 strna(n
), id
, ignore
? ", ignoring" : "");
1053 r
= seccomp_load(seccomp
);
1054 if (ERRNO_IS_SECCOMP_FATAL(r
))
1057 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1063 int seccomp_parse_syscall_filter(
1067 SeccompParseFlags flags
,
1069 const char *filename
,
1077 if (name
[0] == '@') {
1078 const SyscallFilterSet
*set
;
1081 set
= syscall_filter_set_find(name
);
1083 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
1086 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1087 "Unknown system call group, ignoring: %s", name
);
1091 NULSTR_FOREACH(i
, set
->value
) {
1092 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1093 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1094 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1096 r
= seccomp_parse_syscall_filter(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
1103 id
= seccomp_syscall_resolve_name(name
);
1104 if (id
== __NR_SCMP_ERROR
) {
1105 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
1108 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1109 "Failed to parse system call, ignoring: %s", name
);
1113 /* If we previously wanted to forbid a syscall and now
1114 * we want to allow it, then remove it from the list. */
1115 if (!(flags
& SECCOMP_PARSE_INVERT
) == !!(flags
& SECCOMP_PARSE_WHITELIST
)) {
1116 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
1120 return flags
& SECCOMP_PARSE_LOG
? log_oom() : -ENOMEM
;
1122 assert_se(hashmap_update(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
)) == 0);
1128 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1134 int seccomp_restrict_namespaces(unsigned long retain
) {
1138 if (DEBUG_LOGGING
) {
1139 _cleanup_free_
char *s
= NULL
;
1141 (void) namespace_flags_to_string(retain
, &s
);
1142 log_debug("Restricting namespace to: %s.", strna(s
));
1146 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
1149 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1150 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1153 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1155 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1159 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1160 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1162 r
= seccomp_rule_add_exact(
1164 SCMP_ACT_ERRNO(EPERM
),
1168 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1169 * special invocation with a zero flags argument, right here. */
1170 r
= seccomp_rule_add_exact(
1172 SCMP_ACT_ERRNO(EPERM
),
1175 SCMP_A1(SCMP_CMP_EQ
, 0));
1177 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1181 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
1184 f
= namespace_flag_map
[i
].flag
;
1185 if ((retain
& f
) == f
) {
1186 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1190 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1192 r
= seccomp_rule_add_exact(
1194 SCMP_ACT_ERRNO(EPERM
),
1197 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1199 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1203 /* On s390/s390x the first two parameters to clone are switched */
1204 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1205 r
= seccomp_rule_add_exact(
1207 SCMP_ACT_ERRNO(EPERM
),
1210 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1212 r
= seccomp_rule_add_exact(
1214 SCMP_ACT_ERRNO(EPERM
),
1217 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1219 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1223 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1224 r
= seccomp_rule_add_exact(
1226 SCMP_ACT_ERRNO(EPERM
),
1229 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1231 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1239 r
= seccomp_load(seccomp
);
1240 if (ERRNO_IS_SECCOMP_FATAL(r
))
1243 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1249 int seccomp_protect_sysctl(void) {
1253 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1254 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1256 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1258 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
1259 /* No _sysctl syscall */
1262 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1266 r
= seccomp_rule_add_exact(
1268 SCMP_ACT_ERRNO(EPERM
),
1272 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1276 r
= seccomp_load(seccomp
);
1277 if (ERRNO_IS_SECCOMP_FATAL(r
))
1280 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1286 int seccomp_protect_syslog(void) {
1290 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1291 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1293 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1297 r
= seccomp_rule_add_exact(
1299 SCMP_ACT_ERRNO(EPERM
),
1304 log_debug_errno(r
, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1308 r
= seccomp_load(seccomp
);
1309 if (ERRNO_IS_SECCOMP_FATAL(r
))
1312 log_debug_errno(r
, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1318 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
1322 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1323 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1331 case SCMP_ARCH_X86_64
:
1334 case SCMP_ARCH_AARCH64
:
1336 case SCMP_ARCH_PPC64
:
1337 case SCMP_ARCH_PPC64LE
:
1338 case SCMP_ARCH_MIPSEL64N32
:
1339 case SCMP_ARCH_MIPS64N32
:
1340 case SCMP_ARCH_MIPSEL64
:
1341 case SCMP_ARCH_MIPS64
:
1342 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1346 case SCMP_ARCH_S390
:
1347 case SCMP_ARCH_S390X
:
1349 case SCMP_ARCH_MIPSEL
:
1350 case SCMP_ARCH_MIPS
:
1352 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1361 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1366 int af
, first
= 0, last
= 0;
1369 /* If this is a whitelist, we first block the address families that are out of range and then
1370 * everything that is not in the set. First, we find the lowest and highest address family in
1373 SET_FOREACH(afp
, address_families
, i
) {
1374 af
= PTR_TO_INT(afp
);
1376 if (af
<= 0 || af
>= af_max())
1379 if (first
== 0 || af
< first
)
1382 if (last
== 0 || af
> last
)
1386 assert((first
== 0) == (last
== 0));
1390 /* No entries in the valid range, block everything */
1391 r
= seccomp_rule_add_exact(
1393 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1397 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1403 /* Block everything below the first entry */
1404 r
= seccomp_rule_add_exact(
1406 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1409 SCMP_A0(SCMP_CMP_LT
, first
));
1411 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1415 /* Block everything above the last entry */
1416 r
= seccomp_rule_add_exact(
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1421 SCMP_A0(SCMP_CMP_GT
, last
));
1423 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1427 /* Block everything between the first and last entry */
1428 for (af
= 1; af
< af_max(); af
++) {
1430 if (set_contains(address_families
, INT_TO_PTR(af
)))
1433 r
= seccomp_rule_add_exact(
1435 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1438 SCMP_A0(SCMP_CMP_EQ
, af
));
1443 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1451 /* If this is a blacklist, then generate one rule for
1452 * each address family that are then combined in OR
1455 SET_FOREACH(af
, address_families
, i
) {
1457 r
= seccomp_rule_add_exact(
1459 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1462 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1467 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1472 r
= seccomp_load(seccomp
);
1473 if (ERRNO_IS_SECCOMP_FATAL(r
))
1476 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1482 int seccomp_restrict_realtime(void) {
1483 static const int permitted_policies
[] = {
1489 int r
, max_policy
= 0;
1493 /* Determine the highest policy constant we want to allow */
1494 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1495 if (permitted_policies
[i
] > max_policy
)
1496 max_policy
= permitted_policies
[i
];
1498 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1499 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1502 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1504 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1508 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1510 for (p
= 0; p
< max_policy
; p
++) {
1513 /* Check if this is in the whitelist. */
1514 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1515 if (permitted_policies
[i
] == p
) {
1523 /* Deny this policy */
1524 r
= seccomp_rule_add_exact(
1526 SCMP_ACT_ERRNO(EPERM
),
1527 SCMP_SYS(sched_setscheduler
),
1529 SCMP_A1(SCMP_CMP_EQ
, p
));
1531 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1536 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1537 * unsigned here, hence no need to check for < 0 values. */
1538 r
= seccomp_rule_add_exact(
1540 SCMP_ACT_ERRNO(EPERM
),
1541 SCMP_SYS(sched_setscheduler
),
1543 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1545 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1549 r
= seccomp_load(seccomp
);
1550 if (ERRNO_IS_SECCOMP_FATAL(r
))
1553 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1559 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1563 const struct scmp_arg_cmp arg
) {
1566 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1568 _cleanup_free_
char *n
= NULL
;
1570 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1571 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1573 seccomp_arch_to_string(arch
));
1579 /* For known architectures, check that syscalls are indeed defined or not. */
1580 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1581 assert_cc(SCMP_SYS(shmget
) > 0);
1582 assert_cc(SCMP_SYS(shmat
) > 0);
1583 assert_cc(SCMP_SYS(shmdt
) > 0);
1586 int seccomp_memory_deny_write_execute(void) {
1588 unsigned loaded
= 0;
1590 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1591 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1592 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0, r
;
1594 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1598 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1599 * We ignore that here, which means there's still a way to get writable/executable
1600 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1603 case SCMP_ARCH_S390
:
1604 filter_syscall
= SCMP_SYS(mmap2
);
1605 block_syscall
= SCMP_SYS(mmap
);
1606 /* shmat multiplexed, see above */
1610 case SCMP_ARCH_PPC64
:
1611 case SCMP_ARCH_PPC64LE
:
1612 case SCMP_ARCH_S390X
:
1613 filter_syscall
= SCMP_SYS(mmap
);
1614 /* shmat multiplexed, see above */
1618 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1619 shmat_syscall
= SCMP_SYS(shmat
);
1622 case SCMP_ARCH_X86_64
:
1624 case SCMP_ARCH_AARCH64
:
1625 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32 and arm64 have only mmap */
1626 shmat_syscall
= SCMP_SYS(shmat
);
1629 /* Please add more definitions here, if you port systemd to other architectures! */
1631 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
1632 #warning "Consider adding the right mmap() syscall definitions here!"
1636 /* Can't filter mmap() on this arch, then skip it */
1637 if (filter_syscall
== 0)
1640 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1644 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1646 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1650 if (block_syscall
!= 0) {
1651 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1656 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1658 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1662 #ifdef __NR_pkey_mprotect
1663 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1665 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1670 if (shmat_syscall
> 0) {
1671 r
= add_seccomp_syscall_filter(seccomp
, arch
, shmat_syscall
,
1673 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1678 r
= seccomp_load(seccomp
);
1679 if (ERRNO_IS_SECCOMP_FATAL(r
))
1682 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1683 seccomp_arch_to_string(arch
));
1688 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1693 int seccomp_restrict_archs(Set
*archs
) {
1694 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1699 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1702 * There are some qualifications. However the most important use is to stop processes from bypassing
1703 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1704 * in a non-native architecture. There are no holes in this use case, at least so far. */
1706 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1707 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1708 * to run a program with the restrictions applied. */
1709 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1713 SET_FOREACH(id
, archs
, i
) {
1714 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1715 if (r
< 0 && r
!= -EEXIST
)
1719 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1720 * syscalls should basically match x86-64 for everything except the pointer type.
1721 * The important thing is that you can block the old 32-bit x86 syscalls.
1722 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1724 if (seccomp_arch_native() == SCMP_ARCH_X32
||
1725 set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1))) {
1727 r
= seccomp_arch_add(seccomp
, SCMP_ARCH_X86_64
);
1728 if (r
< 0 && r
!= -EEXIST
)
1732 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1736 r
= seccomp_load(seccomp
);
1737 if (ERRNO_IS_SECCOMP_FATAL(r
))
1740 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1745 int parse_syscall_archs(char **l
, Set
**archs
) {
1746 _cleanup_set_free_ Set
*_archs
= NULL
;
1753 r
= set_ensure_allocated(&_archs
, NULL
);
1757 STRV_FOREACH(s
, l
) {
1760 r
= seccomp_arch_from_string(*s
, &a
);
1764 r
= set_put(_archs
, UINT32_TO_PTR(a
+ 1));
1769 *archs
= TAKE_PTR(_archs
);
1774 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1780 NULSTR_FOREACH(i
, set
->value
) {
1783 const SyscallFilterSet
*more
;
1785 more
= syscall_filter_set_find(i
);
1789 r
= seccomp_filter_set_add(filter
, add
, more
);
1795 id
= seccomp_syscall_resolve_name(i
);
1796 if (id
== __NR_SCMP_ERROR
) {
1797 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1802 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1806 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1813 int seccomp_lock_personality(unsigned long personality
) {
1817 if (personality
>= PERSONALITY_INVALID
)
1820 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1821 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1823 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1827 r
= seccomp_rule_add_exact(
1829 SCMP_ACT_ERRNO(EPERM
),
1830 SCMP_SYS(personality
),
1832 SCMP_A0(SCMP_CMP_NE
, personality
));
1834 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1838 r
= seccomp_load(seccomp
);
1839 if (ERRNO_IS_SECCOMP_FATAL(r
))
1842 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1848 int seccomp_protect_hostname(void) {
1852 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1853 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1855 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1859 r
= seccomp_rule_add_exact(
1861 SCMP_ACT_ERRNO(EPERM
),
1862 SCMP_SYS(sethostname
),
1865 log_debug_errno(r
, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1869 r
= seccomp_rule_add_exact(
1871 SCMP_ACT_ERRNO(EPERM
),
1872 SCMP_SYS(setdomainname
),
1875 log_debug_errno(r
, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1879 r
= seccomp_load(seccomp
);
1880 if (ERRNO_IS_SECCOMP_FATAL(r
))
1883 log_debug_errno(r
, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1889 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp
, mode_t m
) {
1890 /* Checks the mode_t parameter of the following system calls:
1892 * → chmod() + fchmod() + fchmodat()
1893 * → open() + creat() + openat()
1894 * → mkdir() + mkdirat()
1895 * → mknod() + mknodat()
1897 * Returns error if *everything* failed, and 0 otherwise.
1902 r
= seccomp_rule_add_exact(
1904 SCMP_ACT_ERRNO(EPERM
),
1907 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1909 log_debug_errno(r
, "Failed to add filter for chmod: %m");
1913 r
= seccomp_rule_add_exact(
1915 SCMP_ACT_ERRNO(EPERM
),
1918 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1920 log_debug_errno(r
, "Failed to add filter for fchmod: %m");
1924 r
= seccomp_rule_add_exact(
1926 SCMP_ACT_ERRNO(EPERM
),
1929 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
1931 log_debug_errno(r
, "Failed to add filter for fchmodat: %m");
1935 r
= seccomp_rule_add_exact(
1937 SCMP_ACT_ERRNO(EPERM
),
1940 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1942 log_debug_errno(r
, "Failed to add filter for mkdir: %m");
1946 r
= seccomp_rule_add_exact(
1948 SCMP_ACT_ERRNO(EPERM
),
1951 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
1953 log_debug_errno(r
, "Failed to add filter for mkdirat: %m");
1957 r
= seccomp_rule_add_exact(
1959 SCMP_ACT_ERRNO(EPERM
),
1962 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1964 log_debug_errno(r
, "Failed to add filter for mknod: %m");
1968 r
= seccomp_rule_add_exact(
1970 SCMP_ACT_ERRNO(EPERM
),
1973 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
1975 log_debug_errno(r
, "Failed to add filter for mknodat: %m");
1979 #if SCMP_SYS(open) > 0
1980 r
= seccomp_rule_add_exact(
1982 SCMP_ACT_ERRNO(EPERM
),
1985 SCMP_A1(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
1986 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
1988 log_debug_errno(r
, "Failed to add filter for open: %m");
1993 r
= seccomp_rule_add_exact(
1995 SCMP_ACT_ERRNO(EPERM
),
1998 SCMP_A2(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
1999 SCMP_A3(SCMP_CMP_MASKED_EQ
, m
, m
));
2001 log_debug_errno(r
, "Failed to add filter for openat: %m");
2005 #if defined(__SNR_openat2)
2006 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2007 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2008 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2009 * compatible with kernels that are not absolutely recent. */
2010 r
= seccomp_rule_add_exact(
2012 SCMP_ACT_ERRNO(EPERM
),
2016 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2021 r
= seccomp_rule_add_exact(
2023 SCMP_ACT_ERRNO(EPERM
),
2026 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2028 log_debug_errno(r
, "Failed to add filter for creat: %m");
2035 int seccomp_restrict_suid_sgid(void) {
2039 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2040 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2042 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2046 r
= seccomp_restrict_sxid(seccomp
, S_ISUID
);
2048 log_debug_errno(r
, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2050 k
= seccomp_restrict_sxid(seccomp
, S_ISGID
);
2052 log_debug_errno(r
, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2057 r
= seccomp_load(seccomp
);
2058 if (ERRNO_IS_SECCOMP_FATAL(r
))
2061 log_debug_errno(r
, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2067 uint32_t scmp_act_kill_process(void) {
2069 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2070 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2071 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2072 * for single-threaded apps does the right thing. */
2074 #ifdef SCMP_ACT_KILL_PROCESS
2075 if (seccomp_api_get() >= 3)
2076 return SCMP_ACT_KILL_PROCESS
;
2079 return SCMP_ACT_KILL
; /* same as SCMP_ACT_KILL_THREAD */