1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 Copyright 2014 Lennart Poettering
7 #include <linux/seccomp.h>
11 #include <sys/prctl.h>
15 #include "alloc-util.h"
18 #include "process-util.h"
19 #include "seccomp-util.h"
21 #include "string-util.h"
24 #include "errno-list.h"
26 const uint32_t seccomp_local_archs
[] = {
28 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
30 #if defined(__x86_64__) && defined(__ILP32__)
33 SCMP_ARCH_X32
, /* native */
34 #elif defined(__x86_64__) && !defined(__ILP32__)
37 SCMP_ARCH_X86_64
, /* native */
38 #elif defined(__i386__)
40 #elif defined(__aarch64__)
42 SCMP_ARCH_AARCH64
, /* native */
43 #elif defined(__arm__)
45 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
47 SCMP_ARCH_MIPS
, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
50 SCMP_ARCH_MIPSEL
, /* native */
51 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
54 SCMP_ARCH_MIPSEL64N32
,
57 SCMP_ARCH_MIPS64
, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
62 SCMP_ARCH_MIPSEL64N32
,
64 SCMP_ARCH_MIPSEL64
, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPSEL64N32
,
71 SCMP_ARCH_MIPS64N32
, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
78 SCMP_ARCH_MIPSEL64N32
, /* native */
79 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
82 SCMP_ARCH_PPC64
, /* native */
83 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
86 SCMP_ARCH_PPC64LE
, /* native */
87 #elif defined(__powerpc__)
89 #elif defined(__s390x__)
91 SCMP_ARCH_S390X
, /* native */
92 #elif defined(__s390__)
98 const char* seccomp_arch_to_string(uint32_t c
) {
99 /* Maintain order used in <seccomp.h>.
101 * Names used here should be the same as those used for ConditionArchitecture=,
102 * except for "subarchitectures" like x32. */
105 case SCMP_ARCH_NATIVE
:
109 case SCMP_ARCH_X86_64
:
115 case SCMP_ARCH_AARCH64
:
119 case SCMP_ARCH_MIPS64
:
121 case SCMP_ARCH_MIPS64N32
:
123 case SCMP_ARCH_MIPSEL
:
125 case SCMP_ARCH_MIPSEL64
:
127 case SCMP_ARCH_MIPSEL64N32
:
128 return "mips64-le-n32";
131 case SCMP_ARCH_PPC64
:
133 case SCMP_ARCH_PPC64LE
:
137 case SCMP_ARCH_S390X
:
144 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
150 if (streq(n
, "native"))
151 *ret
= SCMP_ARCH_NATIVE
;
152 else if (streq(n
, "x86"))
153 *ret
= SCMP_ARCH_X86
;
154 else if (streq(n
, "x86-64"))
155 *ret
= SCMP_ARCH_X86_64
;
156 else if (streq(n
, "x32"))
157 *ret
= SCMP_ARCH_X32
;
158 else if (streq(n
, "arm"))
159 *ret
= SCMP_ARCH_ARM
;
160 else if (streq(n
, "arm64"))
161 *ret
= SCMP_ARCH_AARCH64
;
162 else if (streq(n
, "mips"))
163 *ret
= SCMP_ARCH_MIPS
;
164 else if (streq(n
, "mips64"))
165 *ret
= SCMP_ARCH_MIPS64
;
166 else if (streq(n
, "mips64-n32"))
167 *ret
= SCMP_ARCH_MIPS64N32
;
168 else if (streq(n
, "mips-le"))
169 *ret
= SCMP_ARCH_MIPSEL
;
170 else if (streq(n
, "mips64-le"))
171 *ret
= SCMP_ARCH_MIPSEL64
;
172 else if (streq(n
, "mips64-le-n32"))
173 *ret
= SCMP_ARCH_MIPSEL64N32
;
174 else if (streq(n
, "ppc"))
175 *ret
= SCMP_ARCH_PPC
;
176 else if (streq(n
, "ppc64"))
177 *ret
= SCMP_ARCH_PPC64
;
178 else if (streq(n
, "ppc64-le"))
179 *ret
= SCMP_ARCH_PPC64LE
;
180 else if (streq(n
, "s390"))
181 *ret
= SCMP_ARCH_S390
;
182 else if (streq(n
, "s390x"))
183 *ret
= SCMP_ARCH_S390X
;
190 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
191 scmp_filter_ctx seccomp
;
194 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
195 * any others. Also, turns off the NNP fiddling. */
197 seccomp
= seccomp_init(default_action
);
201 if (arch
!= SCMP_ARCH_NATIVE
&&
202 arch
!= seccomp_arch_native()) {
204 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
208 r
= seccomp_arch_add(seccomp
, arch
);
212 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
213 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
214 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
216 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
217 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
220 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
224 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
232 seccomp_release(seccomp
);
236 static bool is_basic_seccomp_available(void) {
237 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
240 static bool is_seccomp_filter_available(void) {
241 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
245 bool is_seccomp_available(void) {
246 static int cached_enabled
= -1;
248 if (cached_enabled
< 0)
250 is_basic_seccomp_available() &&
251 is_seccomp_filter_available();
253 return cached_enabled
;
256 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
257 [SYSCALL_FILTER_SET_DEFAULT
] = {
259 .help
= "System calls that are always permitted",
286 "getrlimit\0" /* make sure processes can query stack size and such */
307 [SYSCALL_FILTER_SET_AIO
] = {
309 .help
= "Asynchronous IO",
317 [SYSCALL_FILTER_SET_BASIC_IO
] = {
338 [SYSCALL_FILTER_SET_CHOWN
] = {
340 .help
= "Change ownership of files and directories",
350 [SYSCALL_FILTER_SET_CLOCK
] = {
352 .help
= "Change the system time",
360 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
361 .name
= "@cpu-emulation",
362 .help
= "System calls for CPU emulation functionality",
370 [SYSCALL_FILTER_SET_DEBUG
] = {
372 .help
= "Debugging, performance monitoring and tracing functionality",
377 "process_vm_writev\0"
380 #ifdef __NR_s390_runtime_instr
381 "s390_runtime_instr\0"
383 "sys_debug_setcontext\0"
385 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
386 .name
= "@file-system",
387 .help
= "File system operations",
417 "inotify_add_watch\0"
468 [SYSCALL_FILTER_SET_IO_EVENT
] = {
470 .help
= "Event loop system calls",
487 [SYSCALL_FILTER_SET_IPC
] = {
489 .help
= "SysV IPC, POSIX Message Queues or other IPC",
506 "process_vm_writev\0"
516 [SYSCALL_FILTER_SET_KEYRING
] = {
518 .help
= "Kernel keyring access",
524 [SYSCALL_FILTER_SET_MEMLOCK
] = {
526 .help
= "Memory locking control",
534 [SYSCALL_FILTER_SET_MODULE
] = {
536 .help
= "Loading and unloading of kernel modules",
542 [SYSCALL_FILTER_SET_MOUNT
] = {
544 .help
= "Mounting and unmounting of file systems",
552 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
553 .name
= "@network-io",
554 .help
= "Network or Unix socket IO, should not be needed if not network facing",
578 [SYSCALL_FILTER_SET_OBSOLETE
] = {
579 /* some unknown even to libseccomp */
581 .help
= "Unusual, obsolete or unimplemented system calls",
610 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
611 .name
= "@privileged",
612 .help
= "All system calls which need super-user capabilities",
642 [SYSCALL_FILTER_SET_PROCESS
] = {
644 .help
= "Process control, execution, namespaceing operations",
647 "capget\0" /* Able to query arbitrary processes */
655 "rt_tgsigqueueinfo\0"
666 [SYSCALL_FILTER_SET_RAW_IO
] = {
668 .help
= "Raw I/O port access",
675 #ifdef __NR_s390_pci_mmio_read
676 "s390_pci_mmio_read\0"
678 #ifdef __NR_s390_pci_mmio_write
679 "s390_pci_mmio_write\0"
682 [SYSCALL_FILTER_SET_REBOOT
] = {
684 .help
= "Reboot and reboot preparation/kexec",
690 [SYSCALL_FILTER_SET_RESOURCES
] = {
691 .name
= "@resources",
692 .help
= "Alter resource settings",
699 "sched_setaffinity\0"
702 "sched_setscheduler\0"
707 [SYSCALL_FILTER_SET_SETUID
] = {
709 .help
= "Operations for changing user/group credentials",
726 [SYSCALL_FILTER_SET_SIGNAL
] = {
728 .help
= "Process signal handling",
744 [SYSCALL_FILTER_SET_SWAP
] = {
746 .help
= "Enable/disable swap devices",
751 [SYSCALL_FILTER_SET_SYNC
] = {
753 .help
= "Synchronize files and memory to storage",
762 [SYSCALL_FILTER_SET_TIMER
] = {
764 .help
= "Schedule operations by time",
781 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
784 if (isempty(name
) || name
[0] != '@')
787 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
788 if (streq(syscall_filter_sets
[i
].name
, name
))
789 return syscall_filter_sets
+ i
;
794 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp
, const SyscallFilterSet
*set
, uint32_t action
, char **exclude
);
796 int seccomp_add_syscall_filter_item(scmp_filter_ctx
*seccomp
, const char *name
, uint32_t action
, char **exclude
) {
802 if (strv_contains(exclude
, name
))
805 if (name
[0] == '@') {
806 const SyscallFilterSet
*other
;
808 other
= syscall_filter_set_find(name
);
810 log_debug("Filter set %s is not known!", name
);
814 r
= seccomp_add_syscall_filter_set(seccomp
, other
, action
, exclude
);
820 id
= seccomp_syscall_resolve_name(name
);
821 if (id
== __NR_SCMP_ERROR
) {
822 log_debug("System call %s is not known, ignoring.", name
);
826 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
828 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
829 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", name
, id
);
835 static int seccomp_add_syscall_filter_set(
836 scmp_filter_ctx seccomp
,
837 const SyscallFilterSet
*set
,
847 NULSTR_FOREACH(sys
, set
->value
) {
848 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
);
856 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
862 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
863 * each local arch. */
865 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
866 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
868 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
870 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
874 r
= seccomp_add_syscall_filter_set(seccomp
, set
, action
, NULL
);
876 log_debug_errno(r
, "Failed to add filter set, ignoring: %m");
880 r
= seccomp_load(seccomp
);
881 if (IN_SET(r
, -EPERM
, -EACCES
))
884 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
890 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* set
, uint32_t action
) {
894 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
895 * SyscallFilterSet* table. */
897 if (hashmap_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
900 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
901 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
905 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
907 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
911 HASHMAP_FOREACH_KEY(val
, id
, set
, i
) {
913 int e
= PTR_TO_INT(val
);
915 if (action
!= SCMP_ACT_ALLOW
&& e
>= 0)
916 a
= SCMP_ACT_ERRNO(e
);
918 r
= seccomp_rule_add_exact(seccomp
, a
, PTR_TO_INT(id
) - 1, 0);
920 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
921 _cleanup_free_
char *n
= NULL
;
923 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
924 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n
), PTR_TO_INT(id
) - 1);
928 r
= seccomp_load(seccomp
);
929 if (IN_SET(r
, -EPERM
, -EACCES
))
932 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
938 int seccomp_parse_syscall_filter_full(
942 SeccompParseFlags flags
,
944 const char *filename
,
952 if (name
[0] == '@') {
953 const SyscallFilterSet
*set
;
956 set
= syscall_filter_set_find(name
);
958 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
961 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
962 "Unknown system call group, ignoring: %s", name
);
966 NULSTR_FOREACH(i
, set
->value
) {
967 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
968 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
969 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
971 r
= seccomp_parse_syscall_filter_full(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
978 id
= seccomp_syscall_resolve_name(name
);
979 if (id
== __NR_SCMP_ERROR
) {
980 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
983 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
984 "Failed to parse system call, ignoring: %s", name
);
988 /* If we previously wanted to forbid a syscall and now
989 * we want to allow it, then remove it from the list. */
990 if (!(flags
& SECCOMP_PARSE_INVERT
) == !!(flags
& SECCOMP_PARSE_WHITELIST
)) {
991 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
993 return flags
& SECCOMP_PARSE_LOG
? log_oom() : -ENOMEM
;
995 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1001 int seccomp_restrict_namespaces(unsigned long retain
) {
1005 if (DEBUG_LOGGING
) {
1006 _cleanup_free_
char *s
= NULL
;
1008 (void) namespace_flags_to_string(retain
, &s
);
1009 log_debug("Restricting namespace to: %s.", strna(s
));
1013 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
1016 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1017 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1020 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1022 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1026 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1027 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1029 r
= seccomp_rule_add_exact(
1031 SCMP_ACT_ERRNO(EPERM
),
1035 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1036 * special invocation with a zero flags argument, right here. */
1037 r
= seccomp_rule_add_exact(
1039 SCMP_ACT_ERRNO(EPERM
),
1042 SCMP_A1(SCMP_CMP_EQ
, 0));
1044 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1048 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
1051 f
= namespace_flag_map
[i
].flag
;
1052 if ((retain
& f
) == f
) {
1053 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1057 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1059 r
= seccomp_rule_add_exact(
1061 SCMP_ACT_ERRNO(EPERM
),
1064 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1066 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1070 /* On s390/s390x the first two parameters to clone are switched */
1071 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1072 r
= seccomp_rule_add_exact(
1074 SCMP_ACT_ERRNO(EPERM
),
1077 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1079 r
= seccomp_rule_add_exact(
1081 SCMP_ACT_ERRNO(EPERM
),
1084 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1086 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1090 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1091 r
= seccomp_rule_add_exact(
1093 SCMP_ACT_ERRNO(EPERM
),
1096 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1098 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1106 r
= seccomp_load(seccomp
);
1107 if (IN_SET(r
, -EPERM
, -EACCES
))
1110 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1116 int seccomp_protect_sysctl(void) {
1120 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1121 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1123 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1125 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
1126 /* No _sysctl syscall */
1129 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1133 r
= seccomp_rule_add_exact(
1135 SCMP_ACT_ERRNO(EPERM
),
1139 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1143 r
= seccomp_load(seccomp
);
1144 if (IN_SET(r
, -EPERM
, -EACCES
))
1147 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1153 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
1157 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1158 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1162 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1166 case SCMP_ARCH_X86_64
:
1169 case SCMP_ARCH_AARCH64
:
1171 case SCMP_ARCH_PPC64
:
1172 case SCMP_ARCH_PPC64LE
:
1173 case SCMP_ARCH_MIPSEL64N32
:
1174 case SCMP_ARCH_MIPS64N32
:
1175 case SCMP_ARCH_MIPSEL64
:
1176 case SCMP_ARCH_MIPS64
:
1177 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1181 case SCMP_ARCH_S390
:
1182 case SCMP_ARCH_S390X
:
1184 case SCMP_ARCH_MIPSEL
:
1185 case SCMP_ARCH_MIPS
:
1187 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1196 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1201 int af
, first
= 0, last
= 0;
1204 /* If this is a whitelist, we first block the address families that are out of range and then
1205 * everything that is not in the set. First, we find the lowest and highest address family in
1208 SET_FOREACH(afp
, address_families
, i
) {
1209 af
= PTR_TO_INT(afp
);
1211 if (af
<= 0 || af
>= af_max())
1214 if (first
== 0 || af
< first
)
1217 if (last
== 0 || af
> last
)
1221 assert((first
== 0) == (last
== 0));
1225 /* No entries in the valid range, block everything */
1226 r
= seccomp_rule_add_exact(
1228 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1232 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1238 /* Block everything below the first entry */
1239 r
= seccomp_rule_add_exact(
1241 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1244 SCMP_A0(SCMP_CMP_LT
, first
));
1246 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1250 /* Block everything above the last entry */
1251 r
= seccomp_rule_add_exact(
1253 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1256 SCMP_A0(SCMP_CMP_GT
, last
));
1258 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1262 /* Block everything between the first and last entry */
1263 for (af
= 1; af
< af_max(); af
++) {
1265 if (set_contains(address_families
, INT_TO_PTR(af
)))
1268 r
= seccomp_rule_add_exact(
1270 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1273 SCMP_A0(SCMP_CMP_EQ
, af
));
1278 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1286 /* If this is a blacklist, then generate one rule for
1287 * each address family that are then combined in OR
1290 SET_FOREACH(af
, address_families
, i
) {
1292 r
= seccomp_rule_add_exact(
1294 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1297 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1302 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1307 r
= seccomp_load(seccomp
);
1308 if (IN_SET(r
, -EPERM
, -EACCES
))
1311 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1317 int seccomp_restrict_realtime(void) {
1318 static const int permitted_policies
[] = {
1324 int r
, max_policy
= 0;
1328 /* Determine the highest policy constant we want to allow */
1329 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1330 if (permitted_policies
[i
] > max_policy
)
1331 max_policy
= permitted_policies
[i
];
1333 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1334 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1337 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1339 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1343 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1345 for (p
= 0; p
< max_policy
; p
++) {
1348 /* Check if this is in the whitelist. */
1349 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1350 if (permitted_policies
[i
] == p
) {
1358 /* Deny this policy */
1359 r
= seccomp_rule_add_exact(
1361 SCMP_ACT_ERRNO(EPERM
),
1362 SCMP_SYS(sched_setscheduler
),
1364 SCMP_A1(SCMP_CMP_EQ
, p
));
1366 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1371 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1372 * unsigned here, hence no need to check for < 0 values. */
1373 r
= seccomp_rule_add_exact(
1375 SCMP_ACT_ERRNO(EPERM
),
1376 SCMP_SYS(sched_setscheduler
),
1378 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1380 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1384 r
= seccomp_load(seccomp
);
1385 if (IN_SET(r
, -EPERM
, -EACCES
))
1388 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1394 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1397 unsigned int arg_cnt
,
1398 const struct scmp_arg_cmp arg
) {
1401 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1403 _cleanup_free_
char *n
= NULL
;
1405 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1406 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1408 seccomp_arch_to_string(arch
));
1414 /* For known architectures, check that syscalls are indeed defined or not. */
1415 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1416 assert_cc(SCMP_SYS(shmget
) > 0);
1417 assert_cc(SCMP_SYS(shmat
) > 0);
1418 assert_cc(SCMP_SYS(shmdt
) > 0);
1419 #elif defined(__i386__) || defined(__powerpc64__)
1420 assert_cc(SCMP_SYS(shmget
) < 0);
1421 assert_cc(SCMP_SYS(shmat
) < 0);
1422 assert_cc(SCMP_SYS(shmdt
) < 0);
1425 int seccomp_memory_deny_write_execute(void) {
1430 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1431 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1432 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0;
1434 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1439 filter_syscall
= SCMP_SYS(mmap2
);
1440 block_syscall
= SCMP_SYS(mmap
);
1444 case SCMP_ARCH_PPC64
:
1445 case SCMP_ARCH_PPC64LE
:
1446 filter_syscall
= SCMP_SYS(mmap
);
1448 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1449 * We ignore that here, which means there's still a way to get writable/executable
1450 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1455 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1456 shmat_syscall
= SCMP_SYS(shmat
);
1459 case SCMP_ARCH_X86_64
:
1461 case SCMP_ARCH_AARCH64
:
1462 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, and arm64 have only mmap */
1463 shmat_syscall
= SCMP_SYS(shmat
);
1466 /* Please add more definitions here, if you port systemd to other architectures! */
1468 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1469 #warning "Consider adding the right mmap() syscall definitions here!"
1473 /* Can't filter mmap() on this arch, then skip it */
1474 if (filter_syscall
== 0)
1477 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1481 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1483 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1487 if (block_syscall
!= 0) {
1488 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1493 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1495 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1499 #ifdef __NR_pkey_mprotect
1500 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1502 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1507 if (shmat_syscall
!= 0) {
1508 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(shmat
),
1510 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1515 r
= seccomp_load(seccomp
);
1516 if (IN_SET(r
, -EPERM
, -EACCES
))
1519 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1525 int seccomp_restrict_archs(Set
*archs
) {
1526 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1531 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1534 * There are some qualifications. However the most important use is to stop processes from bypassing
1535 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1536 * in a non-native architecture. There are no holes in this use case, at least so far. */
1538 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1539 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1540 * to run a program with the restrictions applied. */
1541 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1545 SET_FOREACH(id
, archs
, i
) {
1546 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1547 if (r
< 0 && r
!= -EEXIST
)
1551 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1552 * syscalls should basically match x86-64 for everything except the pointer type.
1553 * The important thing is that you can block the old 32-bit x86 syscalls.
1554 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1556 if (seccomp_arch_native() == SCMP_ARCH_X32
||
1557 set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1))) {
1559 r
= seccomp_arch_add(seccomp
, SCMP_ARCH_X86_64
);
1560 if (r
< 0 && r
!= -EEXIST
)
1564 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1568 r
= seccomp_load(seccomp
);
1569 if (IN_SET(r
, -EPERM
, -EACCES
))
1572 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1577 int parse_syscall_archs(char **l
, Set
**archs
) {
1578 _cleanup_set_free_ Set
*_archs
;
1585 r
= set_ensure_allocated(&_archs
, NULL
);
1589 STRV_FOREACH(s
, l
) {
1592 r
= seccomp_arch_from_string(*s
, &a
);
1596 r
= set_put(_archs
, UINT32_TO_PTR(a
+ 1));
1601 *archs
= TAKE_PTR(_archs
);
1606 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1612 NULSTR_FOREACH(i
, set
->value
) {
1615 const SyscallFilterSet
*more
;
1617 more
= syscall_filter_set_find(i
);
1621 r
= seccomp_filter_set_add(filter
, add
, more
);
1627 id
= seccomp_syscall_resolve_name(i
);
1628 if (id
== __NR_SCMP_ERROR
) {
1629 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1634 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1638 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1645 int seccomp_lock_personality(unsigned long personality
) {
1649 if (personality
>= PERSONALITY_INVALID
)
1652 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1653 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1655 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1659 r
= seccomp_rule_add_exact(
1661 SCMP_ACT_ERRNO(EPERM
),
1662 SCMP_SYS(personality
),
1664 SCMP_A0(SCMP_CMP_NE
, personality
));
1666 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1670 r
= seccomp_load(seccomp
);
1671 if (IN_SET(r
, -EPERM
, -EACCES
))
1674 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));