1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2014 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <linux/seccomp.h>
26 #include <sys/prctl.h>
30 #include "alloc-util.h"
33 #include "process-util.h"
34 #include "seccomp-util.h"
36 #include "string-util.h"
39 #include "errno-list.h"
41 const uint32_t seccomp_local_archs
[] = {
43 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
45 #if defined(__x86_64__) && defined(__ILP32__)
48 SCMP_ARCH_X32
, /* native */
49 #elif defined(__x86_64__) && !defined(__ILP32__)
52 SCMP_ARCH_X86_64
, /* native */
53 #elif defined(__i386__)
55 #elif defined(__aarch64__)
57 SCMP_ARCH_AARCH64
, /* native */
58 #elif defined(__arm__)
60 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
62 SCMP_ARCH_MIPS
, /* native */
63 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
65 SCMP_ARCH_MIPSEL
, /* native */
66 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
69 SCMP_ARCH_MIPSEL64N32
,
72 SCMP_ARCH_MIPS64
, /* native */
73 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
77 SCMP_ARCH_MIPSEL64N32
,
79 SCMP_ARCH_MIPSEL64
, /* native */
80 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
85 SCMP_ARCH_MIPSEL64N32
,
86 SCMP_ARCH_MIPS64N32
, /* native */
87 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
93 SCMP_ARCH_MIPSEL64N32
, /* native */
94 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
97 SCMP_ARCH_PPC64
, /* native */
98 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
101 SCMP_ARCH_PPC64LE
, /* native */
102 #elif defined(__powerpc__)
104 #elif defined(__s390x__)
106 SCMP_ARCH_S390X
, /* native */
107 #elif defined(__s390__)
113 const char* seccomp_arch_to_string(uint32_t c
) {
114 /* Maintain order used in <seccomp.h>.
116 * Names used here should be the same as those used for ConditionArchitecture=,
117 * except for "subarchitectures" like x32. */
120 case SCMP_ARCH_NATIVE
:
124 case SCMP_ARCH_X86_64
:
130 case SCMP_ARCH_AARCH64
:
134 case SCMP_ARCH_MIPS64
:
136 case SCMP_ARCH_MIPS64N32
:
138 case SCMP_ARCH_MIPSEL
:
140 case SCMP_ARCH_MIPSEL64
:
142 case SCMP_ARCH_MIPSEL64N32
:
143 return "mips64-le-n32";
146 case SCMP_ARCH_PPC64
:
148 case SCMP_ARCH_PPC64LE
:
152 case SCMP_ARCH_S390X
:
159 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
165 if (streq(n
, "native"))
166 *ret
= SCMP_ARCH_NATIVE
;
167 else if (streq(n
, "x86"))
168 *ret
= SCMP_ARCH_X86
;
169 else if (streq(n
, "x86-64"))
170 *ret
= SCMP_ARCH_X86_64
;
171 else if (streq(n
, "x32"))
172 *ret
= SCMP_ARCH_X32
;
173 else if (streq(n
, "arm"))
174 *ret
= SCMP_ARCH_ARM
;
175 else if (streq(n
, "arm64"))
176 *ret
= SCMP_ARCH_AARCH64
;
177 else if (streq(n
, "mips"))
178 *ret
= SCMP_ARCH_MIPS
;
179 else if (streq(n
, "mips64"))
180 *ret
= SCMP_ARCH_MIPS64
;
181 else if (streq(n
, "mips64-n32"))
182 *ret
= SCMP_ARCH_MIPS64N32
;
183 else if (streq(n
, "mips-le"))
184 *ret
= SCMP_ARCH_MIPSEL
;
185 else if (streq(n
, "mips64-le"))
186 *ret
= SCMP_ARCH_MIPSEL64
;
187 else if (streq(n
, "mips64-le-n32"))
188 *ret
= SCMP_ARCH_MIPSEL64N32
;
189 else if (streq(n
, "ppc"))
190 *ret
= SCMP_ARCH_PPC
;
191 else if (streq(n
, "ppc64"))
192 *ret
= SCMP_ARCH_PPC64
;
193 else if (streq(n
, "ppc64-le"))
194 *ret
= SCMP_ARCH_PPC64LE
;
195 else if (streq(n
, "s390"))
196 *ret
= SCMP_ARCH_S390
;
197 else if (streq(n
, "s390x"))
198 *ret
= SCMP_ARCH_S390X
;
205 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
206 scmp_filter_ctx seccomp
;
209 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
210 * any others. Also, turns off the NNP fiddling. */
212 seccomp
= seccomp_init(default_action
);
216 if (arch
!= SCMP_ARCH_NATIVE
&&
217 arch
!= seccomp_arch_native()) {
219 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
223 r
= seccomp_arch_add(seccomp
, arch
);
227 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
228 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
229 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
231 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
232 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
235 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
239 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
247 seccomp_release(seccomp
);
251 static bool is_basic_seccomp_available(void) {
252 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
255 static bool is_seccomp_filter_available(void) {
256 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
260 bool is_seccomp_available(void) {
261 static int cached_enabled
= -1;
263 if (cached_enabled
< 0)
265 is_basic_seccomp_available() &&
266 is_seccomp_filter_available();
268 return cached_enabled
;
271 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
272 [SYSCALL_FILTER_SET_DEFAULT
] = {
274 .help
= "System calls that are always permitted",
301 "getrlimit\0" /* make sure processes can query stack size and such */
322 [SYSCALL_FILTER_SET_AIO
] = {
324 .help
= "Asynchronous IO",
332 [SYSCALL_FILTER_SET_BASIC_IO
] = {
353 [SYSCALL_FILTER_SET_CHOWN
] = {
355 .help
= "Change ownership of files and directories",
365 [SYSCALL_FILTER_SET_CLOCK
] = {
367 .help
= "Change the system time",
375 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
376 .name
= "@cpu-emulation",
377 .help
= "System calls for CPU emulation functionality",
385 [SYSCALL_FILTER_SET_DEBUG
] = {
387 .help
= "Debugging, performance monitoring and tracing functionality",
392 "process_vm_writev\0"
395 #ifdef __NR_s390_runtime_instr
396 "s390_runtime_instr\0"
398 "sys_debug_setcontext\0"
400 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
401 .name
= "@file-system",
402 .help
= "File system operations",
432 "inotify_add_watch\0"
483 [SYSCALL_FILTER_SET_IO_EVENT
] = {
485 .help
= "Event loop system calls",
502 [SYSCALL_FILTER_SET_IPC
] = {
504 .help
= "SysV IPC, POSIX Message Queues or other IPC",
521 "process_vm_writev\0"
531 [SYSCALL_FILTER_SET_KEYRING
] = {
533 .help
= "Kernel keyring access",
539 [SYSCALL_FILTER_SET_MEMLOCK
] = {
541 .help
= "Memory locking control",
549 [SYSCALL_FILTER_SET_MODULE
] = {
551 .help
= "Loading and unloading of kernel modules",
557 [SYSCALL_FILTER_SET_MOUNT
] = {
559 .help
= "Mounting and unmounting of file systems",
567 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
568 .name
= "@network-io",
569 .help
= "Network or Unix socket IO, should not be needed if not network facing",
593 [SYSCALL_FILTER_SET_OBSOLETE
] = {
594 /* some unknown even to libseccomp */
596 .help
= "Unusual, obsolete or unimplemented system calls",
625 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
626 .name
= "@privileged",
627 .help
= "All system calls which need super-user capabilities",
657 [SYSCALL_FILTER_SET_PROCESS
] = {
659 .help
= "Process control, execution, namespaceing operations",
662 "capget\0" /* Able to query arbitrary processes */
670 "rt_tgsigqueueinfo\0"
681 [SYSCALL_FILTER_SET_RAW_IO
] = {
683 .help
= "Raw I/O port access",
690 #ifdef __NR_s390_pci_mmio_read
691 "s390_pci_mmio_read\0"
693 #ifdef __NR_s390_pci_mmio_write
694 "s390_pci_mmio_write\0"
697 [SYSCALL_FILTER_SET_REBOOT
] = {
699 .help
= "Reboot and reboot preparation/kexec",
705 [SYSCALL_FILTER_SET_RESOURCES
] = {
706 .name
= "@resources",
707 .help
= "Alter resource settings",
714 "sched_setaffinity\0"
717 "sched_setscheduler\0"
722 [SYSCALL_FILTER_SET_SETUID
] = {
724 .help
= "Operations for changing user/group credentials",
741 [SYSCALL_FILTER_SET_SIGNAL
] = {
743 .help
= "Process signal handling",
759 [SYSCALL_FILTER_SET_SWAP
] = {
761 .help
= "Enable/disable swap devices",
766 [SYSCALL_FILTER_SET_SYNC
] = {
768 .help
= "Synchronize files and memory to storage",
777 [SYSCALL_FILTER_SET_TIMER
] = {
779 .help
= "Schedule operations by time",
796 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
799 if (isempty(name
) || name
[0] != '@')
802 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
803 if (streq(syscall_filter_sets
[i
].name
, name
))
804 return syscall_filter_sets
+ i
;
809 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp
, const SyscallFilterSet
*set
, uint32_t action
, char **exclude
);
811 int seccomp_add_syscall_filter_item(scmp_filter_ctx
*seccomp
, const char *name
, uint32_t action
, char **exclude
) {
817 if (strv_contains(exclude
, name
))
820 if (name
[0] == '@') {
821 const SyscallFilterSet
*other
;
823 other
= syscall_filter_set_find(name
);
825 log_debug("Filter set %s is not known!", name
);
829 r
= seccomp_add_syscall_filter_set(seccomp
, other
, action
, exclude
);
835 id
= seccomp_syscall_resolve_name(name
);
836 if (id
== __NR_SCMP_ERROR
) {
837 log_debug("System call %s is not known, ignoring.", name
);
841 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
843 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
844 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", name
, id
);
850 static int seccomp_add_syscall_filter_set(
851 scmp_filter_ctx seccomp
,
852 const SyscallFilterSet
*set
,
862 NULSTR_FOREACH(sys
, set
->value
) {
863 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
);
871 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
877 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
878 * each local arch. */
880 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
881 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
883 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
885 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
889 r
= seccomp_add_syscall_filter_set(seccomp
, set
, action
, NULL
);
891 log_debug_errno(r
, "Failed to add filter set, ignoring: %m");
895 r
= seccomp_load(seccomp
);
896 if (IN_SET(r
, -EPERM
, -EACCES
))
899 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
905 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* set
, uint32_t action
) {
909 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
910 * SyscallFilterSet* table. */
912 if (hashmap_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
915 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
916 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
920 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
922 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
926 HASHMAP_FOREACH_KEY(val
, id
, set
, i
) {
928 int e
= PTR_TO_INT(val
);
930 if (action
!= SCMP_ACT_ALLOW
&& e
>= 0)
931 a
= SCMP_ACT_ERRNO(e
);
933 r
= seccomp_rule_add_exact(seccomp
, a
, PTR_TO_INT(id
) - 1, 0);
935 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
936 _cleanup_free_
char *n
= NULL
;
938 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, PTR_TO_INT(id
) - 1);
939 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n
), PTR_TO_INT(id
) - 1);
943 r
= seccomp_load(seccomp
);
944 if (IN_SET(r
, -EPERM
, -EACCES
))
947 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
953 int seccomp_parse_syscall_filter_full(
957 SeccompParseFlags flags
,
959 const char *filename
,
967 if (name
[0] == '@') {
968 const SyscallFilterSet
*set
;
971 set
= syscall_filter_set_find(name
);
973 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
976 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
977 "Unknown system call group, ignoring: %s", name
);
981 NULSTR_FOREACH(i
, set
->value
) {
982 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
983 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
984 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
986 r
= seccomp_parse_syscall_filter_full(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
993 id
= seccomp_syscall_resolve_name(name
);
994 if (id
== __NR_SCMP_ERROR
) {
995 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
998 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
999 "Failed to parse system call, ignoring: %s", name
);
1003 /* If we previously wanted to forbid a syscall and now
1004 * we want to allow it, then remove it from the list. */
1005 if (!(flags
& SECCOMP_PARSE_INVERT
) == !!(flags
& SECCOMP_PARSE_WHITELIST
)) {
1006 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
1008 return flags
& SECCOMP_PARSE_LOG
? log_oom() : -ENOMEM
;
1010 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1016 int seccomp_restrict_namespaces(unsigned long retain
) {
1020 if (DEBUG_LOGGING
) {
1021 _cleanup_free_
char *s
= NULL
;
1023 (void) namespace_flag_to_string_many(retain
, &s
);
1024 log_debug("Restricting namespace to: %s.", strna(s
));
1028 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
1031 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1032 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1035 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1037 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1041 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1042 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1044 r
= seccomp_rule_add_exact(
1046 SCMP_ACT_ERRNO(EPERM
),
1050 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1051 * special invocation with a zero flags argument, right here. */
1052 r
= seccomp_rule_add_exact(
1054 SCMP_ACT_ERRNO(EPERM
),
1057 SCMP_A1(SCMP_CMP_EQ
, 0));
1059 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1063 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
1066 f
= namespace_flag_map
[i
].flag
;
1067 if ((retain
& f
) == f
) {
1068 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1072 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1074 r
= seccomp_rule_add_exact(
1076 SCMP_ACT_ERRNO(EPERM
),
1079 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1081 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1085 /* On s390/s390x the first two parameters to clone are switched */
1086 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1087 r
= seccomp_rule_add_exact(
1089 SCMP_ACT_ERRNO(EPERM
),
1092 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1094 r
= seccomp_rule_add_exact(
1096 SCMP_ACT_ERRNO(EPERM
),
1099 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1101 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1105 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1106 r
= seccomp_rule_add_exact(
1108 SCMP_ACT_ERRNO(EPERM
),
1111 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1113 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1121 r
= seccomp_load(seccomp
);
1122 if (IN_SET(r
, -EPERM
, -EACCES
))
1125 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1131 int seccomp_protect_sysctl(void) {
1135 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1136 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1138 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1140 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
1141 /* No _sysctl syscall */
1144 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1148 r
= seccomp_rule_add_exact(
1150 SCMP_ACT_ERRNO(EPERM
),
1154 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1158 r
= seccomp_load(seccomp
);
1159 if (IN_SET(r
, -EPERM
, -EACCES
))
1162 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1168 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
1172 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1173 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1177 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1181 case SCMP_ARCH_X86_64
:
1184 case SCMP_ARCH_AARCH64
:
1186 case SCMP_ARCH_PPC64
:
1187 case SCMP_ARCH_PPC64LE
:
1188 case SCMP_ARCH_MIPSEL64N32
:
1189 case SCMP_ARCH_MIPS64N32
:
1190 case SCMP_ARCH_MIPSEL64
:
1191 case SCMP_ARCH_MIPS64
:
1192 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1196 case SCMP_ARCH_S390
:
1197 case SCMP_ARCH_S390X
:
1199 case SCMP_ARCH_MIPSEL
:
1200 case SCMP_ARCH_MIPS
:
1202 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1211 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1216 int af
, first
= 0, last
= 0;
1219 /* If this is a whitelist, we first block the address families that are out of range and then
1220 * everything that is not in the set. First, we find the lowest and highest address family in
1223 SET_FOREACH(afp
, address_families
, i
) {
1224 af
= PTR_TO_INT(afp
);
1226 if (af
<= 0 || af
>= af_max())
1229 if (first
== 0 || af
< first
)
1232 if (last
== 0 || af
> last
)
1236 assert((first
== 0) == (last
== 0));
1240 /* No entries in the valid range, block everything */
1241 r
= seccomp_rule_add_exact(
1243 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1247 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1253 /* Block everything below the first entry */
1254 r
= seccomp_rule_add_exact(
1256 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1259 SCMP_A0(SCMP_CMP_LT
, first
));
1261 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1265 /* Block everything above the last entry */
1266 r
= seccomp_rule_add_exact(
1268 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1271 SCMP_A0(SCMP_CMP_GT
, last
));
1273 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1277 /* Block everything between the first and last entry */
1278 for (af
= 1; af
< af_max(); af
++) {
1280 if (set_contains(address_families
, INT_TO_PTR(af
)))
1283 r
= seccomp_rule_add_exact(
1285 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1288 SCMP_A0(SCMP_CMP_EQ
, af
));
1293 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1301 /* If this is a blacklist, then generate one rule for
1302 * each address family that are then combined in OR
1305 SET_FOREACH(af
, address_families
, i
) {
1307 r
= seccomp_rule_add_exact(
1309 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1312 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1317 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1322 r
= seccomp_load(seccomp
);
1323 if (IN_SET(r
, -EPERM
, -EACCES
))
1326 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1332 int seccomp_restrict_realtime(void) {
1333 static const int permitted_policies
[] = {
1339 int r
, max_policy
= 0;
1343 /* Determine the highest policy constant we want to allow */
1344 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1345 if (permitted_policies
[i
] > max_policy
)
1346 max_policy
= permitted_policies
[i
];
1348 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1349 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1352 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1354 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1358 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1360 for (p
= 0; p
< max_policy
; p
++) {
1363 /* Check if this is in the whitelist. */
1364 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1365 if (permitted_policies
[i
] == p
) {
1373 /* Deny this policy */
1374 r
= seccomp_rule_add_exact(
1376 SCMP_ACT_ERRNO(EPERM
),
1377 SCMP_SYS(sched_setscheduler
),
1379 SCMP_A1(SCMP_CMP_EQ
, p
));
1381 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1386 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1387 * unsigned here, hence no need to check for < 0 values. */
1388 r
= seccomp_rule_add_exact(
1390 SCMP_ACT_ERRNO(EPERM
),
1391 SCMP_SYS(sched_setscheduler
),
1393 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1395 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1399 r
= seccomp_load(seccomp
);
1400 if (IN_SET(r
, -EPERM
, -EACCES
))
1403 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1409 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1412 unsigned int arg_cnt
,
1413 const struct scmp_arg_cmp arg
) {
1416 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1418 _cleanup_free_
char *n
= NULL
;
1420 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1421 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1423 seccomp_arch_to_string(arch
));
1429 /* For known architectures, check that syscalls are indeed defined or not. */
1430 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1431 assert_cc(SCMP_SYS(shmget
) > 0);
1432 assert_cc(SCMP_SYS(shmat
) > 0);
1433 assert_cc(SCMP_SYS(shmdt
) > 0);
1434 #elif defined(__i386__) || defined(__powerpc64__)
1435 assert_cc(SCMP_SYS(shmget
) < 0);
1436 assert_cc(SCMP_SYS(shmat
) < 0);
1437 assert_cc(SCMP_SYS(shmdt
) < 0);
1440 int seccomp_memory_deny_write_execute(void) {
1445 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1446 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1447 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0;
1449 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1454 filter_syscall
= SCMP_SYS(mmap2
);
1455 block_syscall
= SCMP_SYS(mmap
);
1459 case SCMP_ARCH_PPC64
:
1460 case SCMP_ARCH_PPC64LE
:
1461 filter_syscall
= SCMP_SYS(mmap
);
1463 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1464 * We ignore that here, which means there's still a way to get writable/executable
1465 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1470 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1471 shmat_syscall
= SCMP_SYS(shmat
);
1474 case SCMP_ARCH_X86_64
:
1476 case SCMP_ARCH_AARCH64
:
1477 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, and arm64 have only mmap */
1478 shmat_syscall
= SCMP_SYS(shmat
);
1481 /* Please add more definitions here, if you port systemd to other architectures! */
1483 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1484 #warning "Consider adding the right mmap() syscall definitions here!"
1488 /* Can't filter mmap() on this arch, then skip it */
1489 if (filter_syscall
== 0)
1492 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1496 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1498 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1502 if (block_syscall
!= 0) {
1503 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1508 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1510 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1514 #ifdef __NR_pkey_mprotect
1515 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1517 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1522 if (shmat_syscall
!= 0) {
1523 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(shmat
),
1525 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1530 r
= seccomp_load(seccomp
);
1531 if (IN_SET(r
, -EPERM
, -EACCES
))
1534 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1540 int seccomp_restrict_archs(Set
*archs
) {
1541 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1546 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1549 * There are some qualifications. However the most important use is to stop processes from bypassing
1550 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1551 * in a non-native architecture. There are no holes in this use case, at least so far. */
1553 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1554 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1555 * to run a program with the restrictions applied. */
1556 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1560 SET_FOREACH(id
, archs
, i
) {
1561 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1562 if (r
< 0 && r
!= -EEXIST
)
1566 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1567 * syscalls should basically match x86-64 for everything except the pointer type.
1568 * The important thing is that you can block the old 32-bit x86 syscalls.
1569 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1571 if (seccomp_arch_native() == SCMP_ARCH_X32
||
1572 set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1))) {
1574 r
= seccomp_arch_add(seccomp
, SCMP_ARCH_X86_64
);
1575 if (r
< 0 && r
!= -EEXIST
)
1579 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1583 r
= seccomp_load(seccomp
);
1584 if (IN_SET(r
, -EPERM
, -EACCES
))
1587 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1592 int parse_syscall_archs(char **l
, Set
**archs
) {
1593 _cleanup_set_free_ Set
*_archs
;
1600 r
= set_ensure_allocated(&_archs
, NULL
);
1604 STRV_FOREACH(s
, l
) {
1607 r
= seccomp_arch_from_string(*s
, &a
);
1611 r
= set_put(_archs
, UINT32_TO_PTR(a
+ 1));
1622 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1628 NULSTR_FOREACH(i
, set
->value
) {
1631 const SyscallFilterSet
*more
;
1633 more
= syscall_filter_set_find(i
);
1637 r
= seccomp_filter_set_add(filter
, add
, more
);
1643 id
= seccomp_syscall_resolve_name(i
);
1644 if (id
== __NR_SCMP_ERROR
) {
1645 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1650 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1654 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1661 int seccomp_lock_personality(unsigned long personality
) {
1665 if (personality
>= PERSONALITY_INVALID
)
1668 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1669 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1671 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1675 r
= seccomp_rule_add_exact(
1677 SCMP_ACT_ERRNO(EPERM
),
1678 SCMP_SYS(personality
),
1680 SCMP_A0(SCMP_CMP_NE
, personality
));
1682 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1686 r
= seccomp_load(seccomp
);
1687 if (IN_SET(r
, -EPERM
, -EACCES
))
1690 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));