1 /* SPDX-License-Identifier: LGPL-2.1+ */
3 This file is part of systemd.
5 Copyright 2014 Lennart Poettering
7 systemd is free software; you can redistribute it and/or modify it
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
10 (at your option) any later version.
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 Lesser General Public License for more details.
17 You should have received a copy of the GNU Lesser General Public License
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
22 #include <linux/seccomp.h>
26 #include <sys/prctl.h>
30 #include "alloc-util.h"
33 #include "process-util.h"
34 #include "seccomp-util.h"
36 #include "string-util.h"
39 #include "errno-list.h"
41 const uint32_t seccomp_local_archs
[] = {
43 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
45 #if defined(__x86_64__) && defined(__ILP32__)
48 SCMP_ARCH_X32
, /* native */
49 #elif defined(__x86_64__) && !defined(__ILP32__)
52 SCMP_ARCH_X86_64
, /* native */
53 #elif defined(__i386__)
55 #elif defined(__aarch64__)
57 SCMP_ARCH_AARCH64
, /* native */
58 #elif defined(__arm__)
60 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
62 SCMP_ARCH_MIPS
, /* native */
63 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
65 SCMP_ARCH_MIPSEL
, /* native */
66 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
69 SCMP_ARCH_MIPSEL64N32
,
72 SCMP_ARCH_MIPS64
, /* native */
73 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
77 SCMP_ARCH_MIPSEL64N32
,
79 SCMP_ARCH_MIPSEL64
, /* native */
80 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
85 SCMP_ARCH_MIPSEL64N32
,
86 SCMP_ARCH_MIPS64N32
, /* native */
87 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
93 SCMP_ARCH_MIPSEL64N32
, /* native */
94 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
97 SCMP_ARCH_PPC64
, /* native */
98 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
101 SCMP_ARCH_PPC64LE
, /* native */
102 #elif defined(__powerpc__)
104 #elif defined(__s390x__)
106 SCMP_ARCH_S390X
, /* native */
107 #elif defined(__s390__)
113 const char* seccomp_arch_to_string(uint32_t c
) {
114 /* Maintain order used in <seccomp.h>.
116 * Names used here should be the same as those used for ConditionArchitecture=,
117 * except for "subarchitectures" like x32. */
120 case SCMP_ARCH_NATIVE
:
124 case SCMP_ARCH_X86_64
:
130 case SCMP_ARCH_AARCH64
:
134 case SCMP_ARCH_MIPS64
:
136 case SCMP_ARCH_MIPS64N32
:
138 case SCMP_ARCH_MIPSEL
:
140 case SCMP_ARCH_MIPSEL64
:
142 case SCMP_ARCH_MIPSEL64N32
:
143 return "mips64-le-n32";
146 case SCMP_ARCH_PPC64
:
148 case SCMP_ARCH_PPC64LE
:
152 case SCMP_ARCH_S390X
:
159 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
165 if (streq(n
, "native"))
166 *ret
= SCMP_ARCH_NATIVE
;
167 else if (streq(n
, "x86"))
168 *ret
= SCMP_ARCH_X86
;
169 else if (streq(n
, "x86-64"))
170 *ret
= SCMP_ARCH_X86_64
;
171 else if (streq(n
, "x32"))
172 *ret
= SCMP_ARCH_X32
;
173 else if (streq(n
, "arm"))
174 *ret
= SCMP_ARCH_ARM
;
175 else if (streq(n
, "arm64"))
176 *ret
= SCMP_ARCH_AARCH64
;
177 else if (streq(n
, "mips"))
178 *ret
= SCMP_ARCH_MIPS
;
179 else if (streq(n
, "mips64"))
180 *ret
= SCMP_ARCH_MIPS64
;
181 else if (streq(n
, "mips64-n32"))
182 *ret
= SCMP_ARCH_MIPS64N32
;
183 else if (streq(n
, "mips-le"))
184 *ret
= SCMP_ARCH_MIPSEL
;
185 else if (streq(n
, "mips64-le"))
186 *ret
= SCMP_ARCH_MIPSEL64
;
187 else if (streq(n
, "mips64-le-n32"))
188 *ret
= SCMP_ARCH_MIPSEL64N32
;
189 else if (streq(n
, "ppc"))
190 *ret
= SCMP_ARCH_PPC
;
191 else if (streq(n
, "ppc64"))
192 *ret
= SCMP_ARCH_PPC64
;
193 else if (streq(n
, "ppc64-le"))
194 *ret
= SCMP_ARCH_PPC64LE
;
195 else if (streq(n
, "s390"))
196 *ret
= SCMP_ARCH_S390
;
197 else if (streq(n
, "s390x"))
198 *ret
= SCMP_ARCH_S390X
;
205 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
206 scmp_filter_ctx seccomp
;
209 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
210 * any others. Also, turns off the NNP fiddling. */
212 seccomp
= seccomp_init(default_action
);
216 if (arch
!= SCMP_ARCH_NATIVE
&&
217 arch
!= seccomp_arch_native()) {
219 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
223 r
= seccomp_arch_add(seccomp
, arch
);
227 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
228 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
229 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
231 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
232 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
235 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
239 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
247 seccomp_release(seccomp
);
251 static bool is_basic_seccomp_available(void) {
252 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
255 static bool is_seccomp_filter_available(void) {
256 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
260 bool is_seccomp_available(void) {
261 static int cached_enabled
= -1;
263 if (cached_enabled
< 0)
265 is_basic_seccomp_available() &&
266 is_seccomp_filter_available();
268 return cached_enabled
;
271 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
272 [SYSCALL_FILTER_SET_DEFAULT
] = {
274 .help
= "System calls that are always permitted",
301 "getrlimit\0" /* make sure processes can query stack size and such */
322 [SYSCALL_FILTER_SET_AIO
] = {
324 .help
= "Asynchronous IO",
332 [SYSCALL_FILTER_SET_BASIC_IO
] = {
353 [SYSCALL_FILTER_SET_CHOWN
] = {
355 .help
= "Change ownership of files and directories",
365 [SYSCALL_FILTER_SET_CLOCK
] = {
367 .help
= "Change the system time",
375 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
376 .name
= "@cpu-emulation",
377 .help
= "System calls for CPU emulation functionality",
385 [SYSCALL_FILTER_SET_DEBUG
] = {
387 .help
= "Debugging, performance monitoring and tracing functionality",
392 "process_vm_writev\0"
395 #ifdef __NR_s390_runtime_instr
396 "s390_runtime_instr\0"
398 "sys_debug_setcontext\0"
400 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
401 .name
= "@file-system",
402 .help
= "File system operations",
432 "inotify_add_watch\0"
483 [SYSCALL_FILTER_SET_IO_EVENT
] = {
485 .help
= "Event loop system calls",
502 [SYSCALL_FILTER_SET_IPC
] = {
504 .help
= "SysV IPC, POSIX Message Queues or other IPC",
521 "process_vm_writev\0"
531 [SYSCALL_FILTER_SET_KEYRING
] = {
533 .help
= "Kernel keyring access",
539 [SYSCALL_FILTER_SET_MEMLOCK
] = {
541 .help
= "Memory locking control",
549 [SYSCALL_FILTER_SET_MODULE
] = {
551 .help
= "Loading and unloading of kernel modules",
557 [SYSCALL_FILTER_SET_MOUNT
] = {
559 .help
= "Mounting and unmounting of file systems",
567 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
568 .name
= "@network-io",
569 .help
= "Network or Unix socket IO, should not be needed if not network facing",
593 [SYSCALL_FILTER_SET_OBSOLETE
] = {
594 /* some unknown even to libseccomp */
596 .help
= "Unusual, obsolete or unimplemented system calls",
625 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
626 .name
= "@privileged",
627 .help
= "All system calls which need super-user capabilities",
657 [SYSCALL_FILTER_SET_PROCESS
] = {
659 .help
= "Process control, execution, namespaceing operations",
662 "capget\0" /* Able to query arbitrary processes */
670 "rt_tgsigqueueinfo\0"
681 [SYSCALL_FILTER_SET_RAW_IO
] = {
683 .help
= "Raw I/O port access",
690 #ifdef __NR_s390_pci_mmio_read
691 "s390_pci_mmio_read\0"
693 #ifdef __NR_s390_pci_mmio_write
694 "s390_pci_mmio_write\0"
697 [SYSCALL_FILTER_SET_REBOOT
] = {
699 .help
= "Reboot and reboot preparation/kexec",
705 [SYSCALL_FILTER_SET_RESOURCES
] = {
706 .name
= "@resources",
707 .help
= "Alter resource settings",
714 "sched_setaffinity\0"
717 "sched_setscheduler\0"
722 [SYSCALL_FILTER_SET_SETUID
] = {
724 .help
= "Operations for changing user/group credentials",
741 [SYSCALL_FILTER_SET_SIGNAL
] = {
743 .help
= "Process signal handling",
759 [SYSCALL_FILTER_SET_SWAP
] = {
761 .help
= "Enable/disable swap devices",
766 [SYSCALL_FILTER_SET_SYNC
] = {
768 .help
= "Synchronize files and memory to storage",
777 [SYSCALL_FILTER_SET_TIMER
] = {
779 .help
= "Schedule operations by time",
796 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
799 if (isempty(name
) || name
[0] != '@')
802 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
803 if (streq(syscall_filter_sets
[i
].name
, name
))
804 return syscall_filter_sets
+ i
;
809 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp
, const SyscallFilterSet
*set
, uint32_t action
, char **exclude
);
811 int seccomp_add_syscall_filter_item(scmp_filter_ctx
*seccomp
, const char *name
, uint32_t action
, char **exclude
) {
817 if (strv_contains(exclude
, name
))
820 if (name
[0] == '@') {
821 const SyscallFilterSet
*other
;
823 other
= syscall_filter_set_find(name
);
825 log_debug("Filter set %s is not known!", name
);
829 r
= seccomp_add_syscall_filter_set(seccomp
, other
, action
, exclude
);
835 id
= seccomp_syscall_resolve_name(name
);
836 if (id
== __NR_SCMP_ERROR
) {
837 log_debug("System call %s is not known, ignoring.", name
);
841 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
843 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
844 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", name
, id
);
850 static int seccomp_add_syscall_filter_set(
851 scmp_filter_ctx seccomp
,
852 const SyscallFilterSet
*set
,
862 NULSTR_FOREACH(sys
, set
->value
) {
863 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
);
871 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
877 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
878 * each local arch. */
880 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
881 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
883 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
885 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
889 r
= seccomp_add_syscall_filter_set(seccomp
, set
, action
, NULL
);
891 log_debug_errno(r
, "Failed to add filter set, ignoring: %m");
895 r
= seccomp_load(seccomp
);
896 if (IN_SET(r
, -EPERM
, -EACCES
))
899 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
905 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* set
, uint32_t action
) {
909 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
910 * SyscallFilterSet* table. */
912 if (hashmap_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
915 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
916 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
920 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
922 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
926 HASHMAP_FOREACH_KEY(val
, id
, set
, i
) {
928 int e
= PTR_TO_INT(val
);
930 if (action
!= SCMP_ACT_ALLOW
&& e
>= 0)
931 a
= SCMP_ACT_ERRNO(e
);
933 r
= seccomp_rule_add_exact(seccomp
, a
, PTR_TO_INT(id
) - 1, 0);
935 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
936 _cleanup_free_
char *n
= NULL
;
938 n
= seccomp_syscall_resolve_num_arch(arch
, PTR_TO_INT(id
) - 1);
939 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n
), PTR_TO_INT(id
) - 1);
943 r
= seccomp_load(seccomp
);
944 if (IN_SET(r
, -EPERM
, -EACCES
))
947 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
953 int seccomp_restrict_namespaces(unsigned long retain
) {
957 if (log_get_max_level() >= LOG_DEBUG
) {
958 _cleanup_free_
char *s
= NULL
;
960 (void) namespace_flag_to_string_many(retain
, &s
);
961 log_debug("Restricting namespace to: %s.", strna(s
));
965 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
968 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
969 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
972 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
974 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
978 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
979 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
981 r
= seccomp_rule_add_exact(
983 SCMP_ACT_ERRNO(EPERM
),
987 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
988 * special invocation with a zero flags argument, right here. */
989 r
= seccomp_rule_add_exact(
991 SCMP_ACT_ERRNO(EPERM
),
994 SCMP_A1(SCMP_CMP_EQ
, 0));
996 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1000 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
1003 f
= namespace_flag_map
[i
].flag
;
1004 if ((retain
& f
) == f
) {
1005 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1009 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1011 r
= seccomp_rule_add_exact(
1013 SCMP_ACT_ERRNO(EPERM
),
1016 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1018 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1022 /* On s390/s390x the first two parameters to clone are switched */
1023 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1024 r
= seccomp_rule_add_exact(
1026 SCMP_ACT_ERRNO(EPERM
),
1029 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1031 r
= seccomp_rule_add_exact(
1033 SCMP_ACT_ERRNO(EPERM
),
1036 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1038 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1042 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1043 r
= seccomp_rule_add_exact(
1045 SCMP_ACT_ERRNO(EPERM
),
1048 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1050 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1058 r
= seccomp_load(seccomp
);
1059 if (IN_SET(r
, -EPERM
, -EACCES
))
1062 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1068 int seccomp_protect_sysctl(void) {
1072 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1073 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1075 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1077 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
1078 /* No _sysctl syscall */
1081 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1085 r
= seccomp_rule_add_exact(
1087 SCMP_ACT_ERRNO(EPERM
),
1091 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1095 r
= seccomp_load(seccomp
);
1096 if (IN_SET(r
, -EPERM
, -EACCES
))
1099 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1105 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
1109 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1110 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1114 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1118 case SCMP_ARCH_X86_64
:
1121 case SCMP_ARCH_AARCH64
:
1122 case SCMP_ARCH_PPC64
:
1123 case SCMP_ARCH_PPC64LE
:
1124 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1128 case SCMP_ARCH_S390
:
1129 case SCMP_ARCH_S390X
:
1133 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1142 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1147 int af
, first
= 0, last
= 0;
1150 /* If this is a whitelist, we first block the address families that are out of range and then
1151 * everything that is not in the set. First, we find the lowest and highest address family in
1154 SET_FOREACH(afp
, address_families
, i
) {
1155 af
= PTR_TO_INT(afp
);
1157 if (af
<= 0 || af
>= af_max())
1160 if (first
== 0 || af
< first
)
1163 if (last
== 0 || af
> last
)
1167 assert((first
== 0) == (last
== 0));
1171 /* No entries in the valid range, block everything */
1172 r
= seccomp_rule_add_exact(
1174 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1178 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1184 /* Block everything below the first entry */
1185 r
= seccomp_rule_add_exact(
1187 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1190 SCMP_A0(SCMP_CMP_LT
, first
));
1192 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1196 /* Block everything above the last entry */
1197 r
= seccomp_rule_add_exact(
1199 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1202 SCMP_A0(SCMP_CMP_GT
, last
));
1204 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1208 /* Block everything between the first and last entry */
1209 for (af
= 1; af
< af_max(); af
++) {
1211 if (set_contains(address_families
, INT_TO_PTR(af
)))
1214 r
= seccomp_rule_add_exact(
1216 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1219 SCMP_A0(SCMP_CMP_EQ
, af
));
1224 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1232 /* If this is a blacklist, then generate one rule for
1233 * each address family that are then combined in OR
1236 SET_FOREACH(af
, address_families
, i
) {
1238 r
= seccomp_rule_add_exact(
1240 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1243 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1248 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1253 r
= seccomp_load(seccomp
);
1254 if (IN_SET(r
, -EPERM
, -EACCES
))
1257 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1263 int seccomp_restrict_realtime(void) {
1264 static const int permitted_policies
[] = {
1270 int r
, max_policy
= 0;
1274 /* Determine the highest policy constant we want to allow */
1275 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1276 if (permitted_policies
[i
] > max_policy
)
1277 max_policy
= permitted_policies
[i
];
1279 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1280 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1283 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1285 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1289 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1291 for (p
= 0; p
< max_policy
; p
++) {
1294 /* Check if this is in the whitelist. */
1295 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1296 if (permitted_policies
[i
] == p
) {
1304 /* Deny this policy */
1305 r
= seccomp_rule_add_exact(
1307 SCMP_ACT_ERRNO(EPERM
),
1308 SCMP_SYS(sched_setscheduler
),
1310 SCMP_A1(SCMP_CMP_EQ
, p
));
1312 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1317 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1318 * unsigned here, hence no need to check for < 0 values. */
1319 r
= seccomp_rule_add_exact(
1321 SCMP_ACT_ERRNO(EPERM
),
1322 SCMP_SYS(sched_setscheduler
),
1324 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1326 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1330 r
= seccomp_load(seccomp
);
1331 if (IN_SET(r
, -EPERM
, -EACCES
))
1334 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1340 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1343 unsigned int arg_cnt
,
1344 const struct scmp_arg_cmp arg
) {
1347 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1349 _cleanup_free_
char *n
= NULL
;
1351 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1352 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1354 seccomp_arch_to_string(arch
));
1360 /* For known architectures, check that syscalls are indeed defined or not. */
1361 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1362 assert_cc(SCMP_SYS(shmget
) > 0);
1363 assert_cc(SCMP_SYS(shmat
) > 0);
1364 assert_cc(SCMP_SYS(shmdt
) > 0);
1365 #elif defined(__i386__) || defined(__powerpc64__)
1366 assert_cc(SCMP_SYS(shmget
) < 0);
1367 assert_cc(SCMP_SYS(shmat
) < 0);
1368 assert_cc(SCMP_SYS(shmdt
) < 0);
1371 int seccomp_memory_deny_write_execute(void) {
1376 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1377 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1378 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0;
1380 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1385 filter_syscall
= SCMP_SYS(mmap2
);
1386 block_syscall
= SCMP_SYS(mmap
);
1389 case SCMP_ARCH_PPC64
:
1390 case SCMP_ARCH_PPC64LE
:
1391 filter_syscall
= SCMP_SYS(mmap
);
1393 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1394 * We ignore that here, which means there's still a way to get writable/executable
1395 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1400 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1401 shmat_syscall
= SCMP_SYS(shmat
);
1404 case SCMP_ARCH_X86_64
:
1406 case SCMP_ARCH_AARCH64
:
1407 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, and arm64 have only mmap */
1408 shmat_syscall
= SCMP_SYS(shmat
);
1411 /* Please add more definitions here, if you port systemd to other architectures! */
1413 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1414 #warning "Consider adding the right mmap() syscall definitions here!"
1418 /* Can't filter mmap() on this arch, then skip it */
1419 if (filter_syscall
== 0)
1422 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1426 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1428 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1432 if (block_syscall
!= 0) {
1433 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1438 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1440 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1444 #ifdef __NR_pkey_mprotect
1445 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1447 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1452 if (shmat_syscall
!= 0) {
1453 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(shmat
),
1455 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1460 r
= seccomp_load(seccomp
);
1461 if (IN_SET(r
, -EPERM
, -EACCES
))
1464 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1470 int seccomp_restrict_archs(Set
*archs
) {
1471 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1476 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1479 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1483 SET_FOREACH(id
, archs
, i
) {
1484 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1491 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1495 r
= seccomp_load(seccomp
);
1496 if (IN_SET(r
, -EPERM
, -EACCES
))
1499 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1504 int parse_syscall_archs(char **l
, Set
**archs
) {
1505 _cleanup_set_free_ Set
*_archs
;
1512 r
= set_ensure_allocated(&_archs
, NULL
);
1516 STRV_FOREACH(s
, l
) {
1519 r
= seccomp_arch_from_string(*s
, &a
);
1523 r
= set_put(_archs
, UINT32_TO_PTR(a
+ 1));
1534 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1540 NULSTR_FOREACH(i
, set
->value
) {
1543 const SyscallFilterSet
*more
;
1545 more
= syscall_filter_set_find(i
);
1549 r
= seccomp_filter_set_add(filter
, add
, more
);
1555 id
= seccomp_syscall_resolve_name(i
);
1556 if (id
== __NR_SCMP_ERROR
) {
1557 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1562 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1566 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1573 int seccomp_lock_personality(unsigned long personality
) {
1577 if (personality
>= PERSONALITY_INVALID
)
1580 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1581 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1583 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1587 r
= seccomp_rule_add_exact(
1589 SCMP_ACT_ERRNO(EPERM
),
1590 SCMP_SYS(personality
),
1592 SCMP_A0(SCMP_CMP_NE
, personality
));
1594 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1598 r
= seccomp_load(seccomp
);
1599 if (IN_SET(r
, -EPERM
, -EACCES
))
1602 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));