2 This file is part of systemd.
4 Copyright 2014 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <linux/seccomp.h>
25 #include <sys/prctl.h>
29 #include "alloc-util.h"
32 #include "seccomp-util.h"
34 #include "string-util.h"
37 #include "errno-list.h"
39 const uint32_t seccomp_local_archs
[] = {
41 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43 #if defined(__x86_64__) && defined(__ILP32__)
46 SCMP_ARCH_X32
, /* native */
47 #elif defined(__x86_64__) && !defined(__ILP32__)
50 SCMP_ARCH_X86_64
, /* native */
51 #elif defined(__i386__)
53 #elif defined(__aarch64__)
55 SCMP_ARCH_AARCH64
, /* native */
56 #elif defined(__arm__)
58 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPS
, /* native */
61 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
63 SCMP_ARCH_MIPSEL
, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
67 SCMP_ARCH_MIPSEL64N32
,
70 SCMP_ARCH_MIPS64
, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
75 SCMP_ARCH_MIPSEL64N32
,
77 SCMP_ARCH_MIPSEL64
, /* native */
78 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
83 SCMP_ARCH_MIPSEL64N32
,
84 SCMP_ARCH_MIPS64N32
, /* native */
85 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
91 SCMP_ARCH_MIPSEL64N32
, /* native */
92 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
95 SCMP_ARCH_PPC64
, /* native */
96 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
99 SCMP_ARCH_PPC64LE
, /* native */
100 #elif defined(__powerpc__)
102 #elif defined(__s390x__)
104 SCMP_ARCH_S390X
, /* native */
105 #elif defined(__s390__)
111 const char* seccomp_arch_to_string(uint32_t c
) {
112 /* Maintain order used in <seccomp.h>.
114 * Names used here should be the same as those used for ConditionArchitecture=,
115 * except for "subarchitectures" like x32. */
118 case SCMP_ARCH_NATIVE
:
122 case SCMP_ARCH_X86_64
:
128 case SCMP_ARCH_AARCH64
:
132 case SCMP_ARCH_MIPS64
:
134 case SCMP_ARCH_MIPS64N32
:
136 case SCMP_ARCH_MIPSEL
:
138 case SCMP_ARCH_MIPSEL64
:
140 case SCMP_ARCH_MIPSEL64N32
:
141 return "mips64-le-n32";
144 case SCMP_ARCH_PPC64
:
146 case SCMP_ARCH_PPC64LE
:
150 case SCMP_ARCH_S390X
:
157 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
163 if (streq(n
, "native"))
164 *ret
= SCMP_ARCH_NATIVE
;
165 else if (streq(n
, "x86"))
166 *ret
= SCMP_ARCH_X86
;
167 else if (streq(n
, "x86-64"))
168 *ret
= SCMP_ARCH_X86_64
;
169 else if (streq(n
, "x32"))
170 *ret
= SCMP_ARCH_X32
;
171 else if (streq(n
, "arm"))
172 *ret
= SCMP_ARCH_ARM
;
173 else if (streq(n
, "arm64"))
174 *ret
= SCMP_ARCH_AARCH64
;
175 else if (streq(n
, "mips"))
176 *ret
= SCMP_ARCH_MIPS
;
177 else if (streq(n
, "mips64"))
178 *ret
= SCMP_ARCH_MIPS64
;
179 else if (streq(n
, "mips64-n32"))
180 *ret
= SCMP_ARCH_MIPS64N32
;
181 else if (streq(n
, "mips-le"))
182 *ret
= SCMP_ARCH_MIPSEL
;
183 else if (streq(n
, "mips64-le"))
184 *ret
= SCMP_ARCH_MIPSEL64
;
185 else if (streq(n
, "mips64-le-n32"))
186 *ret
= SCMP_ARCH_MIPSEL64N32
;
187 else if (streq(n
, "ppc"))
188 *ret
= SCMP_ARCH_PPC
;
189 else if (streq(n
, "ppc64"))
190 *ret
= SCMP_ARCH_PPC64
;
191 else if (streq(n
, "ppc64-le"))
192 *ret
= SCMP_ARCH_PPC64LE
;
193 else if (streq(n
, "s390"))
194 *ret
= SCMP_ARCH_S390
;
195 else if (streq(n
, "s390x"))
196 *ret
= SCMP_ARCH_S390X
;
203 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
204 scmp_filter_ctx seccomp
;
207 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
208 * any others. Also, turns off the NNP fiddling. */
210 seccomp
= seccomp_init(default_action
);
214 if (arch
!= SCMP_ARCH_NATIVE
&&
215 arch
!= seccomp_arch_native()) {
217 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
221 r
= seccomp_arch_add(seccomp
, arch
);
225 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
226 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
227 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
229 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
230 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
233 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
237 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
245 seccomp_release(seccomp
);
249 static bool is_basic_seccomp_available(void) {
250 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
253 static bool is_seccomp_filter_available(void) {
254 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
258 bool is_seccomp_available(void) {
259 static int cached_enabled
= -1;
261 if (cached_enabled
< 0)
263 is_basic_seccomp_available() &&
264 is_seccomp_filter_available();
266 return cached_enabled
;
269 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
270 [SYSCALL_FILTER_SET_DEFAULT
] = {
272 .help
= "System calls that are always permitted",
280 "getrlimit\0" /* make sure processes can query stack size and such */
288 [SYSCALL_FILTER_SET_BASIC_IO
] = {
306 [SYSCALL_FILTER_SET_CLOCK
] = {
308 .help
= "Change the system time",
316 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
317 .name
= "@cpu-emulation",
318 .help
= "System calls for CPU emulation functionality",
326 [SYSCALL_FILTER_SET_DEBUG
] = {
328 .help
= "Debugging, performance monitoring and tracing functionality",
333 "process_vm_writev\0"
336 #ifdef __NR_s390_runtime_instr
337 "s390_runtime_instr\0"
339 "sys_debug_setcontext\0"
341 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
342 .name
= "@file-system",
343 .help
= "File system operations",
372 "inotify_add_watch\0"
414 [SYSCALL_FILTER_SET_IO_EVENT
] = {
416 .help
= "Event loop system calls",
433 [SYSCALL_FILTER_SET_IPC
] = {
435 .help
= "SysV IPC, POSIX Message Queues or other IPC",
452 "process_vm_writev\0"
462 [SYSCALL_FILTER_SET_KEYRING
] = {
464 .help
= "Kernel keyring access",
470 [SYSCALL_FILTER_SET_MODULE
] = {
472 .help
= "Loading and unloading of kernel modules",
478 [SYSCALL_FILTER_SET_MOUNT
] = {
480 .help
= "Mounting and unmounting of file systems",
488 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
489 .name
= "@network-io",
490 .help
= "Network or Unix socket IO, should not be needed if not network facing",
514 [SYSCALL_FILTER_SET_OBSOLETE
] = {
515 /* some unknown even to libseccomp */
517 .help
= "Unusual, obsolete or unimplemented system calls",
545 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
546 .name
= "@privileged",
547 .help
= "All system calls which need super-user capabilities",
586 [SYSCALL_FILTER_SET_PROCESS
] = {
588 .help
= "Process control, execution, namespaceing operations",
602 [SYSCALL_FILTER_SET_RAW_IO
] = {
604 .help
= "Raw I/O port access",
611 #ifdef __NR_s390_pci_mmio_read
612 "s390_pci_mmio_read\0"
614 #ifdef __NR_s390_pci_mmio_write
615 "s390_pci_mmio_write\0"
618 [SYSCALL_FILTER_SET_REBOOT
] = {
620 .help
= "Reboot and reboot preparation/kexec",
626 [SYSCALL_FILTER_SET_RESOURCES
] = {
627 .name
= "@resources",
628 .help
= "Alter resource settings",
631 "sched_setscheduler\0"
632 "sched_setaffinity\0"
642 [SYSCALL_FILTER_SET_SETUID
] = {
644 .help
= "Operations for changing user/group credentials",
661 [SYSCALL_FILTER_SET_SWAP
] = {
663 .help
= "Enable/disable swap devices",
670 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
673 if (isempty(name
) || name
[0] != '@')
676 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
677 if (streq(syscall_filter_sets
[i
].name
, name
))
678 return syscall_filter_sets
+ i
;
683 static int seccomp_add_syscall_filter_set(
684 scmp_filter_ctx seccomp
,
685 uint32_t default_action
,
686 const SyscallFilterSet
*set
,
695 NULSTR_FOREACH(sys
, set
->value
) {
699 const SyscallFilterSet
*other
;
701 other
= syscall_filter_set_find(sys
);
705 r
= seccomp_add_syscall_filter_set(seccomp
, default_action
, other
, action
);
709 id
= seccomp_syscall_resolve_name(sys
);
710 if (id
== __NR_SCMP_ERROR
)
711 return -EINVAL
; /* Not known at all? Then that's a real error */
713 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
715 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
716 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", sys
, id
);
723 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
729 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
730 * each local arch. */
732 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
733 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
735 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
737 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
741 r
= seccomp_add_syscall_filter_set(seccomp
, default_action
, set
, action
);
743 log_debug_errno(r
, "Failed to add filter set, ignoring: %m");
747 r
= seccomp_load(seccomp
);
748 if (IN_SET(r
, -EPERM
, -EACCES
))
751 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
757 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Set
* set
, uint32_t action
) {
761 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
762 * SyscallFilterSet* table. */
764 if (set_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
767 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
768 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
772 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
774 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
778 SET_FOREACH(id
, set
, i
) {
779 r
= seccomp_rule_add_exact(seccomp
, action
, PTR_TO_INT(id
) - 1, 0);
781 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
782 _cleanup_free_
char *n
= NULL
;
784 n
= seccomp_syscall_resolve_num_arch(arch
, PTR_TO_INT(id
) - 1);
785 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n
), PTR_TO_INT(id
) - 1);
789 r
= seccomp_load(seccomp
);
790 if (IN_SET(r
, -EPERM
, -EACCES
))
793 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
799 int seccomp_restrict_namespaces(unsigned long retain
) {
803 if (log_get_max_level() >= LOG_DEBUG
) {
804 _cleanup_free_
char *s
= NULL
;
806 (void) namespace_flag_to_string_many(retain
, &s
);
807 log_debug("Restricting namespace to: %s.", strna(s
));
811 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
814 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
815 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
818 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
820 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
824 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
825 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
827 r
= seccomp_rule_add_exact(
829 SCMP_ACT_ERRNO(EPERM
),
833 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
834 * special invocation with a zero flags argument, right here. */
835 r
= seccomp_rule_add_exact(
837 SCMP_ACT_ERRNO(EPERM
),
840 SCMP_A1(SCMP_CMP_EQ
, 0));
842 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
846 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
849 f
= namespace_flag_map
[i
].flag
;
850 if ((retain
& f
) == f
) {
851 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
855 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
857 r
= seccomp_rule_add_exact(
859 SCMP_ACT_ERRNO(EPERM
),
862 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
864 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
868 /* On s390/s390x the first two parameters to clone are switched */
869 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
870 r
= seccomp_rule_add_exact(
872 SCMP_ACT_ERRNO(EPERM
),
875 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
877 r
= seccomp_rule_add_exact(
879 SCMP_ACT_ERRNO(EPERM
),
882 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
884 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
888 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
889 r
= seccomp_rule_add_exact(
891 SCMP_ACT_ERRNO(EPERM
),
894 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
896 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
904 r
= seccomp_load(seccomp
);
905 if (IN_SET(r
, -EPERM
, -EACCES
))
908 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
914 int seccomp_protect_sysctl(void) {
918 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
919 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
921 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
923 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
924 /* No _sysctl syscall */
927 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
931 r
= seccomp_rule_add_exact(
933 SCMP_ACT_ERRNO(EPERM
),
937 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
941 r
= seccomp_load(seccomp
);
942 if (IN_SET(r
, -EPERM
, -EACCES
))
945 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
951 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
955 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
956 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
960 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
964 case SCMP_ARCH_X86_64
:
967 case SCMP_ARCH_AARCH64
:
968 case SCMP_ARCH_PPC64
:
969 case SCMP_ARCH_PPC64LE
:
970 /* These we know we support (i.e. are the ones that do not use socketcall()) */
975 case SCMP_ARCH_S390X
:
979 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
988 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
993 int af
, first
= 0, last
= 0;
996 /* If this is a whitelist, we first block the address families that are out of range and then
997 * everything that is not in the set. First, we find the lowest and highest address family in
1000 SET_FOREACH(afp
, address_families
, i
) {
1001 af
= PTR_TO_INT(afp
);
1003 if (af
<= 0 || af
>= af_max())
1006 if (first
== 0 || af
< first
)
1009 if (last
== 0 || af
> last
)
1013 assert((first
== 0) == (last
== 0));
1017 /* No entries in the valid range, block everything */
1018 r
= seccomp_rule_add_exact(
1020 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1024 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1030 /* Block everything below the first entry */
1031 r
= seccomp_rule_add_exact(
1033 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1036 SCMP_A0(SCMP_CMP_LT
, first
));
1038 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1042 /* Block everything above the last entry */
1043 r
= seccomp_rule_add_exact(
1045 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1048 SCMP_A0(SCMP_CMP_GT
, last
));
1050 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1054 /* Block everything between the first and last entry */
1055 for (af
= 1; af
< af_max(); af
++) {
1057 if (set_contains(address_families
, INT_TO_PTR(af
)))
1060 r
= seccomp_rule_add_exact(
1062 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1065 SCMP_A0(SCMP_CMP_EQ
, af
));
1071 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1079 /* If this is a blacklist, then generate one rule for
1080 * each address family that are then combined in OR
1083 SET_FOREACH(af
, address_families
, i
) {
1085 r
= seccomp_rule_add_exact(
1087 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1090 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1096 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1101 r
= seccomp_load(seccomp
);
1102 if (IN_SET(r
, -EPERM
, -EACCES
))
1105 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1111 int seccomp_restrict_realtime(void) {
1112 static const int permitted_policies
[] = {
1118 int r
, max_policy
= 0;
1122 /* Determine the highest policy constant we want to allow */
1123 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1124 if (permitted_policies
[i
] > max_policy
)
1125 max_policy
= permitted_policies
[i
];
1127 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1128 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1131 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1133 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1137 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1139 for (p
= 0; p
< max_policy
; p
++) {
1142 /* Check if this is in the whitelist. */
1143 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1144 if (permitted_policies
[i
] == p
) {
1152 /* Deny this policy */
1153 r
= seccomp_rule_add_exact(
1155 SCMP_ACT_ERRNO(EPERM
),
1156 SCMP_SYS(sched_setscheduler
),
1158 SCMP_A1(SCMP_CMP_EQ
, p
));
1160 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1165 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1166 * unsigned here, hence no need to check for < 0 values. */
1167 r
= seccomp_rule_add_exact(
1169 SCMP_ACT_ERRNO(EPERM
),
1170 SCMP_SYS(sched_setscheduler
),
1172 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1174 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1178 r
= seccomp_load(seccomp
);
1179 if (IN_SET(r
, -EPERM
, -EACCES
))
1182 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1188 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1191 unsigned int arg_cnt
,
1192 const struct scmp_arg_cmp arg
) {
1195 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1197 _cleanup_free_
char *n
= NULL
;
1199 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1200 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1202 seccomp_arch_to_string(arch
));
1208 /* For known architectures, check that syscalls are indeed defined or not. */
1209 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1210 assert_cc(SCMP_SYS(shmget
) > 0);
1211 assert_cc(SCMP_SYS(shmat
) > 0);
1212 assert_cc(SCMP_SYS(shmdt
) > 0);
1213 #elif defined(__i386__) || defined(__powerpc64__)
1214 assert_cc(SCMP_SYS(shmget
) < 0);
1215 assert_cc(SCMP_SYS(shmat
) < 0);
1216 assert_cc(SCMP_SYS(shmdt
) < 0);
1219 int seccomp_memory_deny_write_execute(void) {
1224 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1225 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1226 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0;
1228 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1233 filter_syscall
= SCMP_SYS(mmap2
);
1234 block_syscall
= SCMP_SYS(mmap
);
1237 case SCMP_ARCH_PPC64
:
1238 case SCMP_ARCH_PPC64LE
:
1239 filter_syscall
= SCMP_SYS(mmap
);
1241 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1242 * We ignore that here, which means there's still a way to get writable/executable
1243 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1248 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1249 shmat_syscall
= SCMP_SYS(shmat
);
1252 case SCMP_ARCH_X86_64
:
1254 case SCMP_ARCH_AARCH64
:
1255 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, and arm64 have only mmap */
1256 shmat_syscall
= SCMP_SYS(shmat
);
1259 /* Please add more definitions here, if you port systemd to other architectures! */
1261 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1262 #warning "Consider adding the right mmap() syscall definitions here!"
1266 /* Can't filter mmap() on this arch, then skip it */
1267 if (filter_syscall
== 0)
1270 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1274 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1276 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1280 if (block_syscall
!= 0) {
1281 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1286 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1288 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1292 if (shmat_syscall
!= 0) {
1293 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(shmat
),
1295 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1300 r
= seccomp_load(seccomp
);
1301 if (IN_SET(r
, -EPERM
, -EACCES
))
1304 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1310 int seccomp_restrict_archs(Set
*archs
) {
1311 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1316 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1319 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1323 SET_FOREACH(id
, archs
, i
) {
1324 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1331 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1335 return seccomp_load(seccomp
);
1338 int parse_syscall_archs(char **l
, Set
**archs
) {
1339 _cleanup_set_free_ Set
*_archs
;
1346 r
= set_ensure_allocated(&_archs
, NULL
);
1350 STRV_FOREACH(s
, l
) {
1353 r
= seccomp_arch_from_string(*s
, &a
);
1357 r
= set_put(_archs
, UINT32_TO_PTR(a
+ 1));
1368 int seccomp_filter_set_add(Set
*filter
, bool add
, const SyscallFilterSet
*set
) {
1374 NULSTR_FOREACH(i
, set
->value
) {
1377 const SyscallFilterSet
*more
;
1379 more
= syscall_filter_set_find(i
);
1384 r
= seccomp_filter_set_add(filter
, add
, more
);
1390 id
= seccomp_syscall_resolve_name(i
);
1391 if (id
== __NR_SCMP_ERROR
)
1395 r
= set_put(filter
, INT_TO_PTR(id
+ 1));
1399 (void) set_remove(filter
, INT_TO_PTR(id
+ 1));