2 This file is part of systemd.
4 Copyright 2014 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <linux/seccomp.h>
25 #include <sys/prctl.h>
29 #include "alloc-util.h"
32 #include "seccomp-util.h"
33 #include "string-util.h"
35 #include "errno-list.h"
37 const uint32_t seccomp_local_archs
[] = {
39 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
41 #if defined(__x86_64__) && defined(__ILP32__)
44 SCMP_ARCH_X32
, /* native */
45 #elif defined(__x86_64__) && !defined(__ILP32__)
48 SCMP_ARCH_X86_64
, /* native */
49 #elif defined(__i386__)
51 #elif defined(__aarch64__)
53 SCMP_ARCH_AARCH64
, /* native */
54 #elif defined(__arm__)
56 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
58 SCMP_ARCH_MIPS
, /* native */
59 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
61 SCMP_ARCH_MIPSEL
, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
65 SCMP_ARCH_MIPSEL64N32
,
68 SCMP_ARCH_MIPS64
, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPSEL64N32
,
75 SCMP_ARCH_MIPSEL64
, /* native */
76 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
81 SCMP_ARCH_MIPSEL64N32
,
82 SCMP_ARCH_MIPS64N32
, /* native */
83 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
89 SCMP_ARCH_MIPSEL64N32
, /* native */
90 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
93 SCMP_ARCH_PPC64
, /* native */
94 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
97 SCMP_ARCH_PPC64LE
, /* native */
98 #elif defined(__powerpc__)
100 #elif defined(__s390x__)
102 SCMP_ARCH_S390X
, /* native */
103 #elif defined(__s390__)
109 const char* seccomp_arch_to_string(uint32_t c
) {
110 /* Maintain order used in <seccomp.h>.
112 * Names used here should be the same as those used for ConditionArchitecture=,
113 * except for "subarchitectures" like x32. */
116 case SCMP_ARCH_NATIVE
:
120 case SCMP_ARCH_X86_64
:
126 case SCMP_ARCH_AARCH64
:
130 case SCMP_ARCH_MIPS64
:
132 case SCMP_ARCH_MIPS64N32
:
134 case SCMP_ARCH_MIPSEL
:
136 case SCMP_ARCH_MIPSEL64
:
138 case SCMP_ARCH_MIPSEL64N32
:
139 return "mips64-le-n32";
142 case SCMP_ARCH_PPC64
:
144 case SCMP_ARCH_PPC64LE
:
148 case SCMP_ARCH_S390X
:
155 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
161 if (streq(n
, "native"))
162 *ret
= SCMP_ARCH_NATIVE
;
163 else if (streq(n
, "x86"))
164 *ret
= SCMP_ARCH_X86
;
165 else if (streq(n
, "x86-64"))
166 *ret
= SCMP_ARCH_X86_64
;
167 else if (streq(n
, "x32"))
168 *ret
= SCMP_ARCH_X32
;
169 else if (streq(n
, "arm"))
170 *ret
= SCMP_ARCH_ARM
;
171 else if (streq(n
, "arm64"))
172 *ret
= SCMP_ARCH_AARCH64
;
173 else if (streq(n
, "mips"))
174 *ret
= SCMP_ARCH_MIPS
;
175 else if (streq(n
, "mips64"))
176 *ret
= SCMP_ARCH_MIPS64
;
177 else if (streq(n
, "mips64-n32"))
178 *ret
= SCMP_ARCH_MIPS64N32
;
179 else if (streq(n
, "mips-le"))
180 *ret
= SCMP_ARCH_MIPSEL
;
181 else if (streq(n
, "mips64-le"))
182 *ret
= SCMP_ARCH_MIPSEL64
;
183 else if (streq(n
, "mips64-le-n32"))
184 *ret
= SCMP_ARCH_MIPSEL64N32
;
185 else if (streq(n
, "ppc"))
186 *ret
= SCMP_ARCH_PPC
;
187 else if (streq(n
, "ppc64"))
188 *ret
= SCMP_ARCH_PPC64
;
189 else if (streq(n
, "ppc64-le"))
190 *ret
= SCMP_ARCH_PPC64LE
;
191 else if (streq(n
, "s390"))
192 *ret
= SCMP_ARCH_S390
;
193 else if (streq(n
, "s390x"))
194 *ret
= SCMP_ARCH_S390X
;
201 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
202 scmp_filter_ctx seccomp
;
205 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
206 * any others. Also, turns off the NNP fiddling. */
208 seccomp
= seccomp_init(default_action
);
212 if (arch
!= SCMP_ARCH_NATIVE
&&
213 arch
!= seccomp_arch_native()) {
215 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
219 r
= seccomp_arch_add(seccomp
, arch
);
223 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
224 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
225 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
227 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
228 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
231 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
235 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
243 seccomp_release(seccomp
);
247 static bool is_basic_seccomp_available(void) {
248 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
251 static bool is_seccomp_filter_available(void) {
252 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
256 bool is_seccomp_available(void) {
257 static int cached_enabled
= -1;
259 if (cached_enabled
< 0)
261 is_basic_seccomp_available() &&
262 is_seccomp_filter_available();
264 return cached_enabled
;
267 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
268 [SYSCALL_FILTER_SET_DEFAULT
] = {
270 .help
= "System calls that are always permitted",
278 "getrlimit\0" /* make sure processes can query stack size and such */
286 [SYSCALL_FILTER_SET_BASIC_IO
] = {
304 [SYSCALL_FILTER_SET_CLOCK
] = {
306 .help
= "Change the system time",
314 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
315 .name
= "@cpu-emulation",
316 .help
= "System calls for CPU emulation functionality",
324 [SYSCALL_FILTER_SET_DEBUG
] = {
326 .help
= "Debugging, performance monitoring and tracing functionality",
331 "process_vm_writev\0"
334 #ifdef __NR_s390_runtime_instr
335 "s390_runtime_instr\0"
337 "sys_debug_setcontext\0"
339 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
340 .name
= "@file-system",
341 .help
= "File system operations",
370 "inotify_add_watch\0"
412 [SYSCALL_FILTER_SET_IO_EVENT
] = {
414 .help
= "Event loop system calls",
431 [SYSCALL_FILTER_SET_IPC
] = {
433 .help
= "SysV IPC, POSIX Message Queues or other IPC",
450 "process_vm_writev\0"
460 [SYSCALL_FILTER_SET_KEYRING
] = {
462 .help
= "Kernel keyring access",
468 [SYSCALL_FILTER_SET_MODULE
] = {
470 .help
= "Loading and unloading of kernel modules",
476 [SYSCALL_FILTER_SET_MOUNT
] = {
478 .help
= "Mounting and unmounting of file systems",
486 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
487 .name
= "@network-io",
488 .help
= "Network or Unix socket IO, should not be needed if not network facing",
512 [SYSCALL_FILTER_SET_OBSOLETE
] = {
513 /* some unknown even to libseccomp */
515 .help
= "Unusual, obsolete or unimplemented system calls",
543 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
544 .name
= "@privileged",
545 .help
= "All system calls which need super-user capabilities",
584 [SYSCALL_FILTER_SET_PROCESS
] = {
586 .help
= "Process control, execution, namespaceing operations",
600 [SYSCALL_FILTER_SET_RAW_IO
] = {
602 .help
= "Raw I/O port access",
609 #ifdef __NR_s390_pci_mmio_read
610 "s390_pci_mmio_read\0"
612 #ifdef __NR_s390_pci_mmio_write
613 "s390_pci_mmio_write\0"
616 [SYSCALL_FILTER_SET_REBOOT
] = {
618 .help
= "Reboot and reboot preparation/kexec",
624 [SYSCALL_FILTER_SET_RESOURCES
] = {
625 .name
= "@resources",
626 .help
= "Alter resource settings",
629 "sched_setscheduler\0"
630 "sched_setaffinity\0"
640 [SYSCALL_FILTER_SET_SWAP
] = {
642 .help
= "Enable/disable swap devices",
649 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
652 if (isempty(name
) || name
[0] != '@')
655 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
656 if (streq(syscall_filter_sets
[i
].name
, name
))
657 return syscall_filter_sets
+ i
;
662 static int seccomp_add_syscall_filter_set(
663 scmp_filter_ctx seccomp
,
664 uint32_t default_action
,
665 const SyscallFilterSet
*set
,
674 NULSTR_FOREACH(sys
, set
->value
) {
678 const SyscallFilterSet
*other
;
680 other
= syscall_filter_set_find(sys
);
684 r
= seccomp_add_syscall_filter_set(seccomp
, default_action
, other
, action
);
688 id
= seccomp_syscall_resolve_name(sys
);
689 if (id
== __NR_SCMP_ERROR
)
690 return -EINVAL
; /* Not known at all? Then that's a real error */
692 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
694 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
695 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", sys
, id
);
702 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
708 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
709 * each local arch. */
711 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
712 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
714 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
716 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
720 r
= seccomp_add_syscall_filter_set(seccomp
, default_action
, set
, action
);
722 log_debug_errno(r
, "Failed to add filter set, ignoring: %m");
726 r
= seccomp_load(seccomp
);
727 if (IN_SET(r
, -EPERM
, -EACCES
))
730 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
736 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Set
* set
, uint32_t action
) {
740 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
741 * SyscallFilterSet* table. */
743 if (set_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
746 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
747 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
751 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
753 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
757 SET_FOREACH(id
, set
, i
) {
758 r
= seccomp_rule_add_exact(seccomp
, action
, PTR_TO_INT(id
) - 1, 0);
760 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
761 _cleanup_free_
char *n
= NULL
;
763 n
= seccomp_syscall_resolve_num_arch(arch
, PTR_TO_INT(id
) - 1);
764 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n
), PTR_TO_INT(id
) - 1);
768 r
= seccomp_load(seccomp
);
769 if (IN_SET(r
, -EPERM
, -EACCES
))
772 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
778 int seccomp_restrict_namespaces(unsigned long retain
) {
782 if (log_get_max_level() >= LOG_DEBUG
) {
783 _cleanup_free_
char *s
= NULL
;
785 (void) namespace_flag_to_string_many(retain
, &s
);
786 log_debug("Restricting namespace to: %s.", strna(s
));
790 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
793 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
794 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
797 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
799 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
803 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
804 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
806 r
= seccomp_rule_add_exact(
808 SCMP_ACT_ERRNO(EPERM
),
812 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
813 * special invocation with a zero flags argument, right here. */
814 r
= seccomp_rule_add_exact(
816 SCMP_ACT_ERRNO(EPERM
),
819 SCMP_A1(SCMP_CMP_EQ
, 0));
821 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
825 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
828 f
= namespace_flag_map
[i
].flag
;
829 if ((retain
& f
) == f
) {
830 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
834 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
836 r
= seccomp_rule_add_exact(
838 SCMP_ACT_ERRNO(EPERM
),
841 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
843 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
847 /* On s390/s390x the first two parameters to clone are switched */
848 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
849 r
= seccomp_rule_add_exact(
851 SCMP_ACT_ERRNO(EPERM
),
854 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
856 r
= seccomp_rule_add_exact(
858 SCMP_ACT_ERRNO(EPERM
),
861 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
863 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
867 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
868 r
= seccomp_rule_add_exact(
870 SCMP_ACT_ERRNO(EPERM
),
873 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
875 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
883 r
= seccomp_load(seccomp
);
884 if (IN_SET(r
, -EPERM
, -EACCES
))
887 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
893 int seccomp_protect_sysctl(void) {
897 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
898 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
900 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
902 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
903 /* No _sysctl syscall */
906 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
910 r
= seccomp_rule_add_exact(
912 SCMP_ACT_ERRNO(EPERM
),
916 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
920 r
= seccomp_load(seccomp
);
921 if (IN_SET(r
, -EPERM
, -EACCES
))
924 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
930 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
934 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
935 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
939 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
943 case SCMP_ARCH_X86_64
:
946 case SCMP_ARCH_AARCH64
:
947 case SCMP_ARCH_PPC64
:
948 case SCMP_ARCH_PPC64LE
:
949 /* These we know we support (i.e. are the ones that do not use socketcall()) */
954 case SCMP_ARCH_S390X
:
958 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
967 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
972 int af
, first
= 0, last
= 0;
975 /* If this is a whitelist, we first block the address families that are out of range and then
976 * everything that is not in the set. First, we find the lowest and highest address family in
979 SET_FOREACH(afp
, address_families
, i
) {
980 af
= PTR_TO_INT(afp
);
982 if (af
<= 0 || af
>= af_max())
985 if (first
== 0 || af
< first
)
988 if (last
== 0 || af
> last
)
992 assert((first
== 0) == (last
== 0));
996 /* No entries in the valid range, block everything */
997 r
= seccomp_rule_add_exact(
999 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1003 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1009 /* Block everything below the first entry */
1010 r
= seccomp_rule_add_exact(
1012 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1015 SCMP_A0(SCMP_CMP_LT
, first
));
1017 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1021 /* Block everything above the last entry */
1022 r
= seccomp_rule_add_exact(
1024 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1027 SCMP_A0(SCMP_CMP_GT
, last
));
1029 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1033 /* Block everything between the first and last entry */
1034 for (af
= 1; af
< af_max(); af
++) {
1036 if (set_contains(address_families
, INT_TO_PTR(af
)))
1039 r
= seccomp_rule_add_exact(
1041 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1044 SCMP_A0(SCMP_CMP_EQ
, af
));
1050 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1058 /* If this is a blacklist, then generate one rule for
1059 * each address family that are then combined in OR
1062 SET_FOREACH(af
, address_families
, i
) {
1064 r
= seccomp_rule_add_exact(
1066 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1069 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1075 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1080 r
= seccomp_load(seccomp
);
1081 if (IN_SET(r
, -EPERM
, -EACCES
))
1084 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1090 int seccomp_restrict_realtime(void) {
1091 static const int permitted_policies
[] = {
1097 int r
, max_policy
= 0;
1101 /* Determine the highest policy constant we want to allow */
1102 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1103 if (permitted_policies
[i
] > max_policy
)
1104 max_policy
= permitted_policies
[i
];
1106 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1107 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1110 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1112 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1116 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1118 for (p
= 0; p
< max_policy
; p
++) {
1121 /* Check if this is in the whitelist. */
1122 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1123 if (permitted_policies
[i
] == p
) {
1131 /* Deny this policy */
1132 r
= seccomp_rule_add_exact(
1134 SCMP_ACT_ERRNO(EPERM
),
1135 SCMP_SYS(sched_setscheduler
),
1137 SCMP_A1(SCMP_CMP_EQ
, p
));
1139 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1144 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1145 * unsigned here, hence no need to check for < 0 values. */
1146 r
= seccomp_rule_add_exact(
1148 SCMP_ACT_ERRNO(EPERM
),
1149 SCMP_SYS(sched_setscheduler
),
1151 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1153 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1157 r
= seccomp_load(seccomp
);
1158 if (IN_SET(r
, -EPERM
, -EACCES
))
1161 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1167 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1170 unsigned int arg_cnt
,
1171 const struct scmp_arg_cmp arg
) {
1174 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1176 _cleanup_free_
char *n
= NULL
;
1178 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1179 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1181 seccomp_arch_to_string(arch
));
1187 /* For known architectures, check that syscalls are indeed defined or not. */
1188 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1189 assert_cc(SCMP_SYS(shmget
) > 0);
1190 assert_cc(SCMP_SYS(shmat
) > 0);
1191 assert_cc(SCMP_SYS(shmdt
) > 0);
1192 #elif defined(__i386__) || defined(__powerpc64__)
1193 assert_cc(SCMP_SYS(shmget
) < 0);
1194 assert_cc(SCMP_SYS(shmat
) < 0);
1195 assert_cc(SCMP_SYS(shmdt
) < 0);
1198 int seccomp_memory_deny_write_execute(void) {
1203 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1204 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1205 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0;
1207 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1212 filter_syscall
= SCMP_SYS(mmap2
);
1213 block_syscall
= SCMP_SYS(mmap
);
1216 case SCMP_ARCH_PPC64
:
1217 case SCMP_ARCH_PPC64LE
:
1218 filter_syscall
= SCMP_SYS(mmap
);
1220 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1221 * We ignore that here, which means there's still a way to get writable/executable
1222 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1227 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1228 shmat_syscall
= SCMP_SYS(shmat
);
1231 case SCMP_ARCH_X86_64
:
1233 case SCMP_ARCH_AARCH64
:
1234 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, and arm64 have only mmap */
1235 shmat_syscall
= SCMP_SYS(shmat
);
1238 /* Please add more definitions here, if you port systemd to other architectures! */
1240 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1241 #warning "Consider adding the right mmap() syscall definitions here!"
1245 /* Can't filter mmap() on this arch, then skip it */
1246 if (filter_syscall
== 0)
1249 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1253 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1255 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1259 if (block_syscall
!= 0) {
1260 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1265 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1267 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1271 if (shmat_syscall
!= 0) {
1272 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(shmat
),
1274 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1279 r
= seccomp_load(seccomp
);
1280 if (IN_SET(r
, -EPERM
, -EACCES
))
1283 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1289 int seccomp_restrict_archs(Set
*archs
) {
1290 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1295 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1298 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1302 SET_FOREACH(id
, archs
, i
) {
1303 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1310 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1314 return seccomp_load(seccomp
);