2 This file is part of systemd.
4 Copyright 2014 Lennart Poettering
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
21 #include <linux/seccomp.h>
25 #include <sys/prctl.h>
29 #include "alloc-util.h"
32 #include "process-util.h"
33 #include "seccomp-util.h"
35 #include "string-util.h"
38 #include "errno-list.h"
40 const uint32_t seccomp_local_archs
[] = {
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
44 #if defined(__x86_64__) && defined(__ILP32__)
47 SCMP_ARCH_X32
, /* native */
48 #elif defined(__x86_64__) && !defined(__ILP32__)
51 SCMP_ARCH_X86_64
, /* native */
52 #elif defined(__i386__)
54 #elif defined(__aarch64__)
56 SCMP_ARCH_AARCH64
, /* native */
57 #elif defined(__arm__)
59 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
61 SCMP_ARCH_MIPS
, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
64 SCMP_ARCH_MIPSEL
, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
68 SCMP_ARCH_MIPSEL64N32
,
71 SCMP_ARCH_MIPS64
, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
76 SCMP_ARCH_MIPSEL64N32
,
78 SCMP_ARCH_MIPSEL64
, /* native */
79 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
84 SCMP_ARCH_MIPSEL64N32
,
85 SCMP_ARCH_MIPS64N32
, /* native */
86 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
92 SCMP_ARCH_MIPSEL64N32
, /* native */
93 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
96 SCMP_ARCH_PPC64
, /* native */
97 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
100 SCMP_ARCH_PPC64LE
, /* native */
101 #elif defined(__powerpc__)
103 #elif defined(__s390x__)
105 SCMP_ARCH_S390X
, /* native */
106 #elif defined(__s390__)
112 const char* seccomp_arch_to_string(uint32_t c
) {
113 /* Maintain order used in <seccomp.h>.
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
119 case SCMP_ARCH_NATIVE
:
123 case SCMP_ARCH_X86_64
:
129 case SCMP_ARCH_AARCH64
:
133 case SCMP_ARCH_MIPS64
:
135 case SCMP_ARCH_MIPS64N32
:
137 case SCMP_ARCH_MIPSEL
:
139 case SCMP_ARCH_MIPSEL64
:
141 case SCMP_ARCH_MIPSEL64N32
:
142 return "mips64-le-n32";
145 case SCMP_ARCH_PPC64
:
147 case SCMP_ARCH_PPC64LE
:
151 case SCMP_ARCH_S390X
:
158 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
164 if (streq(n
, "native"))
165 *ret
= SCMP_ARCH_NATIVE
;
166 else if (streq(n
, "x86"))
167 *ret
= SCMP_ARCH_X86
;
168 else if (streq(n
, "x86-64"))
169 *ret
= SCMP_ARCH_X86_64
;
170 else if (streq(n
, "x32"))
171 *ret
= SCMP_ARCH_X32
;
172 else if (streq(n
, "arm"))
173 *ret
= SCMP_ARCH_ARM
;
174 else if (streq(n
, "arm64"))
175 *ret
= SCMP_ARCH_AARCH64
;
176 else if (streq(n
, "mips"))
177 *ret
= SCMP_ARCH_MIPS
;
178 else if (streq(n
, "mips64"))
179 *ret
= SCMP_ARCH_MIPS64
;
180 else if (streq(n
, "mips64-n32"))
181 *ret
= SCMP_ARCH_MIPS64N32
;
182 else if (streq(n
, "mips-le"))
183 *ret
= SCMP_ARCH_MIPSEL
;
184 else if (streq(n
, "mips64-le"))
185 *ret
= SCMP_ARCH_MIPSEL64
;
186 else if (streq(n
, "mips64-le-n32"))
187 *ret
= SCMP_ARCH_MIPSEL64N32
;
188 else if (streq(n
, "ppc"))
189 *ret
= SCMP_ARCH_PPC
;
190 else if (streq(n
, "ppc64"))
191 *ret
= SCMP_ARCH_PPC64
;
192 else if (streq(n
, "ppc64-le"))
193 *ret
= SCMP_ARCH_PPC64LE
;
194 else if (streq(n
, "s390"))
195 *ret
= SCMP_ARCH_S390
;
196 else if (streq(n
, "s390x"))
197 *ret
= SCMP_ARCH_S390X
;
204 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
205 scmp_filter_ctx seccomp
;
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
211 seccomp
= seccomp_init(default_action
);
215 if (arch
!= SCMP_ARCH_NATIVE
&&
216 arch
!= seccomp_arch_native()) {
218 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
222 r
= seccomp_arch_add(seccomp
, arch
);
226 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
227 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
228 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
230 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
231 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
234 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
238 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
246 seccomp_release(seccomp
);
250 static bool is_basic_seccomp_available(void) {
251 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
254 static bool is_seccomp_filter_available(void) {
255 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
259 bool is_seccomp_available(void) {
260 static int cached_enabled
= -1;
262 if (cached_enabled
< 0)
264 is_basic_seccomp_available() &&
265 is_seccomp_filter_available();
267 return cached_enabled
;
270 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
271 [SYSCALL_FILTER_SET_DEFAULT
] = {
273 .help
= "System calls that are always permitted",
300 "getrlimit\0" /* make sure processes can query stack size and such */
320 [SYSCALL_FILTER_SET_AIO
] = {
322 .help
= "Asynchronous IO",
330 [SYSCALL_FILTER_SET_BASIC_IO
] = {
351 [SYSCALL_FILTER_SET_CHOWN
] = {
353 .help
= "Change ownership of files and directories",
363 [SYSCALL_FILTER_SET_CLOCK
] = {
365 .help
= "Change the system time",
373 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
374 .name
= "@cpu-emulation",
375 .help
= "System calls for CPU emulation functionality",
383 [SYSCALL_FILTER_SET_DEBUG
] = {
385 .help
= "Debugging, performance monitoring and tracing functionality",
390 "process_vm_writev\0"
393 #ifdef __NR_s390_runtime_instr
394 "s390_runtime_instr\0"
396 "sys_debug_setcontext\0"
398 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
399 .name
= "@file-system",
400 .help
= "File system operations",
430 "inotify_add_watch\0"
481 [SYSCALL_FILTER_SET_IO_EVENT
] = {
483 .help
= "Event loop system calls",
500 [SYSCALL_FILTER_SET_IPC
] = {
502 .help
= "SysV IPC, POSIX Message Queues or other IPC",
519 "process_vm_writev\0"
529 [SYSCALL_FILTER_SET_KEYRING
] = {
531 .help
= "Kernel keyring access",
537 [SYSCALL_FILTER_SET_MEMLOCK
] = {
539 .help
= "Memory locking control",
547 [SYSCALL_FILTER_SET_MODULE
] = {
549 .help
= "Loading and unloading of kernel modules",
555 [SYSCALL_FILTER_SET_MOUNT
] = {
557 .help
= "Mounting and unmounting of file systems",
565 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
566 .name
= "@network-io",
567 .help
= "Network or Unix socket IO, should not be needed if not network facing",
591 [SYSCALL_FILTER_SET_OBSOLETE
] = {
592 /* some unknown even to libseccomp */
594 .help
= "Unusual, obsolete or unimplemented system calls",
623 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
624 .name
= "@privileged",
625 .help
= "All system calls which need super-user capabilities",
658 [SYSCALL_FILTER_SET_PROCESS
] = {
660 .help
= "Process control, execution, namespaceing operations",
663 "capget\0" /* Able to query arbitrary processes */
671 "rt_tgsigqueueinfo\0"
682 [SYSCALL_FILTER_SET_RAW_IO
] = {
684 .help
= "Raw I/O port access",
691 #ifdef __NR_s390_pci_mmio_read
692 "s390_pci_mmio_read\0"
694 #ifdef __NR_s390_pci_mmio_write
695 "s390_pci_mmio_write\0"
698 [SYSCALL_FILTER_SET_REBOOT
] = {
700 .help
= "Reboot and reboot preparation/kexec",
706 [SYSCALL_FILTER_SET_RESOURCES
] = {
707 .name
= "@resources",
708 .help
= "Alter resource settings",
715 "sched_setaffinity\0"
718 "sched_setscheduler\0"
723 [SYSCALL_FILTER_SET_SETUID
] = {
725 .help
= "Operations for changing user/group credentials",
742 [SYSCALL_FILTER_SET_SIGNAL
] = {
744 .help
= "Process signal handling",
760 [SYSCALL_FILTER_SET_SWAP
] = {
762 .help
= "Enable/disable swap devices",
767 [SYSCALL_FILTER_SET_SYNC
] = {
769 .help
= "Synchronize files and memory to storage",
778 [SYSCALL_FILTER_SET_TIMER
] = {
780 .help
= "Schedule operations by time",
797 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
800 if (isempty(name
) || name
[0] != '@')
803 for (i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
804 if (streq(syscall_filter_sets
[i
].name
, name
))
805 return syscall_filter_sets
+ i
;
810 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp
, const SyscallFilterSet
*set
, uint32_t action
, char **exclude
);
812 int seccomp_add_syscall_filter_item(scmp_filter_ctx
*seccomp
, const char *name
, uint32_t action
, char **exclude
) {
818 if (strv_contains(exclude
, name
))
821 if (name
[0] == '@') {
822 const SyscallFilterSet
*other
;
824 other
= syscall_filter_set_find(name
);
826 log_debug("Filter set %s is not known!", name
);
830 r
= seccomp_add_syscall_filter_set(seccomp
, other
, action
, exclude
);
836 id
= seccomp_syscall_resolve_name(name
);
837 if (id
== __NR_SCMP_ERROR
) {
838 log_debug("System call %s is not known, ignoring.", name
);
842 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
844 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
845 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", name
, id
);
851 static int seccomp_add_syscall_filter_set(
852 scmp_filter_ctx seccomp
,
853 const SyscallFilterSet
*set
,
863 NULSTR_FOREACH(sys
, set
->value
) {
864 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
);
872 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
) {
878 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
879 * each local arch. */
881 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
882 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
884 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
886 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
890 r
= seccomp_add_syscall_filter_set(seccomp
, set
, action
, NULL
);
892 log_debug_errno(r
, "Failed to add filter set, ignoring: %m");
896 r
= seccomp_load(seccomp
);
897 if (IN_SET(r
, -EPERM
, -EACCES
))
900 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
906 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Set
* set
, uint32_t action
) {
910 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
911 * SyscallFilterSet* table. */
913 if (set_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
916 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
917 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
921 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
923 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
927 SET_FOREACH(id
, set
, i
) {
928 r
= seccomp_rule_add_exact(seccomp
, action
, PTR_TO_INT(id
) - 1, 0);
930 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
931 _cleanup_free_
char *n
= NULL
;
933 n
= seccomp_syscall_resolve_num_arch(arch
, PTR_TO_INT(id
) - 1);
934 log_debug_errno(r
, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n
), PTR_TO_INT(id
) - 1);
938 r
= seccomp_load(seccomp
);
939 if (IN_SET(r
, -EPERM
, -EACCES
))
942 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
948 int seccomp_restrict_namespaces(unsigned long retain
) {
952 if (log_get_max_level() >= LOG_DEBUG
) {
953 _cleanup_free_
char *s
= NULL
;
955 (void) namespace_flag_to_string_many(retain
, &s
);
956 log_debug("Restricting namespace to: %s.", strna(s
));
960 if ((retain
& NAMESPACE_FLAGS_ALL
) == NAMESPACE_FLAGS_ALL
)
963 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
964 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
967 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
969 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
973 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
974 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
976 r
= seccomp_rule_add_exact(
978 SCMP_ACT_ERRNO(EPERM
),
982 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
983 * special invocation with a zero flags argument, right here. */
984 r
= seccomp_rule_add_exact(
986 SCMP_ACT_ERRNO(EPERM
),
989 SCMP_A1(SCMP_CMP_EQ
, 0));
991 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
995 for (i
= 0; namespace_flag_map
[i
].name
; i
++) {
998 f
= namespace_flag_map
[i
].flag
;
999 if ((retain
& f
) == f
) {
1000 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1004 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1006 r
= seccomp_rule_add_exact(
1008 SCMP_ACT_ERRNO(EPERM
),
1011 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1013 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1017 /* On s390/s390x the first two parameters to clone are switched */
1018 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1019 r
= seccomp_rule_add_exact(
1021 SCMP_ACT_ERRNO(EPERM
),
1024 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1026 r
= seccomp_rule_add_exact(
1028 SCMP_ACT_ERRNO(EPERM
),
1031 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1033 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1037 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1038 r
= seccomp_rule_add_exact(
1040 SCMP_ACT_ERRNO(EPERM
),
1043 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1045 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1053 r
= seccomp_load(seccomp
);
1054 if (IN_SET(r
, -EPERM
, -EACCES
))
1057 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1063 int seccomp_protect_sysctl(void) {
1067 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1068 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1070 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1072 if (IN_SET(arch
, SCMP_ARCH_X32
, SCMP_ARCH_AARCH64
))
1073 /* No _sysctl syscall */
1076 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1080 r
= seccomp_rule_add_exact(
1082 SCMP_ACT_ERRNO(EPERM
),
1086 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1090 r
= seccomp_load(seccomp
);
1091 if (IN_SET(r
, -EPERM
, -EACCES
))
1094 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1100 int seccomp_restrict_address_families(Set
*address_families
, bool whitelist
) {
1104 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1105 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1109 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1113 case SCMP_ARCH_X86_64
:
1116 case SCMP_ARCH_AARCH64
:
1117 case SCMP_ARCH_PPC64
:
1118 case SCMP_ARCH_PPC64LE
:
1119 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1123 case SCMP_ARCH_S390
:
1124 case SCMP_ARCH_S390X
:
1128 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1137 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1142 int af
, first
= 0, last
= 0;
1145 /* If this is a whitelist, we first block the address families that are out of range and then
1146 * everything that is not in the set. First, we find the lowest and highest address family in
1149 SET_FOREACH(afp
, address_families
, i
) {
1150 af
= PTR_TO_INT(afp
);
1152 if (af
<= 0 || af
>= af_max())
1155 if (first
== 0 || af
< first
)
1158 if (last
== 0 || af
> last
)
1162 assert((first
== 0) == (last
== 0));
1166 /* No entries in the valid range, block everything */
1167 r
= seccomp_rule_add_exact(
1169 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1173 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1179 /* Block everything below the first entry */
1180 r
= seccomp_rule_add_exact(
1182 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1185 SCMP_A0(SCMP_CMP_LT
, first
));
1187 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1191 /* Block everything above the last entry */
1192 r
= seccomp_rule_add_exact(
1194 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1197 SCMP_A0(SCMP_CMP_GT
, last
));
1199 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1203 /* Block everything between the first and last entry */
1204 for (af
= 1; af
< af_max(); af
++) {
1206 if (set_contains(address_families
, INT_TO_PTR(af
)))
1209 r
= seccomp_rule_add_exact(
1211 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1214 SCMP_A0(SCMP_CMP_EQ
, af
));
1219 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1227 /* If this is a blacklist, then generate one rule for
1228 * each address family that are then combined in OR
1231 SET_FOREACH(af
, address_families
, i
) {
1233 r
= seccomp_rule_add_exact(
1235 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1238 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1243 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1248 r
= seccomp_load(seccomp
);
1249 if (IN_SET(r
, -EPERM
, -EACCES
))
1252 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1258 int seccomp_restrict_realtime(void) {
1259 static const int permitted_policies
[] = {
1265 int r
, max_policy
= 0;
1269 /* Determine the highest policy constant we want to allow */
1270 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1271 if (permitted_policies
[i
] > max_policy
)
1272 max_policy
= permitted_policies
[i
];
1274 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1275 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1278 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1280 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1284 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1286 for (p
= 0; p
< max_policy
; p
++) {
1289 /* Check if this is in the whitelist. */
1290 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1291 if (permitted_policies
[i
] == p
) {
1299 /* Deny this policy */
1300 r
= seccomp_rule_add_exact(
1302 SCMP_ACT_ERRNO(EPERM
),
1303 SCMP_SYS(sched_setscheduler
),
1305 SCMP_A1(SCMP_CMP_EQ
, p
));
1307 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1312 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1313 * unsigned here, hence no need to check for < 0 values. */
1314 r
= seccomp_rule_add_exact(
1316 SCMP_ACT_ERRNO(EPERM
),
1317 SCMP_SYS(sched_setscheduler
),
1319 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1321 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1325 r
= seccomp_load(seccomp
);
1326 if (IN_SET(r
, -EPERM
, -EACCES
))
1329 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1335 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1338 unsigned int arg_cnt
,
1339 const struct scmp_arg_cmp arg
) {
1342 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1344 _cleanup_free_
char *n
= NULL
;
1346 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1347 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1349 seccomp_arch_to_string(arch
));
1355 /* For known architectures, check that syscalls are indeed defined or not. */
1356 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1357 assert_cc(SCMP_SYS(shmget
) > 0);
1358 assert_cc(SCMP_SYS(shmat
) > 0);
1359 assert_cc(SCMP_SYS(shmdt
) > 0);
1360 #elif defined(__i386__) || defined(__powerpc64__)
1361 assert_cc(SCMP_SYS(shmget
) < 0);
1362 assert_cc(SCMP_SYS(shmat
) < 0);
1363 assert_cc(SCMP_SYS(shmdt
) < 0);
1366 int seccomp_memory_deny_write_execute(void) {
1371 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1372 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1373 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0;
1375 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1380 filter_syscall
= SCMP_SYS(mmap2
);
1381 block_syscall
= SCMP_SYS(mmap
);
1384 case SCMP_ARCH_PPC64
:
1385 case SCMP_ARCH_PPC64LE
:
1386 filter_syscall
= SCMP_SYS(mmap
);
1388 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1389 * We ignore that here, which means there's still a way to get writable/executable
1390 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1395 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1396 shmat_syscall
= SCMP_SYS(shmat
);
1399 case SCMP_ARCH_X86_64
:
1401 case SCMP_ARCH_AARCH64
:
1402 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, and arm64 have only mmap */
1403 shmat_syscall
= SCMP_SYS(shmat
);
1406 /* Please add more definitions here, if you port systemd to other architectures! */
1408 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1409 #warning "Consider adding the right mmap() syscall definitions here!"
1413 /* Can't filter mmap() on this arch, then skip it */
1414 if (filter_syscall
== 0)
1417 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1421 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1423 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1427 if (block_syscall
!= 0) {
1428 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1433 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1435 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1439 if (shmat_syscall
!= 0) {
1440 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(shmat
),
1442 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1447 r
= seccomp_load(seccomp
);
1448 if (IN_SET(r
, -EPERM
, -EACCES
))
1451 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1457 int seccomp_restrict_archs(Set
*archs
) {
1458 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1463 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1466 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1470 SET_FOREACH(id
, archs
, i
) {
1471 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1478 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1482 r
= seccomp_load(seccomp
);
1483 if (IN_SET(r
, -EPERM
, -EACCES
))
1486 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1491 int parse_syscall_archs(char **l
, Set
**archs
) {
1492 _cleanup_set_free_ Set
*_archs
;
1499 r
= set_ensure_allocated(&_archs
, NULL
);
1503 STRV_FOREACH(s
, l
) {
1506 r
= seccomp_arch_from_string(*s
, &a
);
1510 r
= set_put(_archs
, UINT32_TO_PTR(a
+ 1));
1521 int seccomp_filter_set_add(Set
*filter
, bool add
, const SyscallFilterSet
*set
) {
1527 NULSTR_FOREACH(i
, set
->value
) {
1530 const SyscallFilterSet
*more
;
1532 more
= syscall_filter_set_find(i
);
1536 r
= seccomp_filter_set_add(filter
, add
, more
);
1542 id
= seccomp_syscall_resolve_name(i
);
1543 if (id
== __NR_SCMP_ERROR
) {
1544 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1549 r
= set_put(filter
, INT_TO_PTR(id
+ 1));
1553 (void) set_remove(filter
, INT_TO_PTR(id
+ 1));
1560 int seccomp_lock_personality(unsigned long personality
) {
1564 if (personality
>= PERSONALITY_INVALID
)
1567 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1568 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1570 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1574 r
= seccomp_rule_add_exact(
1576 SCMP_ACT_ERRNO(EPERM
),
1577 SCMP_SYS(personality
),
1579 SCMP_A0(SCMP_CMP_NE
, personality
));
1581 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1585 r
= seccomp_load(seccomp
);
1586 if (IN_SET(r
, -EPERM
, -EACCES
))
1589 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));