1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include <linux/seccomp.h>
14 #include "alloc-util.h"
16 #include "errno-list.h"
19 #include "nulstr-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
23 #include "string-util.h"
26 const uint32_t seccomp_local_archs
[] = {
28 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
30 #if defined(__x86_64__) && defined(__ILP32__)
33 SCMP_ARCH_X32
, /* native */
34 #elif defined(__x86_64__) && !defined(__ILP32__)
37 SCMP_ARCH_X86_64
, /* native */
38 #elif defined(__i386__)
40 #elif defined(__aarch64__)
42 SCMP_ARCH_AARCH64
, /* native */
43 #elif defined(__arm__)
45 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
47 SCMP_ARCH_MIPS
, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
50 SCMP_ARCH_MIPSEL
, /* native */
51 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
54 SCMP_ARCH_MIPSEL64N32
,
57 SCMP_ARCH_MIPS64
, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
62 SCMP_ARCH_MIPSEL64N32
,
64 SCMP_ARCH_MIPSEL64
, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPSEL64N32
,
71 SCMP_ARCH_MIPS64N32
, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
78 SCMP_ARCH_MIPSEL64N32
, /* native */
79 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
82 SCMP_ARCH_PPC64
, /* native */
83 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
86 SCMP_ARCH_PPC64LE
, /* native */
87 #elif defined(__powerpc__)
89 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
91 #elif defined(__s390x__)
93 SCMP_ARCH_S390X
, /* native */
94 #elif defined(__s390__)
100 const char* seccomp_arch_to_string(uint32_t c
) {
101 /* Maintain order used in <seccomp.h>.
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
107 case SCMP_ARCH_NATIVE
:
111 case SCMP_ARCH_X86_64
:
117 case SCMP_ARCH_AARCH64
:
121 case SCMP_ARCH_MIPS64
:
123 case SCMP_ARCH_MIPS64N32
:
125 case SCMP_ARCH_MIPSEL
:
127 case SCMP_ARCH_MIPSEL64
:
129 case SCMP_ARCH_MIPSEL64N32
:
130 return "mips64-le-n32";
133 case SCMP_ARCH_PPC64
:
135 case SCMP_ARCH_PPC64LE
:
137 #ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64
:
143 case SCMP_ARCH_S390X
:
150 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
156 if (streq(n
, "native"))
157 *ret
= SCMP_ARCH_NATIVE
;
158 else if (streq(n
, "x86"))
159 *ret
= SCMP_ARCH_X86
;
160 else if (streq(n
, "x86-64"))
161 *ret
= SCMP_ARCH_X86_64
;
162 else if (streq(n
, "x32"))
163 *ret
= SCMP_ARCH_X32
;
164 else if (streq(n
, "arm"))
165 *ret
= SCMP_ARCH_ARM
;
166 else if (streq(n
, "arm64"))
167 *ret
= SCMP_ARCH_AARCH64
;
168 else if (streq(n
, "mips"))
169 *ret
= SCMP_ARCH_MIPS
;
170 else if (streq(n
, "mips64"))
171 *ret
= SCMP_ARCH_MIPS64
;
172 else if (streq(n
, "mips64-n32"))
173 *ret
= SCMP_ARCH_MIPS64N32
;
174 else if (streq(n
, "mips-le"))
175 *ret
= SCMP_ARCH_MIPSEL
;
176 else if (streq(n
, "mips64-le"))
177 *ret
= SCMP_ARCH_MIPSEL64
;
178 else if (streq(n
, "mips64-le-n32"))
179 *ret
= SCMP_ARCH_MIPSEL64N32
;
180 else if (streq(n
, "ppc"))
181 *ret
= SCMP_ARCH_PPC
;
182 else if (streq(n
, "ppc64"))
183 *ret
= SCMP_ARCH_PPC64
;
184 else if (streq(n
, "ppc64-le"))
185 *ret
= SCMP_ARCH_PPC64LE
;
186 #ifdef SCMP_ARCH_RISCV64
187 else if (streq(n
, "riscv64"))
188 *ret
= SCMP_ARCH_RISCV64
;
190 else if (streq(n
, "s390"))
191 *ret
= SCMP_ARCH_S390
;
192 else if (streq(n
, "s390x"))
193 *ret
= SCMP_ARCH_S390X
;
200 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
201 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
207 seccomp
= seccomp_init(default_action
);
211 if (arch
!= SCMP_ARCH_NATIVE
&&
212 arch
!= seccomp_arch_native()) {
214 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
218 r
= seccomp_arch_add(seccomp
, arch
);
222 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
223 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
224 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
226 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
227 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
230 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
234 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
238 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_LOG
, 1);
242 log_debug_errno(r
, "Failed to enable seccomp event logging: %m");
246 *ret
= TAKE_PTR(seccomp
);
250 static bool is_basic_seccomp_available(void) {
251 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
254 static bool is_seccomp_filter_available(void) {
255 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
259 bool is_seccomp_available(void) {
260 static int cached_enabled
= -1;
262 if (cached_enabled
< 0) {
265 b
= getenv_bool_secure("SYSTEMD_SECCOMP");
267 if (b
< 0 && b
!= -ENXIO
) /* ENXIO: env var unset */
268 log_debug_errno(b
, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");
271 is_basic_seccomp_available() &&
272 is_seccomp_filter_available();
274 cached_enabled
= false;
277 return cached_enabled
;
280 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
281 [SYSCALL_FILTER_SET_DEFAULT
] = {
283 .help
= "System calls that are always permitted",
287 "clock_getres_time64\0"
291 "clock_nanosleep_time64\0"
315 "getrlimit\0" /* make sure processes can query stack size and such */
337 [SYSCALL_FILTER_SET_AIO
] = {
339 .help
= "Asynchronous IO",
345 "io_pgetevents_time64\0"
349 "io_uring_register\0"
352 [SYSCALL_FILTER_SET_BASIC_IO
] = {
374 [SYSCALL_FILTER_SET_CHOWN
] = {
376 .help
= "Change ownership of files and directories",
386 [SYSCALL_FILTER_SET_CLOCK
] = {
388 .help
= "Change the system time",
397 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
398 .name
= "@cpu-emulation",
399 .help
= "System calls for CPU emulation functionality",
407 [SYSCALL_FILTER_SET_DEBUG
] = {
409 .help
= "Debugging, performance monitoring and tracing functionality",
416 #if defined __s390__ || defined __s390x__
417 "s390_runtime_instr\0"
419 "sys_debug_setcontext\0"
421 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
422 .name
= "@file-system",
423 .help
= "File system operations",
454 "inotify_add_watch\0"
505 [SYSCALL_FILTER_SET_IO_EVENT
] = {
507 .help
= "Event loop system calls",
526 [SYSCALL_FILTER_SET_IPC
] = {
528 .help
= "SysV IPC, POSIX Message Queues or other IPC",
536 "mq_timedreceive_time64\0"
538 "mq_timedsend_time64\0"
547 "process_vm_writev\0"
552 "semtimedop_time64\0"
558 [SYSCALL_FILTER_SET_KEYRING
] = {
560 .help
= "Kernel keyring access",
566 [SYSCALL_FILTER_SET_MEMLOCK
] = {
568 .help
= "Memory locking control",
576 [SYSCALL_FILTER_SET_MODULE
] = {
578 .help
= "Loading and unloading of kernel modules",
584 [SYSCALL_FILTER_SET_MOUNT
] = {
586 .help
= "Mounting and unmounting of file systems",
600 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
601 .name
= "@network-io",
602 .help
= "Network or Unix socket IO, should not be needed if not network facing",
627 [SYSCALL_FILTER_SET_OBSOLETE
] = {
628 /* some unknown even to libseccomp */
630 .help
= "Unusual, obsolete or unimplemented system calls",
660 [SYSCALL_FILTER_SET_PKEY
] = {
662 .help
= "System calls used for memory protection keys",
668 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
669 .name
= "@privileged",
670 .help
= "All system calls which need super-user capabilities",
686 "open_by_handle_at\0"
699 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
703 [SYSCALL_FILTER_SET_PROCESS
] = {
705 .help
= "Process control, execution, namespacing operations",
708 "capget\0" /* Able to query arbitrary processes */
716 "pidfd_send_signal\0"
719 "rt_tgsigqueueinfo\0"
721 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
731 [SYSCALL_FILTER_SET_RAW_IO
] = {
733 .help
= "Raw I/O port access",
740 #if defined __s390__ || defined __s390x__
741 "s390_pci_mmio_read\0"
742 "s390_pci_mmio_write\0"
745 [SYSCALL_FILTER_SET_REBOOT
] = {
747 .help
= "Reboot and reboot preparation/kexec",
753 [SYSCALL_FILTER_SET_RESOURCES
] = {
754 .name
= "@resources",
755 .help
= "Alter resource settings",
762 "sched_setaffinity\0"
765 "sched_setscheduler\0"
770 [SYSCALL_FILTER_SET_SETUID
] = {
772 .help
= "Operations for changing user/group credentials",
789 [SYSCALL_FILTER_SET_SIGNAL
] = {
791 .help
= "Process signal handling",
798 "rt_sigtimedwait_time64\0"
808 [SYSCALL_FILTER_SET_SWAP
] = {
810 .help
= "Enable/disable swap devices",
815 [SYSCALL_FILTER_SET_SYNC
] = {
817 .help
= "Synchronize files and memory to storage",
827 [SYSCALL_FILTER_SET_SYSTEM_SERVICE
] = {
828 .name
= "@system-service",
829 .help
= "General system service operations",
864 "name_to_handle_at\0"
871 "sched_get_priority_max\0"
872 "sched_get_priority_min\0"
873 "sched_getaffinity\0"
876 "sched_getscheduler\0"
877 "sched_rr_get_interval\0"
878 "sched_rr_get_interval_time64\0"
896 [SYSCALL_FILTER_SET_TIMER
] = {
898 .help
= "Schedule operations by time",
912 "timerfd_gettime64\0"
914 "timerfd_settime64\0"
917 [SYSCALL_FILTER_SET_KNOWN
] = {
919 .help
= "All known syscalls declared in the kernel",
921 #include "syscall-list.h"
925 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
926 if (isempty(name
) || name
[0] != '@')
929 for (unsigned i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
930 if (streq(syscall_filter_sets
[i
].name
, name
))
931 return syscall_filter_sets
+ i
;
936 static int add_syscall_filter_set(
937 scmp_filter_ctx seccomp
,
938 const SyscallFilterSet
*set
,
944 int seccomp_add_syscall_filter_item(
945 scmp_filter_ctx
*seccomp
,
955 if (strv_contains(exclude
, name
))
958 /* Any syscalls that are handled are added to the *added strv. The pointer
959 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
961 if (name
[0] == '@') {
962 const SyscallFilterSet
*other
;
964 other
= syscall_filter_set_find(name
);
966 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
967 "Filter set %s is not known!",
970 return add_syscall_filter_set(seccomp
, other
, action
, exclude
, log_missing
, added
);
975 id
= seccomp_syscall_resolve_name(name
);
976 if (id
== __NR_SCMP_ERROR
) {
978 log_debug("System call %s is not known, ignoring.", name
);
982 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
984 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
985 bool ignore
= r
== -EDOM
;
987 if (!ignore
|| log_missing
)
988 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
989 name
, id
, ignore
? ", ignoring" : "");
995 r
= strv_extend(added
, name
);
1004 static int add_syscall_filter_set(
1005 scmp_filter_ctx seccomp
,
1006 const SyscallFilterSet
*set
,
1015 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1020 NULSTR_FOREACH(sys
, set
->value
) {
1021 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
, log_missing
, added
);
1029 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
, bool log_missing
) {
1035 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1036 * each local arch. */
1038 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1039 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1041 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1043 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1047 r
= add_syscall_filter_set(seccomp
, set
, action
, NULL
, log_missing
, NULL
);
1049 return log_debug_errno(r
, "Failed to add filter set: %m");
1051 r
= seccomp_load(seccomp
);
1052 if (ERRNO_IS_SECCOMP_FATAL(r
))
1055 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1061 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* set
, uint32_t action
, bool log_missing
) {
1065 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead of a
1066 * SyscallFilterSet* table. */
1068 if (hashmap_isempty(set
) && default_action
== SCMP_ACT_ALLOW
)
1071 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1072 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1073 void *syscall_id
, *val
;
1075 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1077 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1081 HASHMAP_FOREACH_KEY(val
, syscall_id
, set
) {
1082 uint32_t a
= action
;
1083 int id
= PTR_TO_INT(syscall_id
) - 1;
1084 int error
= PTR_TO_INT(val
);
1086 if (error
== SECCOMP_ERROR_NUMBER_KILL
)
1087 a
= scmp_act_kill_process();
1089 else if (action
== SCMP_ACT_LOG
)
1092 else if (action
!= SCMP_ACT_ALLOW
&& error
>= 0)
1093 a
= SCMP_ACT_ERRNO(error
);
1095 r
= seccomp_rule_add_exact(seccomp
, a
, id
, 0);
1097 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1098 _cleanup_free_
char *n
= NULL
;
1101 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, id
);
1102 ignore
= r
== -EDOM
;
1103 if (!ignore
|| log_missing
)
1104 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1105 strna(n
), id
, ignore
? ", ignoring" : "");
1111 r
= seccomp_load(seccomp
);
1112 if (ERRNO_IS_SECCOMP_FATAL(r
))
1115 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1121 int seccomp_parse_syscall_filter(
1125 SeccompParseFlags flags
,
1127 const char *filename
,
1135 if (name
[0] == '@') {
1136 const SyscallFilterSet
*set
;
1139 set
= syscall_filter_set_find(name
);
1141 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
1144 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1145 "Unknown system call group, ignoring: %s", name
);
1149 NULSTR_FOREACH(i
, set
->value
) {
1150 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1151 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1152 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1154 r
= seccomp_parse_syscall_filter(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
1161 id
= seccomp_syscall_resolve_name(name
);
1162 if (id
== __NR_SCMP_ERROR
) {
1163 if (!(flags
& SECCOMP_PARSE_PERMISSIVE
))
1166 log_syntax(unit
, flags
& SECCOMP_PARSE_LOG
? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1167 "Failed to parse system call, ignoring: %s", name
);
1171 /* If we previously wanted to forbid a syscall and now
1172 * we want to allow it, then remove it from the list. */
1173 if (!(flags
& SECCOMP_PARSE_INVERT
) == !!(flags
& SECCOMP_PARSE_ALLOW_LIST
)) {
1174 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
1178 return flags
& SECCOMP_PARSE_LOG
? log_oom() : -ENOMEM
;
1180 assert_se(hashmap_update(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
)) == 0);
1186 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1192 int seccomp_restrict_namespaces(unsigned long retain
) {
1196 if (DEBUG_LOGGING
) {
1197 _cleanup_free_
char *s
= NULL
;
1199 (void) namespace_flags_to_string(retain
, &s
);
1200 log_debug("Restricting namespace to: %s.", strna(s
));
1204 if (FLAGS_SET(retain
, NAMESPACE_FLAGS_ALL
))
1207 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1208 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1210 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1212 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1216 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1217 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1219 r
= seccomp_rule_add_exact(
1221 SCMP_ACT_ERRNO(EPERM
),
1225 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1226 * special invocation with a zero flags argument, right here. */
1227 r
= seccomp_rule_add_exact(
1229 SCMP_ACT_ERRNO(EPERM
),
1232 SCMP_A1(SCMP_CMP_EQ
, 0));
1234 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1238 for (unsigned i
= 0; namespace_flag_map
[i
].name
; i
++) {
1241 f
= namespace_flag_map
[i
].flag
;
1242 if (FLAGS_SET(retain
, f
)) {
1243 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1247 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1249 r
= seccomp_rule_add_exact(
1251 SCMP_ACT_ERRNO(EPERM
),
1254 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1256 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1260 /* On s390/s390x the first two parameters to clone are switched */
1261 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1262 r
= seccomp_rule_add_exact(
1264 SCMP_ACT_ERRNO(EPERM
),
1267 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1269 r
= seccomp_rule_add_exact(
1271 SCMP_ACT_ERRNO(EPERM
),
1274 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1276 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1280 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1281 r
= seccomp_rule_add_exact(
1283 SCMP_ACT_ERRNO(EPERM
),
1286 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1288 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1296 r
= seccomp_load(seccomp
);
1297 if (ERRNO_IS_SECCOMP_FATAL(r
))
1300 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1306 int seccomp_protect_sysctl(void) {
1310 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1311 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1313 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1317 #ifdef SCMP_ARCH_RISCV64
1322 /* No _sysctl syscall */
1325 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1329 r
= seccomp_rule_add_exact(
1331 SCMP_ACT_ERRNO(EPERM
),
1335 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1339 r
= seccomp_load(seccomp
);
1340 if (ERRNO_IS_SECCOMP_FATAL(r
))
1343 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1349 int seccomp_protect_syslog(void) {
1353 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1354 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1356 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1360 r
= seccomp_rule_add_exact(
1362 SCMP_ACT_ERRNO(EPERM
),
1367 log_debug_errno(r
, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1371 r
= seccomp_load(seccomp
);
1372 if (ERRNO_IS_SECCOMP_FATAL(r
))
1375 log_debug_errno(r
, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1381 int seccomp_restrict_address_families(Set
*address_families
, bool allow_list
) {
1385 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1386 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1389 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1393 case SCMP_ARCH_X86_64
:
1396 case SCMP_ARCH_AARCH64
:
1398 case SCMP_ARCH_PPC64
:
1399 case SCMP_ARCH_PPC64LE
:
1400 case SCMP_ARCH_MIPSEL64N32
:
1401 case SCMP_ARCH_MIPS64N32
:
1402 case SCMP_ARCH_MIPSEL64
:
1403 case SCMP_ARCH_MIPS64
:
1404 #ifdef SCMP_ARCH_RISCV64
1405 case SCMP_ARCH_RISCV64
:
1407 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1411 case SCMP_ARCH_S390
:
1412 case SCMP_ARCH_S390X
:
1414 case SCMP_ARCH_MIPSEL
:
1415 case SCMP_ARCH_MIPS
:
1417 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1426 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1431 int first
= 0, last
= 0;
1434 /* If this is an allow list, we first block the address families that are out of
1435 * range and then everything that is not in the set. First, we find the lowest and
1436 * highest address family in the set. */
1438 SET_FOREACH(afp
, address_families
) {
1439 int af
= PTR_TO_INT(afp
);
1441 if (af
<= 0 || af
>= af_max())
1444 if (first
== 0 || af
< first
)
1447 if (last
== 0 || af
> last
)
1451 assert((first
== 0) == (last
== 0));
1455 /* No entries in the valid range, block everything */
1456 r
= seccomp_rule_add_exact(
1458 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1462 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1468 /* Block everything below the first entry */
1469 r
= seccomp_rule_add_exact(
1471 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1474 SCMP_A0(SCMP_CMP_LT
, first
));
1476 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1480 /* Block everything above the last entry */
1481 r
= seccomp_rule_add_exact(
1483 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1486 SCMP_A0(SCMP_CMP_GT
, last
));
1488 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1492 /* Block everything between the first and last entry */
1493 for (int af
= 1; af
< af_max(); af
++) {
1495 if (set_contains(address_families
, INT_TO_PTR(af
)))
1498 r
= seccomp_rule_add_exact(
1500 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1503 SCMP_A0(SCMP_CMP_EQ
, af
));
1508 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1516 /* If this is a deny list, then generate one rule for each address family; these rules are
1517 * then combined in OR checks. */
1519 SET_FOREACH(af
, address_families
) {
1520 r
= seccomp_rule_add_exact(
1522 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1525 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1530 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1535 r
= seccomp_load(seccomp
);
1536 if (ERRNO_IS_SECCOMP_FATAL(r
))
1539 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1545 int seccomp_restrict_realtime(void) {
1546 static const int permitted_policies
[] = {
1552 int r
, max_policy
= 0;
1556 /* Determine the highest policy constant we want to allow */
1557 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1558 if (permitted_policies
[i
] > max_policy
)
1559 max_policy
= permitted_policies
[i
];
1561 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1562 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1565 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1567 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1571 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1573 for (p
= 0; p
< max_policy
; p
++) {
1576 /* Check if this is in the allow list. */
1577 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1578 if (permitted_policies
[i
] == p
) {
1586 /* Deny this policy */
1587 r
= seccomp_rule_add_exact(
1589 SCMP_ACT_ERRNO(EPERM
),
1590 SCMP_SYS(sched_setscheduler
),
1592 SCMP_A1(SCMP_CMP_EQ
, p
));
1594 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1599 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1600 * are unsigned here, hence no need to check for < 0 values. */
1601 r
= seccomp_rule_add_exact(
1603 SCMP_ACT_ERRNO(EPERM
),
1604 SCMP_SYS(sched_setscheduler
),
1606 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1608 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1612 r
= seccomp_load(seccomp
);
1613 if (ERRNO_IS_SECCOMP_FATAL(r
))
1616 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1622 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1626 const struct scmp_arg_cmp arg
) {
1629 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1631 _cleanup_free_
char *n
= NULL
;
1633 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1634 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1636 seccomp_arch_to_string(arch
));
1642 /* For known architectures, check that syscalls are indeed defined or not. */
1643 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1644 assert_cc(SCMP_SYS(shmget
) > 0);
1645 assert_cc(SCMP_SYS(shmat
) > 0);
1646 assert_cc(SCMP_SYS(shmdt
) > 0);
1649 int seccomp_memory_deny_write_execute(void) {
1651 unsigned loaded
= 0;
1653 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1654 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1655 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0, r
;
1657 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1661 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1662 * We ignore that here, which means there's still a way to get writable/executable
1663 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1666 case SCMP_ARCH_S390
:
1667 filter_syscall
= SCMP_SYS(mmap2
);
1668 block_syscall
= SCMP_SYS(mmap
);
1669 /* shmat multiplexed, see above */
1673 case SCMP_ARCH_PPC64
:
1674 case SCMP_ARCH_PPC64LE
:
1675 case SCMP_ARCH_S390X
:
1676 filter_syscall
= SCMP_SYS(mmap
);
1677 /* shmat multiplexed, see above */
1681 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1682 shmat_syscall
= SCMP_SYS(shmat
);
1685 case SCMP_ARCH_X86_64
:
1687 case SCMP_ARCH_AARCH64
:
1688 #ifdef SCMP_ARCH_RISCV64
1689 case SCMP_ARCH_RISCV64
:
1691 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, arm64 and riscv64 have only mmap */
1692 shmat_syscall
= SCMP_SYS(shmat
);
1695 /* Please add more definitions here, if you port systemd to other architectures! */
1697 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1698 #warning "Consider adding the right mmap() syscall definitions here!"
1702 /* Can't filter mmap() on this arch, then skip it */
1703 if (filter_syscall
== 0)
1706 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1710 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1712 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1716 if (block_syscall
!= 0) {
1717 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1722 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1724 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1728 #ifdef __NR_pkey_mprotect
1729 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1731 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1736 if (shmat_syscall
> 0) {
1737 r
= add_seccomp_syscall_filter(seccomp
, arch
, shmat_syscall
,
1739 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1744 r
= seccomp_load(seccomp
);
1745 if (ERRNO_IS_SECCOMP_FATAL(r
))
1748 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1749 seccomp_arch_to_string(arch
));
1754 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1759 int seccomp_restrict_archs(Set
*archs
) {
1760 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1764 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1767 * There are some qualifications. However the most important use is to stop processes from bypassing
1768 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1769 * in a non-native architecture. There are no holes in this use case, at least so far. */
1771 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1772 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1773 * to run a program with the restrictions applied. */
1774 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1778 SET_FOREACH(id
, archs
) {
1779 r
= seccomp_arch_add(seccomp
, PTR_TO_UINT32(id
) - 1);
1780 if (r
< 0 && r
!= -EEXIST
)
1784 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1785 * syscalls should basically match x86-64 for everything except the pointer type.
1786 * The important thing is that you can block the old 32-bit x86 syscalls.
1787 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1789 if (seccomp_arch_native() == SCMP_ARCH_X32
||
1790 set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1))) {
1792 r
= seccomp_arch_add(seccomp
, SCMP_ARCH_X86_64
);
1793 if (r
< 0 && r
!= -EEXIST
)
1797 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1801 r
= seccomp_load(seccomp
);
1802 if (ERRNO_IS_SECCOMP_FATAL(r
))
1805 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1810 int parse_syscall_archs(char **l
, Set
**ret_archs
) {
1811 _cleanup_set_free_ Set
*archs
= NULL
;
1818 STRV_FOREACH(s
, l
) {
1821 r
= seccomp_arch_from_string(*s
, &a
);
1825 r
= set_ensure_put(&archs
, NULL
, UINT32_TO_PTR(a
+ 1));
1830 *ret_archs
= TAKE_PTR(archs
);
1834 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1840 NULSTR_FOREACH(i
, set
->value
) {
1843 const SyscallFilterSet
*more
;
1845 more
= syscall_filter_set_find(i
);
1849 r
= seccomp_filter_set_add(filter
, add
, more
);
1855 id
= seccomp_syscall_resolve_name(i
);
1856 if (id
== __NR_SCMP_ERROR
) {
1857 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1862 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1866 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1873 int seccomp_lock_personality(unsigned long personality
) {
1877 if (personality
>= PERSONALITY_INVALID
)
1880 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1881 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1883 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1887 r
= seccomp_rule_add_exact(
1889 SCMP_ACT_ERRNO(EPERM
),
1890 SCMP_SYS(personality
),
1892 SCMP_A0(SCMP_CMP_NE
, personality
));
1894 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1898 r
= seccomp_load(seccomp
);
1899 if (ERRNO_IS_SECCOMP_FATAL(r
))
1902 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
/* Prevents the process from changing the host name or domain name.
 *
 * For every local architecture a filter with default action SCMP_ACT_ALLOW is initialized, rules are
 * added that make sethostname() and setdomainname() fail with EPERM, and the filter is loaded. Per
 * the log messages, a failure to add either rule or to load the filter for one architecture is
 * reported at debug level and that architecture is skipped; load results matching
 * ERRNO_IS_SECCOMP_FATAL() are checked separately.
 *
 * NOTE(review): declarations, the guards around the log calls and the return statements are not
 * visible in this excerpt. */
1908 int seccomp_protect_hostname(void) {
1912 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1913 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1915 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1919 r
= seccomp_rule_add_exact(
1921 SCMP_ACT_ERRNO(EPERM
),
1922 SCMP_SYS(sethostname
),
1925 log_debug_errno(r
, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1929 r
= seccomp_rule_add_exact(
1931 SCMP_ACT_ERRNO(EPERM
),
1932 SCMP_SYS(setdomainname
),
1935 log_debug_errno(r
, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1939 r
= seccomp_load(seccomp
);
1940 if (ERRNO_IS_SECCOMP_FATAL(r
))
1943 log_debug_errno(r
, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
/* Adds rules to the filter context `seccomp` that reject, with EPERM, file-creating and mode-changing
 * system calls whose mode argument has all bits of `m` set (SCMP_CMP_MASKED_EQ with mask and value
 * both equal to `m`).
 *
 * Position of the mode argument as matched by the rules below:
 *   - argument 1: chmod, fchmod, mkdir, mknod, creat
 *   - argument 2: fchmodat, mkdirat, mknodat
 *   - open:   argument 1 must contain O_CREAT and argument 2 must match `m`
 *             (only compiled when SCMP_SYS(open) > 0)
 *   - openat: argument 2 must contain O_CREAT and argument 3 must match `m`
 *   - openat2 (only when __SNR_openat2 is defined): rejected regardless of arguments, see the
 *     in-body comment for the reason
 *
 * Each rule failure is logged at debug level. NOTE(review): the system call names passed to
 * seccomp_rule_add_exact() and the bookkeeping that produces the return value are not visible in
 * this excerpt; the system call attribution above is taken from the adjacent log messages. */
1949 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp
, mode_t m
) {
1950 /* Checks the mode_t parameter of the following system calls:
1952 * → chmod() + fchmod() + fchmodat()
1953 * → open() + creat() + openat()
1954 * → mkdir() + mkdirat()
1955 * → mknod() + mknodat()
1957 * Returns error if *everything* failed, and 0 otherwise.
1962 r
= seccomp_rule_add_exact(
1964 SCMP_ACT_ERRNO(EPERM
),
1967 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1969 log_debug_errno(r
, "Failed to add filter for chmod: %m");
1973 r
= seccomp_rule_add_exact(
1975 SCMP_ACT_ERRNO(EPERM
),
1978 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1980 log_debug_errno(r
, "Failed to add filter for fchmod: %m");
1984 r
= seccomp_rule_add_exact(
1986 SCMP_ACT_ERRNO(EPERM
),
1989 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
1991 log_debug_errno(r
, "Failed to add filter for fchmodat: %m");
1995 r
= seccomp_rule_add_exact(
1997 SCMP_ACT_ERRNO(EPERM
),
2000 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2002 log_debug_errno(r
, "Failed to add filter for mkdir: %m");
2006 r
= seccomp_rule_add_exact(
2008 SCMP_ACT_ERRNO(EPERM
),
2011 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2013 log_debug_errno(r
, "Failed to add filter for mkdirat: %m");
2017 r
= seccomp_rule_add_exact(
2019 SCMP_ACT_ERRNO(EPERM
),
2022 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2024 log_debug_errno(r
, "Failed to add filter for mknod: %m");
2028 r
= seccomp_rule_add_exact(
2030 SCMP_ACT_ERRNO(EPERM
),
2033 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2035 log_debug_errno(r
, "Failed to add filter for mknodat: %m");
2039 #if SCMP_SYS(open) > 0
2040 r
= seccomp_rule_add_exact(
2042 SCMP_ACT_ERRNO(EPERM
),
2045 SCMP_A1(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2046 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2048 log_debug_errno(r
, "Failed to add filter for open: %m");
2053 r
= seccomp_rule_add_exact(
2055 SCMP_ACT_ERRNO(EPERM
),
2058 SCMP_A2(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2059 SCMP_A3(SCMP_CMP_MASKED_EQ
, m
, m
));
2061 log_debug_errno(r
, "Failed to add filter for openat: %m");
2065 #if defined(__SNR_openat2)
2066 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2067 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2068 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2069 * compatible with kernels that are not absolutely recent. */
2070 r
= seccomp_rule_add_exact(
2072 SCMP_ACT_ERRNO(EPERM
),
2076 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2081 r
= seccomp_rule_add_exact(
2083 SCMP_ACT_ERRNO(EPERM
),
2086 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2088 log_debug_errno(r
, "Failed to add filter for creat: %m");
/* Prevents the process from creating files with, or changing files to have, the set-user-ID or
 * set-group-ID bit.
 *
 * For every local architecture a filter with default action SCMP_ACT_ALLOW is initialized,
 * seccomp_restrict_sxid() is applied once for S_ISUID (result in `r`) and once for S_ISGID (result in
 * `k`), and the filter is loaded. Failures of either helper call are logged at debug level and
 * ignored; per the log message a load failure for one architecture is skipped, while load results
 * matching ERRNO_IS_SECCOMP_FATAL() are checked separately.
 *
 * NOTE(review): declarations, the guards around the log calls and the return statements are not
 * visible in this excerpt. */
2095 int seccomp_restrict_suid_sgid(void) {
2099 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2100 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2102 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2106 r
= seccomp_restrict_sxid(seccomp
, S_ISUID
);
2108 log_debug_errno(r
, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2110 k
= seccomp_restrict_sxid(seccomp
, S_ISGID
);
/* Log the S_ISGID call's own result (k); r still holds the outcome of the S_ISUID call, so passing
 * it here would make %m report the wrong error. */
2112 log_debug_errno(k
, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2117 r
= seccomp_load(seccomp
);
2118 if (ERRNO_IS_SECCOMP_FATAL(r
))
2121 log_debug_errno(r
, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2127 uint32_t scmp_act_kill_process(void) {
2129 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2130 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2131 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2132 * for single-threaded apps does the right thing. */
2134 #ifdef SCMP_ACT_KILL_PROCESS
2135 if (seccomp_api_get() >= 3)
2136 return SCMP_ACT_KILL_PROCESS
;
2139 return SCMP_ACT_KILL
; /* same as SCMP_ACT_KILL_THREAD */