1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include <linux/seccomp.h>
14 #include "alloc-util.h"
16 #include "errno-list.h"
19 #include "nulstr-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
23 #include "string-util.h"
26 /* This array will be modified at runtime as seccomp_restrict_archs is called. */
27 uint32_t seccomp_local_archs
[] = {
29 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
31 #if defined(__x86_64__) && defined(__ILP32__)
34 SCMP_ARCH_X32
, /* native */
35 #elif defined(__x86_64__) && !defined(__ILP32__)
38 SCMP_ARCH_X86_64
, /* native */
39 #elif defined(__i386__)
41 #elif defined(__aarch64__)
43 SCMP_ARCH_AARCH64
, /* native */
44 #elif defined(__arm__)
46 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
48 SCMP_ARCH_MIPS
, /* native */
49 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
51 SCMP_ARCH_MIPSEL
, /* native */
52 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
55 SCMP_ARCH_MIPSEL64N32
,
58 SCMP_ARCH_MIPS64
, /* native */
59 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
63 SCMP_ARCH_MIPSEL64N32
,
65 SCMP_ARCH_MIPSEL64
, /* native */
66 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
71 SCMP_ARCH_MIPSEL64N32
,
72 SCMP_ARCH_MIPS64N32
, /* native */
73 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
79 SCMP_ARCH_MIPSEL64N32
, /* native */
80 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
83 SCMP_ARCH_PPC64
, /* native */
84 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
87 SCMP_ARCH_PPC64LE
, /* native */
88 #elif defined(__powerpc__)
90 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
92 #elif defined(__s390x__)
94 SCMP_ARCH_S390X
, /* native */
95 #elif defined(__s390__)
98 SECCOMP_LOCAL_ARCH_END
101 const char* seccomp_arch_to_string(uint32_t c
) {
102 /* Maintain order used in <seccomp.h>.
104 * Names used here should be the same as those used for ConditionArchitecture=,
105 * except for "subarchitectures" like x32. */
108 case SCMP_ARCH_NATIVE
:
112 case SCMP_ARCH_X86_64
:
118 case SCMP_ARCH_AARCH64
:
122 case SCMP_ARCH_MIPS64
:
124 case SCMP_ARCH_MIPS64N32
:
126 case SCMP_ARCH_MIPSEL
:
128 case SCMP_ARCH_MIPSEL64
:
130 case SCMP_ARCH_MIPSEL64N32
:
131 return "mips64-le-n32";
134 case SCMP_ARCH_PPC64
:
136 case SCMP_ARCH_PPC64LE
:
138 #ifdef SCMP_ARCH_RISCV64
139 case SCMP_ARCH_RISCV64
:
144 case SCMP_ARCH_S390X
:
151 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
157 if (streq(n
, "native"))
158 *ret
= SCMP_ARCH_NATIVE
;
159 else if (streq(n
, "x86"))
160 *ret
= SCMP_ARCH_X86
;
161 else if (streq(n
, "x86-64"))
162 *ret
= SCMP_ARCH_X86_64
;
163 else if (streq(n
, "x32"))
164 *ret
= SCMP_ARCH_X32
;
165 else if (streq(n
, "arm"))
166 *ret
= SCMP_ARCH_ARM
;
167 else if (streq(n
, "arm64"))
168 *ret
= SCMP_ARCH_AARCH64
;
169 else if (streq(n
, "mips"))
170 *ret
= SCMP_ARCH_MIPS
;
171 else if (streq(n
, "mips64"))
172 *ret
= SCMP_ARCH_MIPS64
;
173 else if (streq(n
, "mips64-n32"))
174 *ret
= SCMP_ARCH_MIPS64N32
;
175 else if (streq(n
, "mips-le"))
176 *ret
= SCMP_ARCH_MIPSEL
;
177 else if (streq(n
, "mips64-le"))
178 *ret
= SCMP_ARCH_MIPSEL64
;
179 else if (streq(n
, "mips64-le-n32"))
180 *ret
= SCMP_ARCH_MIPSEL64N32
;
181 else if (streq(n
, "ppc"))
182 *ret
= SCMP_ARCH_PPC
;
183 else if (streq(n
, "ppc64"))
184 *ret
= SCMP_ARCH_PPC64
;
185 else if (streq(n
, "ppc64-le"))
186 *ret
= SCMP_ARCH_PPC64LE
;
187 #ifdef SCMP_ARCH_RISCV64
188 else if (streq(n
, "riscv64"))
189 *ret
= SCMP_ARCH_RISCV64
;
191 else if (streq(n
, "s390"))
192 *ret
= SCMP_ARCH_S390
;
193 else if (streq(n
, "s390x"))
194 *ret
= SCMP_ARCH_S390X
;
201 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
202 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
205 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
206 * any others. Also, turns off the NNP fiddling. */
208 seccomp
= seccomp_init(default_action
);
212 if (arch
!= SCMP_ARCH_NATIVE
&&
213 arch
!= seccomp_arch_native()) {
215 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
219 r
= seccomp_arch_add(seccomp
, arch
);
223 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
224 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
225 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
227 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
228 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
231 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
235 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
239 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
240 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
241 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_LOG
, 1);
243 log_debug_errno(r
, "Failed to enable seccomp event logging: %m");
247 *ret
= TAKE_PTR(seccomp
);
251 static bool is_basic_seccomp_available(void) {
252 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
255 static bool is_seccomp_filter_available(void) {
256 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
260 bool is_seccomp_available(void) {
261 static int cached_enabled
= -1;
263 if (cached_enabled
< 0) {
266 b
= getenv_bool_secure("SYSTEMD_SECCOMP");
268 if (b
< 0 && b
!= -ENXIO
) /* ENXIO: env var unset */
269 log_debug_errno(b
, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");
272 is_basic_seccomp_available() &&
273 is_seccomp_filter_available();
275 cached_enabled
= false;
278 return cached_enabled
;
281 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
282 [SYSCALL_FILTER_SET_DEFAULT
] = {
284 .help
= "System calls that are always permitted",
289 "clock_getres_time64\0"
293 "clock_nanosleep_time64\0"
318 "getrlimit\0" /* make sure processes can query stack size and such */
334 "sched_getaffinity\0"
344 [SYSCALL_FILTER_SET_AIO
] = {
346 .help
= "Asynchronous IO",
352 "io_pgetevents_time64\0"
356 "io_uring_register\0"
359 [SYSCALL_FILTER_SET_BASIC_IO
] = {
381 [SYSCALL_FILTER_SET_CHOWN
] = {
383 .help
= "Change ownership of files and directories",
393 [SYSCALL_FILTER_SET_CLOCK
] = {
395 .help
= "Change the system time",
404 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
405 .name
= "@cpu-emulation",
406 .help
= "System calls for CPU emulation functionality",
414 [SYSCALL_FILTER_SET_DEBUG
] = {
416 .help
= "Debugging, performance monitoring and tracing functionality",
423 #if defined __s390__ || defined __s390x__
424 "s390_runtime_instr\0"
426 "sys_debug_setcontext\0"
428 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
429 .name
= "@file-system",
430 .help
= "File system operations",
461 "inotify_add_watch\0"
509 [SYSCALL_FILTER_SET_IO_EVENT
] = {
511 .help
= "Event loop system calls",
531 [SYSCALL_FILTER_SET_IPC
] = {
533 .help
= "SysV IPC, POSIX Message Queues or other IPC",
541 "mq_timedreceive_time64\0"
543 "mq_timedsend_time64\0"
553 "process_vm_writev\0"
558 "semtimedop_time64\0"
564 [SYSCALL_FILTER_SET_KEYRING
] = {
566 .help
= "Kernel keyring access",
572 [SYSCALL_FILTER_SET_MEMLOCK
] = {
574 .help
= "Memory locking control",
582 [SYSCALL_FILTER_SET_MODULE
] = {
584 .help
= "Loading and unloading of kernel modules",
590 [SYSCALL_FILTER_SET_MOUNT
] = {
592 .help
= "Mounting and unmounting of file systems",
607 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
608 .name
= "@network-io",
609 .help
= "Network or Unix socket IO, should not be needed if not network facing",
634 [SYSCALL_FILTER_SET_OBSOLETE
] = {
635 /* some unknown even to libseccomp */
637 .help
= "Unusual, obsolete or unimplemented system calls",
667 [SYSCALL_FILTER_SET_PKEY
] = {
669 .help
= "System calls used for memory protection keys",
675 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
676 .name
= "@privileged",
677 .help
= "All system calls which need super-user capabilities",
693 "open_by_handle_at\0"
706 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
710 [SYSCALL_FILTER_SET_PROCESS
] = {
712 .help
= "Process control, execution, namespacing operations",
715 "capget\0" /* Able to query arbitrary processes */
723 "pidfd_send_signal\0"
726 "rt_tgsigqueueinfo\0"
728 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
738 [SYSCALL_FILTER_SET_RAW_IO
] = {
740 .help
= "Raw I/O port access",
747 #if defined __s390__ || defined __s390x__
748 "s390_pci_mmio_read\0"
749 "s390_pci_mmio_write\0"
752 [SYSCALL_FILTER_SET_REBOOT
] = {
754 .help
= "Reboot and reboot preparation/kexec",
760 [SYSCALL_FILTER_SET_RESOURCES
] = {
761 .name
= "@resources",
762 .help
= "Alter resource settings",
769 "sched_setaffinity\0"
772 "sched_setscheduler\0"
777 [SYSCALL_FILTER_SET_SETUID
] = {
779 .help
= "Operations for changing user/group credentials",
796 [SYSCALL_FILTER_SET_SIGNAL
] = {
798 .help
= "Process signal handling",
805 "rt_sigtimedwait_time64\0"
815 [SYSCALL_FILTER_SET_SWAP
] = {
817 .help
= "Enable/disable swap devices",
822 [SYSCALL_FILTER_SET_SYNC
] = {
824 .help
= "Synchronize files and memory to storage",
834 [SYSCALL_FILTER_SET_SYSTEM_SERVICE
] = {
835 .name
= "@system-service",
836 .help
= "General system service operations",
869 "name_to_handle_at\0"
876 "sched_get_priority_max\0"
877 "sched_get_priority_min\0"
880 "sched_getscheduler\0"
881 "sched_rr_get_interval\0"
882 "sched_rr_get_interval_time64\0"
900 [SYSCALL_FILTER_SET_TIMER
] = {
902 .help
= "Schedule operations by time",
916 "timerfd_gettime64\0"
918 "timerfd_settime64\0"
921 [SYSCALL_FILTER_SET_KNOWN
] = {
923 .help
= "All known syscalls declared in the kernel",
925 #include "syscall-list.h"
929 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
930 if (isempty(name
) || name
[0] != '@')
933 for (unsigned i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
934 if (streq(syscall_filter_sets
[i
].name
, name
))
935 return syscall_filter_sets
+ i
;
940 static int add_syscall_filter_set(
941 scmp_filter_ctx seccomp
,
942 const SyscallFilterSet
*set
,
948 int seccomp_add_syscall_filter_item(
949 scmp_filter_ctx
*seccomp
,
959 if (strv_contains(exclude
, name
))
962 /* Any syscalls that are handled are added to the *added strv. The pointer
963 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
965 if (name
[0] == '@') {
966 const SyscallFilterSet
*other
;
968 other
= syscall_filter_set_find(name
);
970 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
971 "Filter set %s is not known!",
974 return add_syscall_filter_set(seccomp
, other
, action
, exclude
, log_missing
, added
);
979 id
= seccomp_syscall_resolve_name(name
);
980 if (id
== __NR_SCMP_ERROR
) {
982 log_debug("System call %s is not known, ignoring.", name
);
986 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
988 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
989 bool ignore
= r
== -EDOM
;
991 if (!ignore
|| log_missing
)
992 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
993 name
, id
, ignore
? ", ignoring" : "");
999 r
= strv_extend(added
, name
);
1008 static int add_syscall_filter_set(
1009 scmp_filter_ctx seccomp
,
1010 const SyscallFilterSet
*set
,
1019 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1024 NULSTR_FOREACH(sys
, set
->value
) {
1025 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
, log_missing
, added
);
1033 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
, bool log_missing
) {
1039 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1040 * each local arch. */
1042 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1043 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1045 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1047 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1051 r
= add_syscall_filter_set(seccomp
, set
, action
, NULL
, log_missing
, NULL
);
1053 return log_debug_errno(r
, "Failed to add filter set: %m");
1055 r
= seccomp_load(seccomp
);
1056 if (ERRNO_IS_SECCOMP_FATAL(r
))
1059 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1065 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* filter
, uint32_t action
, bool log_missing
) {
1069 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1070 * of a SyscallFilterSet* table. */
1072 if (hashmap_isempty(filter
) && default_action
== SCMP_ACT_ALLOW
)
1075 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1076 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1077 void *syscall_id
, *val
;
1079 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1081 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1085 HASHMAP_FOREACH_KEY(val
, syscall_id
, filter
) {
1086 uint32_t a
= action
;
1087 int id
= PTR_TO_INT(syscall_id
) - 1;
1088 int error
= PTR_TO_INT(val
);
1090 if (error
== SECCOMP_ERROR_NUMBER_KILL
)
1091 a
= scmp_act_kill_process();
1093 else if (action
== SCMP_ACT_LOG
)
1096 else if (error
>= 0)
1097 a
= SCMP_ACT_ERRNO(error
);
1099 r
= seccomp_rule_add_exact(seccomp
, a
, id
, 0);
1101 /* If the system call is not known on this architecture, then that's
1102 * fine, let's ignore it */
1103 _cleanup_free_
char *n
= NULL
;
1106 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, id
);
1107 ignore
= r
== -EDOM
;
1108 if (!ignore
|| log_missing
)
1109 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1110 strna(n
), id
, ignore
? ", ignoring" : "");
1116 r
= seccomp_load(seccomp
);
1117 if (ERRNO_IS_SECCOMP_FATAL(r
))
1120 log_debug_errno(r
, "Failed to install systemc call filter for architecture %s, skipping: %m",
1121 seccomp_arch_to_string(arch
));
1127 int seccomp_parse_syscall_filter(
1131 SeccompParseFlags flags
,
1133 const char *filename
,
1141 if (!FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
) && errno_num
>= 0)
1144 if (name
[0] == '@') {
1145 const SyscallFilterSet
*set
;
1148 set
= syscall_filter_set_find(name
);
1150 if (!FLAGS_SET(flags
, SECCOMP_PARSE_PERMISSIVE
))
1153 log_syntax(unit
, FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1154 "Unknown system call group, ignoring: %s", name
);
1158 NULSTR_FOREACH(i
, set
->value
) {
1159 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1160 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1161 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1163 r
= seccomp_parse_syscall_filter(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
1170 id
= seccomp_syscall_resolve_name(name
);
1171 if (id
== __NR_SCMP_ERROR
) {
1172 if (!FLAGS_SET(flags
, SECCOMP_PARSE_PERMISSIVE
))
1175 log_syntax(unit
, FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1176 "Failed to parse system call, ignoring: %s", name
);
1180 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1181 * it from the list. The entries in allow-list with non-negative error value will be
1182 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1183 if (!FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
) == FLAGS_SET(flags
, SECCOMP_PARSE_ALLOW_LIST
) ||
1184 (FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
| SECCOMP_PARSE_ALLOW_LIST
) && errno_num
>= 0)) {
1185 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
1189 return FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? log_oom() : -ENOMEM
;
1191 assert_se(hashmap_update(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
)) == 0);
1197 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1203 int seccomp_restrict_namespaces(unsigned long retain
) {
1207 if (DEBUG_LOGGING
) {
1208 _cleanup_free_
char *s
= NULL
;
1210 (void) namespace_flags_to_string(retain
, &s
);
1211 log_debug("Restricting namespace to: %s.", strna(s
));
1215 if (FLAGS_SET(retain
, NAMESPACE_FLAGS_ALL
))
1218 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1219 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1221 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1223 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1227 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1228 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1230 r
= seccomp_rule_add_exact(
1232 SCMP_ACT_ERRNO(EPERM
),
1236 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1237 * special invocation with a zero flags argument, right here. */
1238 r
= seccomp_rule_add_exact(
1240 SCMP_ACT_ERRNO(EPERM
),
1243 SCMP_A1(SCMP_CMP_EQ
, 0));
1245 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1249 for (unsigned i
= 0; namespace_flag_map
[i
].name
; i
++) {
1252 f
= namespace_flag_map
[i
].flag
;
1253 if (FLAGS_SET(retain
, f
)) {
1254 log_debug("Permitting %s.", namespace_flag_map
[i
].name
);
1258 log_debug("Blocking %s.", namespace_flag_map
[i
].name
);
1260 r
= seccomp_rule_add_exact(
1262 SCMP_ACT_ERRNO(EPERM
),
1265 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1267 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1271 /* On s390/s390x the first two parameters to clone are switched */
1272 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1273 r
= seccomp_rule_add_exact(
1275 SCMP_ACT_ERRNO(EPERM
),
1278 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1280 r
= seccomp_rule_add_exact(
1282 SCMP_ACT_ERRNO(EPERM
),
1285 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1287 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1291 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1292 r
= seccomp_rule_add_exact(
1294 SCMP_ACT_ERRNO(EPERM
),
1297 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1299 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1307 r
= seccomp_load(seccomp
);
1308 if (ERRNO_IS_SECCOMP_FATAL(r
))
1311 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1317 int seccomp_protect_sysctl(void) {
1321 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1322 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1324 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1328 #ifdef SCMP_ARCH_RISCV64
1333 /* No _sysctl syscall */
1336 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1340 r
= seccomp_rule_add_exact(
1342 SCMP_ACT_ERRNO(EPERM
),
1346 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1350 r
= seccomp_load(seccomp
);
1351 if (ERRNO_IS_SECCOMP_FATAL(r
))
1354 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1360 int seccomp_protect_syslog(void) {
1364 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1365 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1367 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1371 r
= seccomp_rule_add_exact(
1373 SCMP_ACT_ERRNO(EPERM
),
1378 log_debug_errno(r
, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1382 r
= seccomp_load(seccomp
);
1383 if (ERRNO_IS_SECCOMP_FATAL(r
))
1386 log_debug_errno(r
, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1392 int seccomp_restrict_address_families(Set
*address_families
, bool allow_list
) {
1396 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1397 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1400 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1404 case SCMP_ARCH_X86_64
:
1407 case SCMP_ARCH_AARCH64
:
1408 case SCMP_ARCH_MIPSEL64N32
:
1409 case SCMP_ARCH_MIPS64N32
:
1410 case SCMP_ARCH_MIPSEL64
:
1411 case SCMP_ARCH_MIPS64
:
1412 #ifdef SCMP_ARCH_RISCV64
1413 case SCMP_ARCH_RISCV64
:
1415 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1419 case SCMP_ARCH_S390
:
1420 case SCMP_ARCH_S390X
:
1422 case SCMP_ARCH_MIPSEL
:
1423 case SCMP_ARCH_MIPS
:
1425 case SCMP_ARCH_PPC64
:
1426 case SCMP_ARCH_PPC64LE
:
1428 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1437 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1442 int first
= 0, last
= 0;
1445 /* If this is an allow list, we first block the address families that are out of
1446 * range and then everything that is not in the set. First, we find the lowest and
1447 * highest address family in the set. */
1449 SET_FOREACH(afp
, address_families
) {
1450 int af
= PTR_TO_INT(afp
);
1452 if (af
<= 0 || af
>= af_max())
1455 if (first
== 0 || af
< first
)
1458 if (last
== 0 || af
> last
)
1462 assert((first
== 0) == (last
== 0));
1466 /* No entries in the valid range, block everything */
1467 r
= seccomp_rule_add_exact(
1469 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1473 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1479 /* Block everything below the first entry */
1480 r
= seccomp_rule_add_exact(
1482 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1485 SCMP_A0(SCMP_CMP_LT
, first
));
1487 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1491 /* Block everything above the last entry */
1492 r
= seccomp_rule_add_exact(
1494 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1497 SCMP_A0(SCMP_CMP_GT
, last
));
1499 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1503 /* Block everything between the first and last entry */
1504 for (int af
= 1; af
< af_max(); af
++) {
1506 if (set_contains(address_families
, INT_TO_PTR(af
)))
1509 r
= seccomp_rule_add_exact(
1511 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1514 SCMP_A0(SCMP_CMP_EQ
, af
));
1519 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1527 /* If this is a deny list, then generate one rule for each address family; the rules are
1528 * then combined in OR checks. */
1530 SET_FOREACH(af
, address_families
) {
1531 r
= seccomp_rule_add_exact(
1533 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1536 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1541 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1546 r
= seccomp_load(seccomp
);
1547 if (ERRNO_IS_SECCOMP_FATAL(r
))
1550 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1556 int seccomp_restrict_realtime(void) {
1557 static const int permitted_policies
[] = {
1563 int r
, max_policy
= 0;
1567 /* Determine the highest policy constant we want to allow */
1568 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1569 if (permitted_policies
[i
] > max_policy
)
1570 max_policy
= permitted_policies
[i
];
1572 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1573 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1576 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1578 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1582 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1584 for (p
= 0; p
< max_policy
; p
++) {
1587 /* Check if this is in the allow list. */
1588 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1589 if (permitted_policies
[i
] == p
) {
1597 /* Deny this policy */
1598 r
= seccomp_rule_add_exact(
1600 SCMP_ACT_ERRNO(EPERM
),
1601 SCMP_SYS(sched_setscheduler
),
1603 SCMP_A1(SCMP_CMP_EQ
, p
));
1605 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1610 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1611 * are unsigned here, hence no need to check for < 0 values. */
1612 r
= seccomp_rule_add_exact(
1614 SCMP_ACT_ERRNO(EPERM
),
1615 SCMP_SYS(sched_setscheduler
),
1617 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1619 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1623 r
= seccomp_load(seccomp
);
1624 if (ERRNO_IS_SECCOMP_FATAL(r
))
1627 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1633 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1637 const struct scmp_arg_cmp arg
) {
1640 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1642 _cleanup_free_
char *n
= NULL
;
1644 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1645 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1647 seccomp_arch_to_string(arch
));
1653 /* For known architectures, check that syscalls are indeed defined or not. */
1654 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1655 assert_cc(SCMP_SYS(shmget
) > 0);
1656 assert_cc(SCMP_SYS(shmat
) > 0);
1657 assert_cc(SCMP_SYS(shmdt
) > 0);
1660 int seccomp_memory_deny_write_execute(void) {
1662 unsigned loaded
= 0;
1664 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1665 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1666 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0, r
;
1668 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1672 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1673 * We ignore that here, which means there's still a way to get writable/executable
1674 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1677 case SCMP_ARCH_S390
:
1678 filter_syscall
= SCMP_SYS(mmap2
);
1679 block_syscall
= SCMP_SYS(mmap
);
1680 /* shmat multiplexed, see above */
1684 case SCMP_ARCH_PPC64
:
1685 case SCMP_ARCH_PPC64LE
:
1686 case SCMP_ARCH_S390X
:
1687 filter_syscall
= SCMP_SYS(mmap
);
1688 /* shmat multiplexed, see above */
1692 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1693 shmat_syscall
= SCMP_SYS(shmat
);
1696 case SCMP_ARCH_X86_64
:
1698 case SCMP_ARCH_AARCH64
:
1699 #ifdef SCMP_ARCH_RISCV64
1700 case SCMP_ARCH_RISCV64
:
1702 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, arm64 and riscv64 have only mmap */
1703 shmat_syscall
= SCMP_SYS(shmat
);
1706 /* Please add more definitions here, if you port systemd to other architectures! */
1708 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1709 #warning "Consider adding the right mmap() syscall definitions here!"
1713 /* Can't filter mmap() on this arch, then skip it */
1714 if (filter_syscall
== 0)
1717 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1721 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1723 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1727 if (block_syscall
!= 0) {
1728 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1733 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1735 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1739 #ifdef __NR_pkey_mprotect
1740 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1742 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1747 if (shmat_syscall
> 0) {
1748 r
= add_seccomp_syscall_filter(seccomp
, arch
, shmat_syscall
,
1750 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1755 r
= seccomp_load(seccomp
);
1756 if (ERRNO_IS_SECCOMP_FATAL(r
))
1759 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1760 seccomp_arch_to_string(arch
));
1765 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1770 int seccomp_restrict_archs(Set
*archs
) {
1771 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1773 bool blocked_new
= false;
1775 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1778 * There are some qualifications. However the most important use is to stop processes from bypassing
1779 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1780 * in a non-native architecture. There are no holes in this use case, at least so far. */
1782 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1783 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1784 * to run a program with the restrictions applied. */
1785 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1789 for (unsigned i
= 0; seccomp_local_archs
[i
] != SECCOMP_LOCAL_ARCH_END
; ++i
) {
1790 uint32_t arch
= seccomp_local_archs
[i
];
1792 /* See above comment, our "native" architecture is never blocked. */
1793 if (arch
== seccomp_arch_native())
1796 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1797 if (arch
== SECCOMP_LOCAL_ARCH_BLOCKED
)
1800 bool block
= !set_contains(archs
, UINT32_TO_PTR(arch
+ 1));
1802 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1803 * syscalls should basically match x86-64 for everything except the pointer type.
1804 * The important thing is that you can block the old 32-bit x86 syscalls.
1805 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1806 if (block
&& arch
== SCMP_ARCH_X86_64
&& seccomp_arch_native() == SCMP_ARCH_X32
)
1807 block
= !set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1));
1810 seccomp_local_archs
[i
] = SECCOMP_LOCAL_ARCH_BLOCKED
;
1813 r
= seccomp_arch_add(seccomp
, arch
);
1814 if (r
< 0 && r
!= -EEXIST
)
1819 /* All architectures that will be blocked by the seccomp program were
1820 * already blocked. */
1824 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1828 r
= seccomp_load(seccomp
);
1829 if (ERRNO_IS_SECCOMP_FATAL(r
))
1832 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1837 int parse_syscall_archs(char **l
, Set
**ret_archs
) {
1838 _cleanup_set_free_ Set
*archs
= NULL
;
1845 STRV_FOREACH(s
, l
) {
1848 r
= seccomp_arch_from_string(*s
, &a
);
1852 r
= set_ensure_put(&archs
, NULL
, UINT32_TO_PTR(a
+ 1));
1857 *ret_archs
= TAKE_PTR(archs
);
1861 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1867 NULSTR_FOREACH(i
, set
->value
) {
1870 const SyscallFilterSet
*more
;
1872 more
= syscall_filter_set_find(i
);
1876 r
= seccomp_filter_set_add(filter
, add
, more
);
1882 id
= seccomp_syscall_resolve_name(i
);
1883 if (id
== __NR_SCMP_ERROR
) {
1884 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1889 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1893 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1900 int seccomp_lock_personality(unsigned long personality
) {
1904 if (personality
>= PERSONALITY_INVALID
)
1907 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1908 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1910 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1914 r
= seccomp_rule_add_exact(
1916 SCMP_ACT_ERRNO(EPERM
),
1917 SCMP_SYS(personality
),
1919 SCMP_A0(SCMP_CMP_NE
, personality
));
1921 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1925 r
= seccomp_load(seccomp
);
1926 if (ERRNO_IS_SECCOMP_FATAL(r
))
1929 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1935 int seccomp_protect_hostname(void) {
1939 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1940 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1942 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1946 r
= seccomp_rule_add_exact(
1948 SCMP_ACT_ERRNO(EPERM
),
1949 SCMP_SYS(sethostname
),
1952 log_debug_errno(r
, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1956 r
= seccomp_rule_add_exact(
1958 SCMP_ACT_ERRNO(EPERM
),
1959 SCMP_SYS(setdomainname
),
1962 log_debug_errno(r
, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1966 r
= seccomp_load(seccomp
);
1967 if (ERRNO_IS_SECCOMP_FATAL(r
))
1970 log_debug_errno(r
, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1976 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp
, mode_t m
) {
1977 /* Checks the mode_t parameter of the following system calls:
1979 * → chmod() + fchmod() + fchmodat()
1980 * → open() + creat() + openat()
1981 * → mkdir() + mkdirat()
1982 * → mknod() + mknodat()
1984 * Returns error if *everything* failed, and 0 otherwise.
1989 r
= seccomp_rule_add_exact(
1991 SCMP_ACT_ERRNO(EPERM
),
1994 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
1996 log_debug_errno(r
, "Failed to add filter for chmod: %m");
2000 r
= seccomp_rule_add_exact(
2002 SCMP_ACT_ERRNO(EPERM
),
2005 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2007 log_debug_errno(r
, "Failed to add filter for fchmod: %m");
2011 r
= seccomp_rule_add_exact(
2013 SCMP_ACT_ERRNO(EPERM
),
2016 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2018 log_debug_errno(r
, "Failed to add filter for fchmodat: %m");
2022 r
= seccomp_rule_add_exact(
2024 SCMP_ACT_ERRNO(EPERM
),
2027 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2029 log_debug_errno(r
, "Failed to add filter for mkdir: %m");
2033 r
= seccomp_rule_add_exact(
2035 SCMP_ACT_ERRNO(EPERM
),
2038 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2040 log_debug_errno(r
, "Failed to add filter for mkdirat: %m");
2044 r
= seccomp_rule_add_exact(
2046 SCMP_ACT_ERRNO(EPERM
),
2049 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2051 log_debug_errno(r
, "Failed to add filter for mknod: %m");
2055 r
= seccomp_rule_add_exact(
2057 SCMP_ACT_ERRNO(EPERM
),
2060 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2062 log_debug_errno(r
, "Failed to add filter for mknodat: %m");
2066 #if SCMP_SYS(open) > 0
2067 r
= seccomp_rule_add_exact(
2069 SCMP_ACT_ERRNO(EPERM
),
2072 SCMP_A1(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2073 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2075 log_debug_errno(r
, "Failed to add filter for open: %m");
2080 r
= seccomp_rule_add_exact(
2082 SCMP_ACT_ERRNO(EPERM
),
2085 SCMP_A2(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2086 SCMP_A3(SCMP_CMP_MASKED_EQ
, m
, m
));
2088 log_debug_errno(r
, "Failed to add filter for openat: %m");
2092 #if defined(__SNR_openat2)
2093 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2094 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2095 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2096 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2097 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2098 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2099 r
= seccomp_rule_add_exact(
2101 SCMP_ACT_ERRNO(ENOSYS
),
2105 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2110 r
= seccomp_rule_add_exact(
2112 SCMP_ACT_ERRNO(EPERM
),
2115 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2117 log_debug_errno(r
, "Failed to add filter for creat: %m");
2124 int seccomp_restrict_suid_sgid(void) {
2128 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2129 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2131 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2135 r
= seccomp_restrict_sxid(seccomp
, S_ISUID
);
2137 log_debug_errno(r
, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2139 k
= seccomp_restrict_sxid(seccomp
, S_ISGID
);
2141 log_debug_errno(r
, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2146 r
= seccomp_load(seccomp
);
2147 if (ERRNO_IS_SECCOMP_FATAL(r
))
2150 log_debug_errno(r
, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2156 uint32_t scmp_act_kill_process(void) {
2158 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2159 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2160 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2161 * for single-threaded apps does the right thing. */
2163 #ifdef SCMP_ACT_KILL_PROCESS
2164 if (seccomp_api_get() >= 3)
2165 return SCMP_ACT_KILL_PROCESS
;
2168 return SCMP_ACT_KILL
; /* same as SCMP_ACT_KILL_THREAD */
2171 int parse_syscall_and_errno(const char *in
, char **name
, int *error
) {
2172 _cleanup_free_
char *n
= NULL
;
2181 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2182 * If errno is omitted, then error is set to -1.
2183 * Empty syscall name is not allowed.
2184 * Here, we do not check that the syscall name is valid or not.
2187 p
= strchr(in
, ':');
2189 e
= seccomp_parse_errno_or_action(p
+ 1);
2193 n
= strndup(in
, p
- in
);
2204 *name
= TAKE_PTR(n
);
2209 static int block_open_flag(scmp_filter_ctx seccomp
, int flag
) {
2213 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2214 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2216 #if SCMP_SYS(open) > 0
2217 r
= seccomp_rule_add_exact(
2219 SCMP_ACT_ERRNO(EINVAL
),
2222 SCMP_A1(SCMP_CMP_MASKED_EQ
, flag
, flag
));
2224 log_debug_errno(r
, "Failed to add filter for open: %m");
2229 r
= seccomp_rule_add_exact(
2231 SCMP_ACT_ERRNO(EINVAL
),
2234 SCMP_A2(SCMP_CMP_MASKED_EQ
, flag
, flag
));
2236 log_debug_errno(r
, "Failed to add filter for openat: %m");
2240 #if defined(__SNR_openat2)
2241 /* The new openat2() system call can't be filtered sensibly, see above. */
2242 r
= seccomp_rule_add_exact(
2244 SCMP_ACT_ERRNO(ENOSYS
),
2248 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2256 int seccomp_suppress_sync(void) {
2260 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2261 * manageable, and also masks O_SYNC/O_DSYNC */
2263 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2264 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2267 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2271 NULSTR_FOREACH(c
, syscall_filter_sets
[SYSCALL_FILTER_SET_SYNC
].value
) {
2274 id
= seccomp_syscall_resolve_name(c
);
2275 if (id
== __NR_SCMP_ERROR
) {
2276 log_debug("System call %s is not known, ignoring.", c
);
2280 r
= seccomp_rule_add_exact(
2282 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2286 log_debug_errno(r
, "Failed to add filter for system call %s, ignoring: %m", c
);
2289 (void) block_open_flag(seccomp
, O_SYNC
);
2290 #if O_DSYNC != O_SYNC
2291 (void) block_open_flag(seccomp
, O_DSYNC
);
2294 r
= seccomp_load(seccomp
);
2295 if (ERRNO_IS_SECCOMP_FATAL(r
))
2298 log_debug_errno(r
, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));