1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
5 #include <linux/seccomp.h>
12 /* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
17 #include "alloc-util.h"
19 #include "errno-list.h"
21 #include "namespace-util.h"
23 #include "nulstr-util.h"
24 #include "process-util.h"
25 #include "seccomp-util.h"
27 #include "string-util.h"
30 /* This array will be modified at runtime as seccomp_restrict_archs is called. */
31 uint32_t seccomp_local_archs
[] = {
33 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
35 #if defined(__x86_64__) && defined(__ILP32__)
38 SCMP_ARCH_X32
, /* native */
39 #elif defined(__x86_64__) && !defined(__ILP32__)
42 SCMP_ARCH_X86_64
, /* native */
43 #elif defined(__i386__)
45 #elif defined(__aarch64__)
47 SCMP_ARCH_AARCH64
, /* native */
48 #elif defined(__arm__)
50 #elif defined(__loongarch_lp64)
51 SCMP_ARCH_LOONGARCH64
,
52 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
54 SCMP_ARCH_MIPS
, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
57 SCMP_ARCH_MIPSEL
, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
61 SCMP_ARCH_MIPSEL64N32
,
64 SCMP_ARCH_MIPS64
, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
69 SCMP_ARCH_MIPSEL64N32
,
71 SCMP_ARCH_MIPSEL64
, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
77 SCMP_ARCH_MIPSEL64N32
,
78 SCMP_ARCH_MIPS64N32
, /* native */
79 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
85 SCMP_ARCH_MIPSEL64N32
, /* native */
86 #elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
88 SCMP_ARCH_PARISC64
, /* native */
89 #elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
91 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
94 SCMP_ARCH_PPC64
, /* native */
95 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC64LE
, /* native */
99 #elif defined(__powerpc__)
101 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
103 #elif defined(__s390x__)
105 SCMP_ARCH_S390X
, /* native */
106 #elif defined(__s390__)
109 SECCOMP_LOCAL_ARCH_END
112 const char* seccomp_arch_to_string(uint32_t c
) {
113 /* Maintain order used in <seccomp.h>.
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
119 case SCMP_ARCH_NATIVE
:
123 case SCMP_ARCH_X86_64
:
129 case SCMP_ARCH_AARCH64
:
131 #ifdef SCMP_ARCH_LOONGARCH64
132 case SCMP_ARCH_LOONGARCH64
:
133 return "loongarch64";
137 case SCMP_ARCH_MIPS64
:
139 case SCMP_ARCH_MIPS64N32
:
141 case SCMP_ARCH_MIPSEL
:
143 case SCMP_ARCH_MIPSEL64
:
145 case SCMP_ARCH_MIPSEL64N32
:
146 return "mips64-le-n32";
147 #ifdef SCMP_ARCH_PARISC
148 case SCMP_ARCH_PARISC
:
151 #ifdef SCMP_ARCH_PARISC64
152 case SCMP_ARCH_PARISC64
:
157 case SCMP_ARCH_PPC64
:
159 case SCMP_ARCH_PPC64LE
:
161 #ifdef SCMP_ARCH_RISCV64
162 case SCMP_ARCH_RISCV64
:
167 case SCMP_ARCH_S390X
:
174 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
180 if (streq(n
, "native"))
181 *ret
= SCMP_ARCH_NATIVE
;
182 else if (streq(n
, "x86"))
183 *ret
= SCMP_ARCH_X86
;
184 else if (streq(n
, "x86-64"))
185 *ret
= SCMP_ARCH_X86_64
;
186 else if (streq(n
, "x32"))
187 *ret
= SCMP_ARCH_X32
;
188 else if (streq(n
, "arm"))
189 *ret
= SCMP_ARCH_ARM
;
190 else if (streq(n
, "arm64"))
191 *ret
= SCMP_ARCH_AARCH64
;
192 #ifdef SCMP_ARCH_LOONGARCH64
193 else if (streq(n
, "loongarch64"))
194 *ret
= SCMP_ARCH_LOONGARCH64
;
196 else if (streq(n
, "mips"))
197 *ret
= SCMP_ARCH_MIPS
;
198 else if (streq(n
, "mips64"))
199 *ret
= SCMP_ARCH_MIPS64
;
200 else if (streq(n
, "mips64-n32"))
201 *ret
= SCMP_ARCH_MIPS64N32
;
202 else if (streq(n
, "mips-le"))
203 *ret
= SCMP_ARCH_MIPSEL
;
204 else if (streq(n
, "mips64-le"))
205 *ret
= SCMP_ARCH_MIPSEL64
;
206 else if (streq(n
, "mips64-le-n32"))
207 *ret
= SCMP_ARCH_MIPSEL64N32
;
208 #ifdef SCMP_ARCH_PARISC
209 else if (streq(n
, "parisc"))
210 *ret
= SCMP_ARCH_PARISC
;
212 #ifdef SCMP_ARCH_PARISC64
213 else if (streq(n
, "parisc64"))
214 *ret
= SCMP_ARCH_PARISC64
;
216 else if (streq(n
, "ppc"))
217 *ret
= SCMP_ARCH_PPC
;
218 else if (streq(n
, "ppc64"))
219 *ret
= SCMP_ARCH_PPC64
;
220 else if (streq(n
, "ppc64-le"))
221 *ret
= SCMP_ARCH_PPC64LE
;
222 #ifdef SCMP_ARCH_RISCV64
223 else if (streq(n
, "riscv64"))
224 *ret
= SCMP_ARCH_RISCV64
;
226 else if (streq(n
, "s390"))
227 *ret
= SCMP_ARCH_S390
;
228 else if (streq(n
, "s390x"))
229 *ret
= SCMP_ARCH_S390X
;
236 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
237 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
240 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
241 * any others. Also, turns off the NNP fiddling. */
243 seccomp
= seccomp_init(default_action
);
247 if (arch
!= SCMP_ARCH_NATIVE
&&
248 arch
!= seccomp_arch_native()) {
250 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
254 r
= seccomp_arch_add(seccomp
, arch
);
258 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
259 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
260 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
262 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
263 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
266 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
270 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
274 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
275 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
276 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_LOG
, 1);
278 log_debug_errno(r
, "Failed to enable seccomp event logging: %m");
282 *ret
= TAKE_PTR(seccomp
);
286 static bool is_basic_seccomp_available(void) {
287 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
290 static bool is_seccomp_filter_available(void) {
291 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
295 bool is_seccomp_available(void) {
296 static int cached_enabled
= -1;
298 if (cached_enabled
< 0) {
301 b
= getenv_bool_secure("SYSTEMD_SECCOMP");
303 if (b
< 0 && b
!= -ENXIO
) /* ENXIO: env var unset */
304 log_debug_errno(b
, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");
307 is_basic_seccomp_available() &&
308 is_seccomp_filter_available();
310 cached_enabled
= false;
313 return cached_enabled
;
316 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
317 [SYSCALL_FILTER_SET_DEFAULT
] = {
319 .help
= "System calls that are always permitted",
321 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
325 "clock_getres_time64\0"
329 "clock_nanosleep_time64\0"
355 "getrlimit\0" /* make sure processes can query stack size and such */
370 "riscv_flush_icache\0"
374 "sched_getaffinity\0"
384 [SYSCALL_FILTER_SET_AIO
] = {
386 .help
= "Asynchronous IO",
392 "io_pgetevents_time64\0"
396 "io_uring_register\0"
399 [SYSCALL_FILTER_SET_BASIC_IO
] = {
421 [SYSCALL_FILTER_SET_CHOWN
] = {
423 .help
= "Change ownership of files and directories",
433 [SYSCALL_FILTER_SET_CLOCK
] = {
435 .help
= "Change the system time",
444 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
445 .name
= "@cpu-emulation",
446 .help
= "System calls for CPU emulation functionality",
454 [SYSCALL_FILTER_SET_DEBUG
] = {
456 .help
= "Debugging, performance monitoring and tracing functionality",
463 "s390_runtime_instr\0"
464 "sys_debug_setcontext\0"
466 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
467 .name
= "@file-system",
468 .help
= "File system operations",
499 "inotify_add_watch\0"
547 [SYSCALL_FILTER_SET_IO_EVENT
] = {
549 .help
= "Event loop system calls",
569 [SYSCALL_FILTER_SET_IPC
] = {
571 .help
= "SysV IPC, POSIX Message Queues or other IPC",
579 "mq_timedreceive_time64\0"
581 "mq_timedsend_time64\0"
591 "process_vm_writev\0"
596 "semtimedop_time64\0"
602 [SYSCALL_FILTER_SET_KEYRING
] = {
604 .help
= "Kernel keyring access",
610 [SYSCALL_FILTER_SET_MEMLOCK
] = {
612 .help
= "Memory locking control",
620 [SYSCALL_FILTER_SET_MODULE
] = {
622 .help
= "Loading and unloading of kernel modules",
628 [SYSCALL_FILTER_SET_MOUNT
] = {
630 .help
= "Mounting and unmounting of file systems",
645 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
646 .name
= "@network-io",
647 .help
= "Network or Unix socket IO, should not be needed if not network facing",
672 [SYSCALL_FILTER_SET_OBSOLETE
] = {
673 /* some unknown even to libseccomp */
675 .help
= "Unusual, obsolete or unimplemented system calls",
705 [SYSCALL_FILTER_SET_PKEY
] = {
707 .help
= "System calls used for memory protection keys",
713 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
714 .name
= "@privileged",
715 .help
= "All system calls which need super-user capabilities",
731 "open_by_handle_at\0"
745 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
749 [SYSCALL_FILTER_SET_PROCESS
] = {
751 .help
= "Process control, execution, namespacing operations",
753 "capget\0" /* Able to query arbitrary processes */
755 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
756 * implement seccomp, so we don't need to list it at all. C.f.
757 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
764 "pidfd_send_signal\0"
767 "rt_tgsigqueueinfo\0"
769 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
779 [SYSCALL_FILTER_SET_RAW_IO
] = {
781 .help
= "Raw I/O port access",
788 "s390_pci_mmio_read\0"
789 "s390_pci_mmio_write\0"
791 [SYSCALL_FILTER_SET_REBOOT
] = {
793 .help
= "Reboot and reboot preparation/kexec",
799 [SYSCALL_FILTER_SET_RESOURCES
] = {
800 .name
= "@resources",
801 .help
= "Alter resource settings",
808 "sched_setaffinity\0"
811 "sched_setscheduler\0"
813 "set_mempolicy_home_node\0"
817 [SYSCALL_FILTER_SET_SANDBOX
] = {
819 .help
= "Sandbox functionality",
821 "landlock_add_rule\0"
822 "landlock_create_ruleset\0"
823 "landlock_restrict_self\0"
826 [SYSCALL_FILTER_SET_SETUID
] = {
828 .help
= "Operations for changing user/group credentials",
845 [SYSCALL_FILTER_SET_SIGNAL
] = {
847 .help
= "Process signal handling",
854 "rt_sigtimedwait_time64\0"
864 [SYSCALL_FILTER_SET_SWAP
] = {
866 .help
= "Enable/disable swap devices",
871 [SYSCALL_FILTER_SET_SYNC
] = {
873 .help
= "Synchronize files and memory to storage",
883 [SYSCALL_FILTER_SET_SYSTEM_SERVICE
] = {
884 .name
= "@system-service",
885 .help
= "General system service operations",
918 "name_to_handle_at\0"
925 "sched_get_priority_max\0"
926 "sched_get_priority_min\0"
929 "sched_getscheduler\0"
930 "sched_rr_get_interval\0"
931 "sched_rr_get_interval_time64\0"
949 [SYSCALL_FILTER_SET_TIMER
] = {
951 .help
= "Schedule operations by time",
965 "timerfd_gettime64\0"
967 "timerfd_settime64\0"
970 [SYSCALL_FILTER_SET_KNOWN
] = {
972 .help
= "All known syscalls declared in the kernel",
975 #include "syscall-list.h"
979 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
980 if (isempty(name
) || name
[0] != '@')
983 for (unsigned i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
984 if (streq(syscall_filter_sets
[i
].name
, name
))
985 return syscall_filter_sets
+ i
;
990 static int add_syscall_filter_set(
991 scmp_filter_ctx seccomp
,
992 const SyscallFilterSet
*set
,
998 int seccomp_add_syscall_filter_item(
999 scmp_filter_ctx
*seccomp
,
1009 if (strv_contains(exclude
, name
))
1012 /* Any syscalls that are handled are added to the *added strv. The pointer
1013 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1015 if (name
[0] == '@') {
1016 const SyscallFilterSet
*other
;
1018 other
= syscall_filter_set_find(name
);
1020 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
1021 "Filter set %s is not known!",
1024 return add_syscall_filter_set(seccomp
, other
, action
, exclude
, log_missing
, added
);
1029 id
= seccomp_syscall_resolve_name(name
);
1030 if (id
== __NR_SCMP_ERROR
) {
1032 log_debug("System call %s is not known, ignoring.", name
);
1036 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
1038 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1039 bool ignore
= r
== -EDOM
;
1041 if (!ignore
|| log_missing
)
1042 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1043 name
, id
, ignore
? ", ignoring" : "");
1049 r
= strv_extend(added
, name
);
1058 static int add_syscall_filter_set(
1059 scmp_filter_ctx seccomp
,
1060 const SyscallFilterSet
*set
,
1068 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1073 NULSTR_FOREACH(sys
, set
->value
) {
1074 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
, log_missing
, added
);
1082 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
, bool log_missing
) {
1088 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1089 * each local arch. */
1091 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1092 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1094 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1096 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1100 r
= add_syscall_filter_set(seccomp
, set
, action
, NULL
, log_missing
, NULL
);
1102 return log_debug_errno(r
, "Failed to add filter set: %m");
1104 r
= seccomp_load(seccomp
);
1106 if (ERRNO_IS_SECCOMP_FATAL(r
))
1108 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1115 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* filter
, uint32_t action
, bool log_missing
) {
1119 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1120 * of a SyscallFilterSet* table. */
1122 if (hashmap_isempty(filter
) && default_action
== SCMP_ACT_ALLOW
)
1125 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1126 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1127 void *syscall_id
, *val
;
1129 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1131 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action
);
1135 HASHMAP_FOREACH_KEY(val
, syscall_id
, filter
) {
1136 uint32_t a
= action
;
1137 int id
= PTR_TO_INT(syscall_id
) - 1;
1138 int error
= PTR_TO_INT(val
);
1140 if (error
== SECCOMP_ERROR_NUMBER_KILL
)
1141 a
= scmp_act_kill_process();
1143 else if (action
== SCMP_ACT_LOG
)
1146 else if (error
>= 0)
1147 a
= SCMP_ACT_ERRNO(error
);
1149 r
= seccomp_rule_add_exact(seccomp
, a
, id
, 0);
1151 /* If the system call is not known on this architecture, then that's
1152 * fine, let's ignore it */
1153 _cleanup_free_
char *n
= NULL
;
1156 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, id
);
1157 ignore
= r
== -EDOM
;
1158 if (!ignore
|| log_missing
)
1159 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1160 strna(n
), id
, ignore
? ", ignoring" : "");
1166 r
= seccomp_load(seccomp
);
1168 if (ERRNO_IS_SECCOMP_FATAL(r
))
1170 log_debug_errno(r
, "Failed to install system call filter for architecture %s, skipping: %m",
1171 seccomp_arch_to_string(arch
));
1178 int seccomp_parse_syscall_filter(
1182 SeccompParseFlags flags
,
1184 const char *filename
,
1192 if (!FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
) && errno_num
>= 0)
1195 if (name
[0] == '@') {
1196 const SyscallFilterSet
*set
;
1198 set
= syscall_filter_set_find(name
);
1200 if (!FLAGS_SET(flags
, SECCOMP_PARSE_PERMISSIVE
))
1203 log_syntax(unit
, FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1204 "Unknown system call group, ignoring: %s", name
);
1208 NULSTR_FOREACH(i
, set
->value
) {
1209 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1210 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1211 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1213 r
= seccomp_parse_syscall_filter(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
1220 id
= seccomp_syscall_resolve_name(name
);
1221 if (id
== __NR_SCMP_ERROR
) {
1222 if (!FLAGS_SET(flags
, SECCOMP_PARSE_PERMISSIVE
))
1225 log_syntax(unit
, FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1226 "Failed to parse system call, ignoring: %s", name
);
1230 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1231 * it from the list. The entries in allow-list with non-negative error value will be
1232 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1233 if (!FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
) == FLAGS_SET(flags
, SECCOMP_PARSE_ALLOW_LIST
) ||
1234 (FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
| SECCOMP_PARSE_ALLOW_LIST
) && errno_num
>= 0)) {
1235 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
1239 return FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? log_oom() : -ENOMEM
;
1241 assert_se(hashmap_update(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
)) == 0);
1247 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1253 int seccomp_restrict_namespaces(unsigned long retain
) {
1257 if (DEBUG_LOGGING
) {
1258 _cleanup_free_
char *s
= NULL
;
1260 (void) namespace_flags_to_string(retain
, &s
);
1261 log_debug("Restricting namespace to: %s.", strna(s
));
1265 if (FLAGS_SET(retain
, NAMESPACE_FLAGS_ALL
))
1268 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1269 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1271 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1273 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1277 /* We cannot filter on individual flags to clone3(), and we need to disable the
1278 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1279 * users shall fall back to clone(), as if on an older kernel.
1281 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1282 * https://github.com/moby/moby/issues/42680. */
1284 r
= seccomp_rule_add_exact(
1286 SCMP_ACT_ERRNO(ENOSYS
),
1290 log_debug_errno(r
, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
1292 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1293 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1295 r
= seccomp_rule_add_exact(
1297 SCMP_ACT_ERRNO(EPERM
),
1301 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1302 * special invocation with a zero flags argument, right here. */
1303 r
= seccomp_rule_add_exact(
1305 SCMP_ACT_ERRNO(EPERM
),
1308 SCMP_A1(SCMP_CMP_EQ
, 0));
1310 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1314 for (unsigned i
= 0; namespace_info
[i
].proc_name
; i
++) {
1317 f
= namespace_info
[i
].clone_flag
;
1318 if (FLAGS_SET(retain
, f
)) {
1319 log_debug("Permitting %s.", namespace_info
[i
].proc_name
);
1323 log_trace("Blocking %s.", namespace_info
[i
].proc_name
);
1325 r
= seccomp_rule_add_exact(
1327 SCMP_ACT_ERRNO(EPERM
),
1330 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1332 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1336 /* On s390/s390x the first two parameters to clone are switched */
1337 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1338 r
= seccomp_rule_add_exact(
1340 SCMP_ACT_ERRNO(EPERM
),
1343 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1345 r
= seccomp_rule_add_exact(
1347 SCMP_ACT_ERRNO(EPERM
),
1350 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1352 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1356 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1357 r
= seccomp_rule_add_exact(
1359 SCMP_ACT_ERRNO(EPERM
),
1362 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1364 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1372 r
= seccomp_load(seccomp
);
1374 if (ERRNO_IS_SECCOMP_FATAL(r
))
1376 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1383 int seccomp_protect_sysctl(void) {
1387 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1388 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1390 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1394 #ifdef SCMP_ARCH_LOONGARCH64
1395 SCMP_ARCH_LOONGARCH64
,
1397 #ifdef SCMP_ARCH_RISCV64
1402 /* No _sysctl syscall */
1405 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1409 r
= seccomp_rule_add_exact(
1411 SCMP_ACT_ERRNO(EPERM
),
1415 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1419 r
= seccomp_load(seccomp
);
1421 if (ERRNO_IS_SECCOMP_FATAL(r
))
1423 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1430 int seccomp_protect_syslog(void) {
1434 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1435 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1437 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1441 r
= seccomp_rule_add_exact(
1443 SCMP_ACT_ERRNO(EPERM
),
1448 log_debug_errno(r
, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1452 r
= seccomp_load(seccomp
);
1454 if (ERRNO_IS_SECCOMP_FATAL(r
))
1456 log_debug_errno(r
, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1463 int seccomp_restrict_address_families(Set
*address_families
, bool allow_list
) {
1467 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1468 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1471 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1475 case SCMP_ARCH_X86_64
:
1478 case SCMP_ARCH_AARCH64
:
1479 #ifdef SCMP_ARCH_LOONGARCH64
1480 case SCMP_ARCH_LOONGARCH64
:
1482 case SCMP_ARCH_MIPSEL64N32
:
1483 case SCMP_ARCH_MIPS64N32
:
1484 case SCMP_ARCH_MIPSEL64
:
1485 case SCMP_ARCH_MIPS64
:
1486 #ifdef SCMP_ARCH_RISCV64
1487 case SCMP_ARCH_RISCV64
:
1489 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1493 case SCMP_ARCH_S390
:
1494 case SCMP_ARCH_S390X
:
1496 case SCMP_ARCH_MIPSEL
:
1497 case SCMP_ARCH_MIPS
:
1498 #ifdef SCMP_ARCH_PARISC
1499 case SCMP_ARCH_PARISC
:
1501 #ifdef SCMP_ARCH_PARISC64
1502 case SCMP_ARCH_PARISC64
:
1505 case SCMP_ARCH_PPC64
:
1506 case SCMP_ARCH_PPC64LE
:
1508 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1517 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1522 int first
= 0, last
= 0;
1525 /* If this is an allow list, we first block the address families that are out of
1526 * range and then everything that is not in the set. First, we find the lowest and
1527 * highest address family in the set. */
1529 SET_FOREACH(afp
, address_families
) {
1530 int af
= PTR_TO_INT(afp
);
1532 if (af
<= 0 || af
>= af_max())
1535 if (first
== 0 || af
< first
)
1538 if (last
== 0 || af
> last
)
1542 assert((first
== 0) == (last
== 0));
1546 /* No entries in the valid range, block everything */
1547 r
= seccomp_rule_add_exact(
1549 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1553 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1559 /* Block everything below the first entry */
1560 r
= seccomp_rule_add_exact(
1562 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1565 SCMP_A0(SCMP_CMP_LT
, first
));
1567 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1571 /* Block everything above the last entry */
1572 r
= seccomp_rule_add_exact(
1574 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1577 SCMP_A0(SCMP_CMP_GT
, last
));
1579 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1583 /* Block everything between the first and last entry */
1584 for (int af
= 1; af
< af_max(); af
++) {
1586 if (set_contains(address_families
, INT_TO_PTR(af
)))
1589 r
= seccomp_rule_add_exact(
1591 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1594 SCMP_A0(SCMP_CMP_EQ
, af
));
1599 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1607 /* If this is a deny list, then generate one rule for each address family; these rules are
1608 * then combined in OR checks. */
1610 SET_FOREACH(af
, address_families
) {
1611 r
= seccomp_rule_add_exact(
1613 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1616 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1621 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1626 r
= seccomp_load(seccomp
);
1628 if (ERRNO_IS_SECCOMP_FATAL(r
))
1630 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1637 int seccomp_restrict_realtime_full(int error_code
) {
1638 static const int permitted_policies
[] = {
1644 int r
, max_policy
= 0;
1648 assert(error_code
> 0);
1650 /* Determine the highest policy constant we want to allow */
1651 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1652 if (permitted_policies
[i
] > max_policy
)
1653 max_policy
= permitted_policies
[i
];
1655 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1656 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1659 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1661 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1665 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1667 for (p
= 0; p
< max_policy
; p
++) {
1670 /* Check if this is in the allow list. */
1671 for (i
= 0; i
< ELEMENTSOF(permitted_policies
); i
++)
1672 if (permitted_policies
[i
] == p
) {
1680 /* Deny this policy */
1681 r
= seccomp_rule_add_exact(
1683 SCMP_ACT_ERRNO(error_code
),
1684 SCMP_SYS(sched_setscheduler
),
1686 SCMP_A1(SCMP_CMP_EQ
, p
));
1688 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1693 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1694 * are unsigned here, hence no need to check for < 0 values. */
1695 r
= seccomp_rule_add_exact(
1697 SCMP_ACT_ERRNO(error_code
),
1698 SCMP_SYS(sched_setscheduler
),
1700 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1702 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1706 r
= seccomp_load(seccomp
);
1708 if (ERRNO_IS_SECCOMP_FATAL(r
))
1710 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
1717 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1721 const struct scmp_arg_cmp arg
) {
1724 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1726 _cleanup_free_
char *n
= NULL
;
1728 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1729 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1731 seccomp_arch_to_string(arch
));
1737 /* For known architectures, check that syscalls are indeed defined or not. */
1738 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
1739 assert_cc(SCMP_SYS(shmget
) > 0);
1740 assert_cc(SCMP_SYS(shmat
) > 0);
1741 assert_cc(SCMP_SYS(shmdt
) > 0);
1744 int seccomp_memory_deny_write_execute(void) {
1746 unsigned loaded
= 0;
1748 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1749 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1750 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0, r
;
1752 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1756 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1757 * We ignore that here, which means there's still a way to get writable/executable
1758 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1760 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1761 * on that front (kernel work done in 5.18).
1765 case SCMP_ARCH_S390
:
1766 filter_syscall
= SCMP_SYS(mmap2
);
1767 block_syscall
= SCMP_SYS(mmap
);
1768 /* shmat multiplexed, see above */
1772 case SCMP_ARCH_PPC64
:
1773 case SCMP_ARCH_PPC64LE
:
1774 case SCMP_ARCH_S390X
:
1775 filter_syscall
= SCMP_SYS(mmap
);
1776 /* shmat multiplexed, see above */
1780 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1781 shmat_syscall
= SCMP_SYS(shmat
);
1784 case SCMP_ARCH_X86_64
:
1786 case SCMP_ARCH_AARCH64
:
1787 #ifdef SCMP_ARCH_LOONGARCH64
1788 case SCMP_ARCH_LOONGARCH64
:
1790 #ifdef SCMP_ARCH_RISCV64
1791 case SCMP_ARCH_RISCV64
:
1793 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
1794 shmat_syscall
= SCMP_SYS(shmat
);
1797 /* Please add more definitions here, if you port systemd to other architectures! */
1799 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
1800 #warning "Consider adding the right mmap() syscall definitions here!"
1804 /* Can't filter mmap() on this arch, then skip it */
1805 if (filter_syscall
== 0)
1808 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1812 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1814 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1818 if (block_syscall
!= 0) {
1819 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
1824 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1826 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1830 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1832 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1836 if (shmat_syscall
> 0) {
1837 r
= add_seccomp_syscall_filter(seccomp
, arch
, shmat_syscall
,
1839 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1844 r
= seccomp_load(seccomp
);
1846 if (ERRNO_IS_SECCOMP_FATAL(r
))
1848 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1849 seccomp_arch_to_string(arch
));
1855 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1860 int seccomp_restrict_archs(Set
*archs
) {
1861 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1863 bool blocked_new
= false;
1865 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1868 * There are some qualifications. However the most important use is to stop processes from bypassing
1869 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1870 * in a non-native architecture. There are no holes in this use case, at least so far. */
1872 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1873 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1874 * to run a program with the restrictions applied. */
1875 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1879 for (unsigned i
= 0; seccomp_local_archs
[i
] != SECCOMP_LOCAL_ARCH_END
; ++i
) {
1880 uint32_t arch
= seccomp_local_archs
[i
];
1882 /* See above comment, our "native" architecture is never blocked. */
1883 if (arch
== seccomp_arch_native())
1886 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1887 if (arch
== SECCOMP_LOCAL_ARCH_BLOCKED
)
1890 bool block
= !set_contains(archs
, UINT32_TO_PTR(arch
+ 1));
1892 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1893 * syscalls should basically match x86-64 for everything except the pointer type.
1894 * The important thing is that you can block the old 32-bit x86 syscalls.
1895 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1896 if (block
&& arch
== SCMP_ARCH_X86_64
&& seccomp_arch_native() == SCMP_ARCH_X32
)
1897 block
= !set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1));
1900 seccomp_local_archs
[i
] = SECCOMP_LOCAL_ARCH_BLOCKED
;
1903 r
= seccomp_arch_add(seccomp
, arch
);
1904 if (r
< 0 && r
!= -EEXIST
)
1909 /* All architectures that will be blocked by the seccomp program were
1910 * already blocked. */
1914 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
1918 r
= seccomp_load(seccomp
);
1920 if (ERRNO_IS_SECCOMP_FATAL(r
))
1922 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
1928 int parse_syscall_archs(char **l
, Set
**ret_archs
) {
1929 _cleanup_set_free_ Set
*archs
= NULL
;
1935 STRV_FOREACH(s
, l
) {
1938 r
= seccomp_arch_from_string(*s
, &a
);
1942 r
= set_ensure_put(&archs
, NULL
, UINT32_TO_PTR(a
+ 1));
1947 *ret_archs
= TAKE_PTR(archs
);
1951 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
1956 NULSTR_FOREACH(i
, set
->value
) {
1959 const SyscallFilterSet
*more
;
1961 more
= syscall_filter_set_find(i
);
1965 r
= seccomp_filter_set_add(filter
, add
, more
);
1971 id
= seccomp_syscall_resolve_name(i
);
1972 if (id
== __NR_SCMP_ERROR
) {
1973 log_debug("Couldn't resolve system call, ignoring: %s", i
);
1978 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
1982 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
/* For each local architecture, installs a rule that makes personality() fail with EPERM whenever its
 * first argument differs from the requested personality value. */
1989 int seccomp_lock_personality(unsigned long personality
) {
1993 if (personality
>= PERSONALITY_INVALID
)
1996 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1997 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1999 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2003 r
= seccomp_rule_add_exact(
2005 SCMP_ACT_ERRNO(EPERM
),
2006 SCMP_SYS(personality
),
2008 SCMP_A0(SCMP_CMP_NE
, personality
));
/* This rule filters personality(), so name it accordingly in the log message. */
2010 log_debug_errno(r
, "Failed to add personality rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2014 r
= seccomp_load(seccomp
);
2016 if (ERRNO_IS_SECCOMP_FATAL(r
))
2018 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2025 int seccomp_protect_hostname(void) {
2029 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2030 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2032 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2036 r
= seccomp_rule_add_exact(
2038 SCMP_ACT_ERRNO(EPERM
),
2039 SCMP_SYS(sethostname
),
2042 log_debug_errno(r
, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2046 r
= seccomp_rule_add_exact(
2048 SCMP_ACT_ERRNO(EPERM
),
2049 SCMP_SYS(setdomainname
),
2052 log_debug_errno(r
, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2056 r
= seccomp_load(seccomp
);
2058 if (ERRNO_IS_SECCOMP_FATAL(r
))
2060 log_debug_errno(r
, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2067 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp
, mode_t m
) {
2068 /* Checks the mode_t parameter of the following system calls:
2070 * → chmod() + fchmod() + fchmodat()
2071 * → open() + creat() + openat()
2072 * → mkdir() + mkdirat()
2073 * → mknod() + mknodat()
2075 * Returns error if *everything* failed, and 0 otherwise.
2080 r
= seccomp_rule_add_exact(
2082 SCMP_ACT_ERRNO(EPERM
),
2085 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2087 log_debug_errno(r
, "Failed to add filter for chmod: %m");
2091 r
= seccomp_rule_add_exact(
2093 SCMP_ACT_ERRNO(EPERM
),
2096 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2098 log_debug_errno(r
, "Failed to add filter for fchmod: %m");
2102 r
= seccomp_rule_add_exact(
2104 SCMP_ACT_ERRNO(EPERM
),
2107 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2109 log_debug_errno(r
, "Failed to add filter for fchmodat: %m");
2113 r
= seccomp_rule_add_exact(
2115 SCMP_ACT_ERRNO(EPERM
),
2118 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2120 log_debug_errno(r
, "Failed to add filter for mkdir: %m");
2124 r
= seccomp_rule_add_exact(
2126 SCMP_ACT_ERRNO(EPERM
),
2129 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2131 log_debug_errno(r
, "Failed to add filter for mkdirat: %m");
2135 r
= seccomp_rule_add_exact(
2137 SCMP_ACT_ERRNO(EPERM
),
2140 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2142 log_debug_errno(r
, "Failed to add filter for mknod: %m");
2146 r
= seccomp_rule_add_exact(
2148 SCMP_ACT_ERRNO(EPERM
),
2151 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2153 log_debug_errno(r
, "Failed to add filter for mknodat: %m");
2157 r
= seccomp_rule_add_exact(
2159 SCMP_ACT_ERRNO(EPERM
),
2162 SCMP_A1(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2163 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2165 log_debug_errno(r
, "Failed to add filter for open: %m");
2169 r
= seccomp_rule_add_exact(
2171 SCMP_ACT_ERRNO(EPERM
),
2174 SCMP_A2(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2175 SCMP_A3(SCMP_CMP_MASKED_EQ
, m
, m
));
2177 log_debug_errno(r
, "Failed to add filter for openat: %m");
2181 #if defined(__SNR_openat2)
2182 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2183 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2184 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2185 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2186 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2187 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2188 r
= seccomp_rule_add_exact(
2190 SCMP_ACT_ERRNO(ENOSYS
),
2194 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2199 r
= seccomp_rule_add_exact(
2201 SCMP_ACT_ERRNO(EPERM
),
2204 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2206 log_debug_errno(r
, "Failed to add filter for creat: %m");
/* For each local architecture, builds a filter via seccomp_restrict_sxid() for S_ISUID and for S_ISGID,
 * then loads it. The two results are tracked separately: r for S_ISUID, k for S_ISGID. */
2213 int seccomp_restrict_suid_sgid(void) {
2217 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2218 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2220 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2224 r
= seccomp_restrict_sxid(seccomp
, S_ISUID
);
2226 log_debug_errno(r
, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2228 k
= seccomp_restrict_sxid(seccomp
, S_ISGID
);
/* Report the S_ISGID result (k), not the earlier S_ISUID result (r). */
2230 log_debug_errno(k
, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch
));
2235 r
= seccomp_load(seccomp
);
2237 if (ERRNO_IS_SECCOMP_FATAL(r
))
2239 log_debug_errno(r
, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));
2246 uint32_t scmp_act_kill_process(void) {
2248 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2249 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2250 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2251 * for single-threaded apps does the right thing. */
2253 #ifdef SCMP_ACT_KILL_PROCESS
2254 if (seccomp_api_get() >= 3)
2255 return SCMP_ACT_KILL_PROCESS
;
2258 return SCMP_ACT_KILL
; /* same as SCMP_ACT_KILL_THREAD */
2261 int parse_syscall_and_errno(const char *in
, char **name
, int *error
) {
2262 _cleanup_free_
char *n
= NULL
;
2271 * This parses "syscall:errno" like "uname:EILSEQ", "@sync:255".
2272 * If errno is omitted, then error is set to -1.
2273 * Empty syscall name is not allowed.
2274 * Here, we do not check that the syscall name is valid or not.
2277 p
= strchr(in
, ':');
2279 e
= seccomp_parse_errno_or_action(p
+ 1);
2283 n
= strndup(in
, p
- in
);
2294 *name
= TAKE_PTR(n
);
2299 static int block_open_flag(scmp_filter_ctx seccomp
, int flag
) {
2303 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2304 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2306 r
= seccomp_rule_add_exact(
2308 SCMP_ACT_ERRNO(EINVAL
),
2311 SCMP_A1(SCMP_CMP_MASKED_EQ
, flag
, flag
));
2313 log_debug_errno(r
, "Failed to add filter for open: %m");
2317 r
= seccomp_rule_add_exact(
2319 SCMP_ACT_ERRNO(EINVAL
),
2322 SCMP_A2(SCMP_CMP_MASKED_EQ
, flag
, flag
));
2324 log_debug_errno(r
, "Failed to add filter for openat: %m");
2328 #if defined(__SNR_openat2)
2329 /* The new openat2() system call can't be filtered sensibly, see above. */
2330 r
= seccomp_rule_add_exact(
2332 SCMP_ACT_ERRNO(ENOSYS
),
2336 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2344 int seccomp_suppress_sync(void) {
2348 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2349 * manageable, and also masks O_SYNC/O_DSYNC */
2351 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2352 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2354 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2358 NULSTR_FOREACH(c
, syscall_filter_sets
[SYSCALL_FILTER_SET_SYNC
].value
) {
2361 id
= seccomp_syscall_resolve_name(c
);
2362 if (id
== __NR_SCMP_ERROR
) {
2363 log_debug("System call %s is not known, ignoring.", c
);
2367 r
= seccomp_rule_add_exact(
2369 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2373 log_debug_errno(r
, "Failed to add filter for system call %s, ignoring: %m", c
);
2376 (void) block_open_flag(seccomp
, O_SYNC
);
2377 #if O_DSYNC != O_SYNC
2378 (void) block_open_flag(seccomp
, O_DSYNC
);
2381 r
= seccomp_load(seccomp
);
2383 if (ERRNO_IS_SECCOMP_FATAL(r
))
2385 log_debug_errno(r
, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch
));