1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
4 #include <linux/seccomp.h>
12 #include "alloc-util.h"
14 #include "errno-list.h"
16 #include "namespace-util.h"
18 #include "nulstr-util.h"
19 #include "parse-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
23 #include "string-util.h"
28 /* This array will be modified at runtime as seccomp_restrict_archs is called. */
29 uint32_t seccomp_local_archs
[] = {
31 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
33 #if defined(__x86_64__) && defined(__ILP32__)
36 SCMP_ARCH_X32
, /* native */
37 #elif defined(__x86_64__) && !defined(__ILP32__)
40 SCMP_ARCH_X86_64
, /* native */
41 #elif defined(__i386__)
43 #elif defined(__aarch64__)
45 SCMP_ARCH_AARCH64
, /* native */
46 #elif defined(__arm__)
48 #elif defined(__loongarch_lp64)
49 SCMP_ARCH_LOONGARCH64
,
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
52 SCMP_ARCH_MIPS
, /* native */
53 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
55 SCMP_ARCH_MIPSEL
, /* native */
56 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPSEL64N32
,
62 SCMP_ARCH_MIPS64
, /* native */
63 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
67 SCMP_ARCH_MIPSEL64N32
,
69 SCMP_ARCH_MIPSEL64
, /* native */
70 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
75 SCMP_ARCH_MIPSEL64N32
,
76 SCMP_ARCH_MIPS64N32
, /* native */
77 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
83 SCMP_ARCH_MIPSEL64N32
, /* native */
84 #elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
86 SCMP_ARCH_PARISC64
, /* native */
87 #elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
89 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
92 SCMP_ARCH_PPC64
, /* native */
93 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
96 SCMP_ARCH_PPC64LE
, /* native */
97 #elif defined(__powerpc__)
99 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
101 #elif defined(__s390x__)
103 SCMP_ARCH_S390X
, /* native */
104 #elif defined(__s390__)
107 SECCOMP_LOCAL_ARCH_END
110 const char* seccomp_arch_to_string(uint32_t c
) {
111 /* Maintain order used in <seccomp.h>.
113 * Names used here should be the same as those used for ConditionArchitecture=,
114 * except for "subarchitectures" like x32. */
117 case SCMP_ARCH_NATIVE
:
121 case SCMP_ARCH_X86_64
:
127 case SCMP_ARCH_AARCH64
:
129 #ifdef SCMP_ARCH_LOONGARCH64
130 case SCMP_ARCH_LOONGARCH64
:
131 return "loongarch64";
135 case SCMP_ARCH_MIPS64
:
137 case SCMP_ARCH_MIPS64N32
:
139 case SCMP_ARCH_MIPSEL
:
141 case SCMP_ARCH_MIPSEL64
:
143 case SCMP_ARCH_MIPSEL64N32
:
144 return "mips64-le-n32";
145 #ifdef SCMP_ARCH_PARISC
146 case SCMP_ARCH_PARISC
:
149 #ifdef SCMP_ARCH_PARISC64
150 case SCMP_ARCH_PARISC64
:
155 case SCMP_ARCH_PPC64
:
157 case SCMP_ARCH_PPC64LE
:
159 #ifdef SCMP_ARCH_RISCV64
160 case SCMP_ARCH_RISCV64
:
165 case SCMP_ARCH_S390X
:
172 int seccomp_arch_from_string(const char *n
, uint32_t *ret
) {
178 if (streq(n
, "native"))
179 *ret
= SCMP_ARCH_NATIVE
;
180 else if (streq(n
, "x86"))
181 *ret
= SCMP_ARCH_X86
;
182 else if (streq(n
, "x86-64"))
183 *ret
= SCMP_ARCH_X86_64
;
184 else if (streq(n
, "x32"))
185 *ret
= SCMP_ARCH_X32
;
186 else if (streq(n
, "arm"))
187 *ret
= SCMP_ARCH_ARM
;
188 else if (streq(n
, "arm64"))
189 *ret
= SCMP_ARCH_AARCH64
;
190 #ifdef SCMP_ARCH_LOONGARCH64
191 else if (streq(n
, "loongarch64"))
192 *ret
= SCMP_ARCH_LOONGARCH64
;
194 else if (streq(n
, "mips"))
195 *ret
= SCMP_ARCH_MIPS
;
196 else if (streq(n
, "mips64"))
197 *ret
= SCMP_ARCH_MIPS64
;
198 else if (streq(n
, "mips64-n32"))
199 *ret
= SCMP_ARCH_MIPS64N32
;
200 else if (streq(n
, "mips-le"))
201 *ret
= SCMP_ARCH_MIPSEL
;
202 else if (streq(n
, "mips64-le"))
203 *ret
= SCMP_ARCH_MIPSEL64
;
204 else if (streq(n
, "mips64-le-n32"))
205 *ret
= SCMP_ARCH_MIPSEL64N32
;
206 #ifdef SCMP_ARCH_PARISC
207 else if (streq(n
, "parisc"))
208 *ret
= SCMP_ARCH_PARISC
;
210 #ifdef SCMP_ARCH_PARISC64
211 else if (streq(n
, "parisc64"))
212 *ret
= SCMP_ARCH_PARISC64
;
214 else if (streq(n
, "ppc"))
215 *ret
= SCMP_ARCH_PPC
;
216 else if (streq(n
, "ppc64"))
217 *ret
= SCMP_ARCH_PPC64
;
218 else if (streq(n
, "ppc64-le"))
219 *ret
= SCMP_ARCH_PPC64LE
;
220 #ifdef SCMP_ARCH_RISCV64
221 else if (streq(n
, "riscv64"))
222 *ret
= SCMP_ARCH_RISCV64
;
224 else if (streq(n
, "s390"))
225 *ret
= SCMP_ARCH_S390
;
226 else if (streq(n
, "s390x"))
227 *ret
= SCMP_ARCH_S390X
;
234 int seccomp_init_for_arch(scmp_filter_ctx
*ret
, uint32_t arch
, uint32_t default_action
) {
235 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
238 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
239 * any others. Also, turns off the NNP fiddling. */
241 seccomp
= seccomp_init(default_action
);
245 if (arch
!= SCMP_ARCH_NATIVE
&&
246 arch
!= seccomp_arch_native()) {
248 r
= seccomp_arch_remove(seccomp
, seccomp_arch_native());
252 r
= seccomp_arch_add(seccomp
, arch
);
256 assert(seccomp_arch_exist(seccomp
, arch
) >= 0);
257 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) == -EEXIST
);
258 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) == -EEXIST
);
260 assert(seccomp_arch_exist(seccomp
, SCMP_ARCH_NATIVE
) >= 0);
261 assert(seccomp_arch_exist(seccomp
, seccomp_arch_native()) >= 0);
264 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_ACT_BADARCH
, SCMP_ACT_ALLOW
);
268 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
272 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
273 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
274 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_LOG
, 1);
276 log_debug_errno(r
, "Failed to enable seccomp event logging: %m");
280 *ret
= TAKE_PTR(seccomp
);
284 static bool is_basic_seccomp_available(void) {
285 return prctl(PR_GET_SECCOMP
, 0, 0, 0, 0) >= 0;
288 static bool is_seccomp_filter_available(void) {
289 return prctl(PR_SET_SECCOMP
, SECCOMP_MODE_FILTER
, NULL
, 0, 0) < 0 &&
293 bool is_seccomp_available(void) {
294 static int cached_enabled
= -1;
296 if (cached_enabled
< 0) {
299 b
= secure_getenv_bool("SYSTEMD_SECCOMP");
301 if (b
< 0 && b
!= -ENXIO
) /* ENXIO: env var unset */
302 log_debug_errno(b
, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");
305 is_basic_seccomp_available() &&
306 is_seccomp_filter_available();
308 cached_enabled
= false;
311 return cached_enabled
;
314 const SyscallFilterSet syscall_filter_sets
[_SYSCALL_FILTER_SET_MAX
] = {
315 [SYSCALL_FILTER_SET_DEFAULT
] = {
317 .help
= "System calls that are always permitted",
320 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
324 "clock_getres_time64\0"
328 "clock_nanosleep_time64\0"
354 "getrlimit\0" /* make sure processes can query stack size and such */
370 "riscv_flush_icache\0"
374 "sched_getaffinity\0"
385 [SYSCALL_FILTER_SET_AIO
] = {
387 .help
= "Asynchronous IO",
393 "io_pgetevents_time64\0"
397 "io_uring_register\0"
400 [SYSCALL_FILTER_SET_BASIC_IO
] = {
423 [SYSCALL_FILTER_SET_CHOWN
] = {
425 .help
= "Change ownership of files and directories",
435 [SYSCALL_FILTER_SET_CLOCK
] = {
437 .help
= "Change the system time",
446 [SYSCALL_FILTER_SET_CPU_EMULATION
] = {
447 .name
= "@cpu-emulation",
448 .help
= "System calls for CPU emulation functionality",
456 [SYSCALL_FILTER_SET_DEBUG
] = {
458 .help
= "Debugging, performance monitoring and tracing functionality",
465 "s390_runtime_instr\0"
466 "sys_debug_setcontext\0"
468 [SYSCALL_FILTER_SET_FILE_SYSTEM
] = {
469 .name
= "@file-system",
470 .help
= "File system operations",
504 "inotify_add_watch\0"
558 [SYSCALL_FILTER_SET_IO_EVENT
] = {
560 .help
= "Event loop system calls",
580 [SYSCALL_FILTER_SET_IPC
] = {
582 .help
= "SysV IPC, POSIX Message Queues or other IPC",
590 "mq_timedreceive_time64\0"
592 "mq_timedsend_time64\0"
602 "process_vm_writev\0"
607 "semtimedop_time64\0"
613 [SYSCALL_FILTER_SET_KEYRING
] = {
615 .help
= "Kernel keyring access",
621 [SYSCALL_FILTER_SET_MEMLOCK
] = {
623 .help
= "Memory locking control",
631 [SYSCALL_FILTER_SET_MODULE
] = {
633 .help
= "Loading and unloading of kernel modules",
639 [SYSCALL_FILTER_SET_MOUNT
] = {
641 .help
= "Mounting and unmounting of file systems",
657 [SYSCALL_FILTER_SET_NETWORK_IO
] = {
658 .name
= "@network-io",
659 .help
= "Network or Unix socket IO, should not be needed if not network facing",
684 [SYSCALL_FILTER_SET_OBSOLETE
] = {
685 /* some unknown even to libseccomp */
687 .help
= "Unusual, obsolete or unimplemented system calls",
717 [SYSCALL_FILTER_SET_PKEY
] = {
719 .help
= "System calls used for memory protection keys",
725 [SYSCALL_FILTER_SET_PRIVILEGED
] = {
726 .name
= "@privileged",
727 .help
= "All system calls which need super-user capabilities",
743 "open_by_handle_at\0"
757 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
761 [SYSCALL_FILTER_SET_PROCESS
] = {
763 .help
= "Process control, execution, namespacing operations",
765 "capget\0" /* Able to query arbitrary processes */
767 /* ia64 is the only architecture that has clone2, a replacement for clone, but ia64 doesn't
768 * implement seccomp, so we don't need to list it at all. C.f.
769 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
776 "pidfd_send_signal\0"
779 "rt_tgsigqueueinfo\0"
781 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
791 [SYSCALL_FILTER_SET_RAW_IO
] = {
793 .help
= "Raw I/O port access",
800 "s390_pci_mmio_read\0"
801 "s390_pci_mmio_write\0"
803 [SYSCALL_FILTER_SET_REBOOT
] = {
805 .help
= "Reboot and reboot preparation/kexec",
811 [SYSCALL_FILTER_SET_RESOURCES
] = {
812 .name
= "@resources",
813 .help
= "Alter resource settings",
820 "sched_setaffinity\0"
823 "sched_setscheduler\0"
825 "set_mempolicy_home_node\0"
829 [SYSCALL_FILTER_SET_SANDBOX
] = {
831 .help
= "Sandbox functionality",
833 "landlock_add_rule\0"
834 "landlock_create_ruleset\0"
835 "landlock_restrict_self\0"
838 [SYSCALL_FILTER_SET_SETUID
] = {
840 .help
= "Operations for changing user/group credentials",
857 [SYSCALL_FILTER_SET_SIGNAL
] = {
859 .help
= "Process signal handling",
866 "rt_sigtimedwait_time64\0"
876 [SYSCALL_FILTER_SET_SWAP
] = {
878 .help
= "Enable/disable swap devices",
883 [SYSCALL_FILTER_SET_SYNC
] = {
885 .help
= "Synchronize files and memory to storage",
887 /* Please also update the list in seccomp_suppress_sync(). */
896 [SYSCALL_FILTER_SET_SYSTEM_SERVICE
] = {
897 .name
= "@system-service",
898 .help
= "General system service operations",
931 "name_to_handle_at\0"
938 "sched_get_priority_max\0"
939 "sched_get_priority_min\0"
942 "sched_getscheduler\0"
943 "sched_rr_get_interval\0"
944 "sched_rr_get_interval_time64\0"
962 [SYSCALL_FILTER_SET_TIMER
] = {
964 .help
= "Schedule operations by time",
978 "timerfd_gettime64\0"
980 "timerfd_settime64\0"
983 [SYSCALL_FILTER_SET_KNOWN
] = {
985 .help
= "All known syscalls declared in the kernel",
988 #include "syscall-list.inc"
992 const SyscallFilterSet
*syscall_filter_set_find(const char *name
) {
993 if (isempty(name
) || name
[0] != '@')
996 for (unsigned i
= 0; i
< _SYSCALL_FILTER_SET_MAX
; i
++)
997 if (streq(syscall_filter_sets
[i
].name
, name
))
998 return syscall_filter_sets
+ i
;
1003 static int add_syscall_filter_set(
1004 scmp_filter_ctx seccomp
,
1005 const SyscallFilterSet
*set
,
1011 int seccomp_add_syscall_filter_item(
1012 scmp_filter_ctx
*seccomp
,
1022 if (strv_contains(exclude
, name
))
1025 /* Any syscalls that are handled are added to the *added strv. The pointer
1026 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1028 if (name
[0] == '@') {
1029 const SyscallFilterSet
*other
;
1031 other
= syscall_filter_set_find(name
);
1033 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL
),
1034 "Filter set %s is not known!",
1037 return add_syscall_filter_set(seccomp
, other
, action
, exclude
, log_missing
, added
);
1042 id
= seccomp_syscall_resolve_name(name
);
1043 if (id
== __NR_SCMP_ERROR
) {
1045 log_debug("System call %s is not known, ignoring.", name
);
1049 r
= seccomp_rule_add_exact(seccomp
, action
, id
, 0);
1051 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1052 bool ignore
= r
== -EDOM
;
1054 if (!ignore
|| log_missing
)
1055 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1056 name
, id
, ignore
? ", ignoring" : "");
1062 r
= strv_extend(added
, name
);
1071 static int add_syscall_filter_set(
1072 scmp_filter_ctx seccomp
,
1073 const SyscallFilterSet
*set
,
1081 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1086 NULSTR_FOREACH(sys
, set
->value
) {
1087 r
= seccomp_add_syscall_filter_item(seccomp
, sys
, action
, exclude
, log_missing
, added
);
1095 static uint32_t override_default_action(uint32_t default_action
) {
1096 /* When the requested filter is an allow-list, and the default action is something critical, we
1097 * install ENOSYS as the default action, but it will only apply to syscalls which are not in the
1100 if (default_action
== SCMP_ACT_ALLOW
)
1101 return default_action
;
1104 if (default_action
== SCMP_ACT_LOG
)
1105 return default_action
;
1108 return SCMP_ACT_ERRNO(ENOSYS
);
1111 int seccomp_load_syscall_filter_set(uint32_t default_action
, const SyscallFilterSet
*set
, uint32_t action
, bool log_missing
) {
1112 uint32_t arch
, default_action_override
;
1117 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1118 * each local arch. */
1120 default_action_override
= override_default_action(default_action
);
1122 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1123 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1124 _cleanup_strv_free_
char **added
= NULL
;
1126 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1128 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action_override
);
1132 r
= add_syscall_filter_set(seccomp
, set
, action
, NULL
, log_missing
, &added
);
1134 return log_debug_errno(r
, "Failed to add filter set: %m");
1136 if (default_action
!= default_action_override
)
1137 NULSTR_FOREACH(name
, syscall_filter_sets
[SYSCALL_FILTER_SET_KNOWN
].value
) {
1140 id
= seccomp_syscall_resolve_name(name
);
1144 /* Ignore the syscall if it was already handled above */
1145 if (strv_contains(added
, name
))
1148 r
= seccomp_rule_add_exact(seccomp
, default_action
, id
, 0);
1149 if (r
< 0 && r
!= -EDOM
) /* EDOM means that the syscall is not available for arch */
1150 return log_debug_errno(r
, "Failed to add rule for system call %s() / %d: %m",
1154 #if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1155 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1156 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_OPTIMIZE
, 2);
1158 log_warning_errno(r
, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1161 r
= seccomp_load(seccomp
);
1162 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1165 log_debug_errno(r
, "Failed to install filter set for architecture %s, skipping: %m",
1166 seccomp_arch_to_string(arch
));
1172 int seccomp_load_syscall_filter_set_raw(uint32_t default_action
, Hashmap
* filter
, uint32_t action
, bool log_missing
) {
1173 uint32_t arch
, default_action_override
;
1176 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1177 * of a SyscallFilterSet* table. */
1179 if (hashmap_isempty(filter
) && default_action
== SCMP_ACT_ALLOW
)
1182 default_action_override
= override_default_action(default_action
);
1184 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1185 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1186 void *syscall_id
, *val
;
1188 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1190 r
= seccomp_init_for_arch(&seccomp
, arch
, default_action_override
);
1194 HASHMAP_FOREACH_KEY(val
, syscall_id
, filter
) {
1195 uint32_t a
= action
;
1196 int id
= PTR_TO_INT(syscall_id
) - 1;
1197 int error
= PTR_TO_INT(val
);
1199 if (error
== SECCOMP_ERROR_NUMBER_KILL
)
1200 a
= scmp_act_kill_process();
1202 else if (action
== SCMP_ACT_LOG
)
1205 else if (error
>= 0)
1206 a
= SCMP_ACT_ERRNO(error
);
1208 r
= seccomp_rule_add_exact(seccomp
, a
, id
, 0);
1210 /* If the system call is not known on this architecture, then that's
1211 * fine, let's ignore it */
1212 _cleanup_free_
char *n
= NULL
;
1215 n
= seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE
, id
);
1216 ignore
= r
== -EDOM
;
1217 if (!ignore
|| log_missing
)
1218 log_debug_errno(r
, "Failed to add rule for system call %s() / %d%s: %m",
1219 strna(n
), id
, ignore
? ", ignoring" : "");
1225 if (default_action
!= default_action_override
)
1226 NULSTR_FOREACH(name
, syscall_filter_sets
[SYSCALL_FILTER_SET_KNOWN
].value
) {
1229 id
= seccomp_syscall_resolve_name(name
);
1233 /* Ignore the syscall if it was already handled above */
1234 if (hashmap_contains(filter
, INT_TO_PTR(id
+ 1)))
1237 r
= seccomp_rule_add_exact(seccomp
, default_action
, id
, 0);
1238 if (r
< 0 && r
!= -EDOM
) /* EDOM means that the syscall is not available for arch */
1239 return log_debug_errno(r
, "Failed to add rule for system call %s() / %d: %m",
1243 #if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1244 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1245 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_OPTIMIZE
, 2);
1247 log_warning_errno(r
, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1250 r
= seccomp_load(seccomp
);
1251 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1254 log_debug_errno(r
, "Failed to install system call filter for architecture %s, skipping: %m",
1255 seccomp_arch_to_string(arch
));
1261 int seccomp_parse_syscall_filter(
1265 SeccompParseFlags flags
,
1267 const char *filename
,
1275 if (!FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
) && errno_num
>= 0)
1278 if (name
[0] == '@') {
1279 const SyscallFilterSet
*set
;
1281 set
= syscall_filter_set_find(name
);
1283 if (!FLAGS_SET(flags
, SECCOMP_PARSE_PERMISSIVE
))
1286 log_syntax(unit
, FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1287 "Unknown system call group, ignoring: %s", name
);
1291 NULSTR_FOREACH(i
, set
->value
) {
1292 /* Call ourselves again, for the group to parse. Note that we downgrade logging here
1293 * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table
1294 * are our own problem, not a problem in user configuration data and we shouldn't
1295 * pretend otherwise by complaining about them. */
1296 r
= seccomp_parse_syscall_filter(i
, errno_num
, filter
, flags
&~ SECCOMP_PARSE_LOG
, unit
, filename
, line
);
1303 id
= seccomp_syscall_resolve_name(name
);
1304 if (id
== __NR_SCMP_ERROR
) {
1305 if (!FLAGS_SET(flags
, SECCOMP_PARSE_PERMISSIVE
))
1308 log_syntax(unit
, FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? LOG_WARNING
: LOG_DEBUG
, filename
, line
, 0,
1309 "System call %s is not known, ignoring.", name
);
1313 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
1314 * from the list. Entries in the allow-list with a non-negative error value will be handled
1315 * with SCMP_ACT_ERRNO() instead of the default action. */
1316 if (!FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
) == FLAGS_SET(flags
, SECCOMP_PARSE_ALLOW_LIST
) ||
1317 (FLAGS_SET(flags
, SECCOMP_PARSE_INVERT
| SECCOMP_PARSE_ALLOW_LIST
) && errno_num
>= 0)) {
1318 r
= hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
));
1322 return FLAGS_SET(flags
, SECCOMP_PARSE_LOG
) ? log_oom() : -ENOMEM
;
1324 assert_se(hashmap_update(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(errno_num
)) == 0);
1330 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
1336 int seccomp_restrict_namespaces(unsigned long retain
) {
1340 if (DEBUG_LOGGING
) {
1341 _cleanup_free_
char *s
= NULL
;
1343 (void) namespace_flags_to_string(retain
, &s
);
1344 log_debug("Restricting namespace to: %s.", strna(s
));
1348 if (FLAGS_SET(retain
, NAMESPACE_FLAGS_ALL
))
1351 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1352 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1354 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1356 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1360 /* We cannot filter on individual flags to clone3(), and we need to disable the
1361 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1362 * users shall fall back to clone(), as if on an older kernel.
1364 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1365 * https://github.com/moby/moby/issues/42680. */
1367 r
= seccomp_rule_add_exact(
1369 SCMP_ACT_ERRNO(ENOSYS
),
1373 log_debug_errno(r
, "Failed to add clone3() rule for architecture %s, ignoring: %m",
1374 seccomp_arch_to_string(arch
));
1376 if ((retain
& NAMESPACE_FLAGS_ALL
) == 0)
1377 /* If every single kind of namespace shall be prohibited, then let's block the whole
1378 * setns() syscall altogether. */
1379 r
= seccomp_rule_add_exact(
1381 SCMP_ACT_ERRNO(EPERM
),
1385 /* Otherwise, block only the invocations with the appropriate flags in the loop
1386 * below, but also the special invocation with a zero flags argument, right here. */
1387 r
= seccomp_rule_add_exact(
1389 SCMP_ACT_ERRNO(EPERM
),
1392 SCMP_A1(SCMP_CMP_EQ
, 0));
1394 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m",
1395 seccomp_arch_to_string(arch
));
1399 for (unsigned i
= 0; namespace_info
[i
].proc_name
; i
++) {
1402 f
= namespace_info
[i
].clone_flag
;
1403 if (FLAGS_SET(retain
, f
)) {
1404 log_debug("Permitting %s.", namespace_info
[i
].proc_name
);
1408 log_trace("Blocking %s.", namespace_info
[i
].proc_name
);
1410 r
= seccomp_rule_add_exact(
1412 SCMP_ACT_ERRNO(EPERM
),
1415 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1417 log_debug_errno(r
, "Failed to add unshare() rule for architecture %s, skipping: %m",
1418 seccomp_arch_to_string(arch
));
1422 /* On s390/s390x the first two parameters to clone are switched */
1423 if (!IN_SET(arch
, SCMP_ARCH_S390
, SCMP_ARCH_S390X
))
1424 r
= seccomp_rule_add_exact(
1426 SCMP_ACT_ERRNO(EPERM
),
1429 SCMP_A0(SCMP_CMP_MASKED_EQ
, f
, f
));
1431 r
= seccomp_rule_add_exact(
1433 SCMP_ACT_ERRNO(EPERM
),
1436 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1438 log_debug_errno(r
, "Failed to add clone() rule for architecture %s, skipping: %m",
1439 seccomp_arch_to_string(arch
));
1443 if ((retain
& NAMESPACE_FLAGS_ALL
) != 0) {
1444 r
= seccomp_rule_add_exact(
1446 SCMP_ACT_ERRNO(EPERM
),
1449 SCMP_A1(SCMP_CMP_MASKED_EQ
, f
, f
));
1451 log_debug_errno(r
, "Failed to add setns() rule for architecture %s, skipping: %m",
1452 seccomp_arch_to_string(arch
));
1460 r
= seccomp_load(seccomp
);
1461 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1464 log_debug_errno(r
, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
1465 seccomp_arch_to_string(arch
));
1471 int seccomp_protect_sysctl(void) {
1475 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1476 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1478 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1482 #ifdef SCMP_ARCH_LOONGARCH64
1483 SCMP_ARCH_LOONGARCH64
,
1485 #ifdef SCMP_ARCH_RISCV64
1490 /* No _sysctl syscall */
1493 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1497 r
= seccomp_rule_add_exact(
1499 SCMP_ACT_ERRNO(EPERM
),
1503 log_debug_errno(r
, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
1504 seccomp_arch_to_string(arch
));
1508 r
= seccomp_load(seccomp
);
1509 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1512 log_debug_errno(r
, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
1513 seccomp_arch_to_string(arch
));
1519 int seccomp_protect_syslog(void) {
1523 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1524 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1526 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1530 r
= seccomp_rule_add_exact(
1532 SCMP_ACT_ERRNO(EPERM
),
1537 log_debug_errno(r
, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch
));
1541 r
= seccomp_load(seccomp
);
1542 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1545 log_debug_errno(r
, "Failed to install syslog protection rules for architecture %s, skipping %m",
1546 seccomp_arch_to_string(arch
));
1552 int seccomp_restrict_address_families(Set
*address_families
, bool allow_list
) {
1556 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1557 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1560 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1564 case SCMP_ARCH_X86_64
:
1567 case SCMP_ARCH_AARCH64
:
1568 #ifdef SCMP_ARCH_LOONGARCH64
1569 case SCMP_ARCH_LOONGARCH64
:
1571 case SCMP_ARCH_MIPSEL64N32
:
1572 case SCMP_ARCH_MIPS64N32
:
1573 case SCMP_ARCH_MIPSEL64
:
1574 case SCMP_ARCH_MIPS64
:
1575 #ifdef SCMP_ARCH_RISCV64
1576 case SCMP_ARCH_RISCV64
:
1578 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1582 case SCMP_ARCH_S390
:
1583 case SCMP_ARCH_S390X
:
1585 case SCMP_ARCH_MIPSEL
:
1586 case SCMP_ARCH_MIPS
:
1587 #ifdef SCMP_ARCH_PARISC
1588 case SCMP_ARCH_PARISC
:
1590 #ifdef SCMP_ARCH_PARISC64
1591 case SCMP_ARCH_PARISC64
:
1594 case SCMP_ARCH_PPC64
:
1595 case SCMP_ARCH_PPC64LE
:
1597 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1605 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1610 int first
= 0, last
= 0;
1613 /* If this is an allow list, we first block the address families that are out of
1614 * range and then everything that is not in the set. First, we find the lowest and
1615 * highest address family in the set. */
1617 SET_FOREACH(afp
, address_families
) {
1618 int af
= PTR_TO_INT(afp
);
1620 if (af
<= 0 || af
>= af_max())
1623 if (first
== 0 || af
< first
)
1626 if (last
== 0 || af
> last
)
1630 assert((first
== 0) == (last
== 0));
1634 /* No entries in the valid range, block everything */
1635 r
= seccomp_rule_add_exact(
1637 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1641 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m",
1642 seccomp_arch_to_string(arch
));
1648 /* Block everything below the first entry */
1649 r
= seccomp_rule_add_exact(
1651 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1654 SCMP_A0(SCMP_CMP_LT
, first
));
1656 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m",
1657 seccomp_arch_to_string(arch
));
1661 /* Block everything above the last entry */
1662 r
= seccomp_rule_add_exact(
1664 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1667 SCMP_A0(SCMP_CMP_GT
, last
));
1669 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m",
1670 seccomp_arch_to_string(arch
));
1674 /* Block everything between the first and last entry */
1675 for (int af
= 1; af
< af_max(); af
++) {
1677 if (set_contains(address_families
, INT_TO_PTR(af
)))
1680 r
= seccomp_rule_add_exact(
1682 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1685 SCMP_A0(SCMP_CMP_EQ
, af
));
1690 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m",
1691 seccomp_arch_to_string(arch
));
1699 /* If this is a deny list, then generate one rule for each address family; these rules are
1700 * then combined in OR checks. */
1702 SET_FOREACH(af
, address_families
) {
1703 r
= seccomp_rule_add_exact(
1705 SCMP_ACT_ERRNO(EAFNOSUPPORT
),
1708 SCMP_A0(SCMP_CMP_EQ
, PTR_TO_INT(af
)));
1713 log_debug_errno(r
, "Failed to add socket() rule for architecture %s, skipping: %m",
1714 seccomp_arch_to_string(arch
));
1719 r
= seccomp_load(seccomp
);
1720 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1723 log_debug_errno(r
, "Failed to install socket family rules for architecture %s, skipping: %m",
1724 seccomp_arch_to_string(arch
));
1730 int seccomp_restrict_realtime_full(int error_code
) {
1731 static const int permitted_policies
[] = {
1737 int r
, max_policy
= 0;
1740 assert(error_code
> 0);
1742 /* Determine the highest policy constant we want to allow */
1743 FOREACH_ELEMENT(policy
, permitted_policies
)
1744 if (*policy
> max_policy
)
1745 max_policy
= *policy
;
1747 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1748 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1751 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1753 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
1757 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1759 for (p
= 0; p
< max_policy
; p
++) {
1762 /* Check if this is in the allow list. */
1763 FOREACH_ELEMENT(policy
, permitted_policies
)
1772 /* Deny this policy */
1773 r
= seccomp_rule_add_exact(
1775 SCMP_ACT_ERRNO(error_code
),
1776 SCMP_SYS(sched_setscheduler
),
1778 SCMP_A1(SCMP_CMP_EQ
, p
));
1780 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m",
1781 seccomp_arch_to_string(arch
));
1786 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1787 * are unsigned here, hence no need to check for < 0 values. */
1788 r
= seccomp_rule_add_exact(
1790 SCMP_ACT_ERRNO(error_code
),
1791 SCMP_SYS(sched_setscheduler
),
1793 SCMP_A1(SCMP_CMP_GT
, max_policy
));
1795 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m",
1796 seccomp_arch_to_string(arch
));
1800 r
= seccomp_load(seccomp
);
1801 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1804 log_debug_errno(r
, "Failed to install realtime protection rules for architecture %s, skipping: %m",
1805 seccomp_arch_to_string(arch
));
/* Helper that adds a single rule to the filter context: system call nr is made to fail with EPERM,
 * subject to arg_cnt argument comparisons given by arg.
 *
 * For the failure log message the system call number is translated back into a name with
 * seccomp_syscall_resolve_num_arch(); that string is released automatically (_cleanup_free_). */
1811 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp
,
1815 const struct scmp_arg_cmp arg
) {
1818 r
= seccomp_rule_add_exact(seccomp
, SCMP_ACT_ERRNO(EPERM
), nr
, arg_cnt
, arg
);
1820 _cleanup_free_
char *n
= NULL
;
1822 n
= seccomp_syscall_resolve_num_arch(arch
, nr
);
1823 log_debug_errno(r
, "Failed to add %s() rule for architecture %s, skipping: %m",
1825 seccomp_arch_to_string(arch
));
1831 /* For known architectures, check at compile time whether the SysV shared memory system calls
 * are defined: on the architectures selected by the condition below, SCMP_SYS() of shmget, shmat
 * and shmdt must each evaluate to a positive number. */
1832 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
1833 assert_cc(SCMP_SYS(shmget
) > 0);
1834 assert_cc(SCMP_SYS(shmat
) > 0);
1835 assert_cc(SCMP_SYS(shmdt
) > 0);
/* Installs filters that refuse creating memory which is writable and executable at the same time.
 *
 * For every local architecture the system calls to act on are selected per architecture:
 *   filter_syscall - the mmap variant whose protection argument is checked,
 *   block_syscall  - an mmap variant that is refused without looking at arguments (0 = none),
 *   shmat_syscall  - shmat(), where it exists as a system call of its own (0 = not filtered).
 * Architectures for which filter_syscall stays 0 are not filtered. All rules are added through
 * add_seccomp_syscall_filter(), i.e. a matching call fails with EPERM.
 *
 * NOTE(review): the counter named loaded presumably tracks how many filters were installed and
 * feeds the final debug message -- the lines using it are not visible here, confirm. */
1838 int seccomp_memory_deny_write_execute(void) {
1840 unsigned loaded
= 0;
1842 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
1843 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1844 int filter_syscall
= 0, block_syscall
= 0, shmat_syscall
= 0, r
;
1846 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch
));
1850 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1851 * We ignore that here, which means there's still a way to get writable/executable
1852 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1854 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1855 * on that front (kernel work done in 5.18).
1859 case SCMP_ARCH_S390
:
1860 filter_syscall
= SCMP_SYS(mmap2
);
1861 block_syscall
= SCMP_SYS(mmap
);
1862 /* shmat multiplexed, see above */
1866 case SCMP_ARCH_PPC64
:
1867 case SCMP_ARCH_PPC64LE
:
1868 case SCMP_ARCH_S390X
:
1869 filter_syscall
= SCMP_SYS(mmap
);
1870 /* shmat multiplexed, see above */
1874 filter_syscall
= SCMP_SYS(mmap2
); /* arm has only mmap2 */
1875 shmat_syscall
= SCMP_SYS(shmat
);
1878 case SCMP_ARCH_X86_64
:
1880 case SCMP_ARCH_AARCH64
:
1881 #ifdef SCMP_ARCH_LOONGARCH64
1882 case SCMP_ARCH_LOONGARCH64
:
1884 #ifdef SCMP_ARCH_RISCV64
1885 case SCMP_ARCH_RISCV64
:
1887 filter_syscall
= SCMP_SYS(mmap
); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
1888 shmat_syscall
= SCMP_SYS(shmat
);
1891 /* Please add more definitions here, if you port systemd to other architectures! */
1893 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
1894 #warning "Consider adding the right mmap() syscall definitions here!"
1898 /* Can't filter mmap() on this arch, then skip it */
1899 if (filter_syscall
== 0)
1902 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
/* Refuse mappings whose protection argument (A2) has both PROT_EXEC and PROT_WRITE set. */
1906 r
= add_seccomp_syscall_filter(seccomp
, arch
, filter_syscall
,
1908 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
|PROT_WRITE
, PROT_EXEC
|PROT_WRITE
));
1912 if (block_syscall
!= 0) {
/* Zero argument comparisons: this system call is refused regardless of its arguments. */
1913 r
= add_seccomp_syscall_filter(seccomp
, arch
, block_syscall
, 0, (const struct scmp_arg_cmp
){} );
/* Refuse mprotect() calls whose protection argument (A2) includes PROT_EXEC. */
1918 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(mprotect
),
1920 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
/* Same check for pkey_mprotect(). */
1924 r
= add_seccomp_syscall_filter(seccomp
, arch
, SCMP_SYS(pkey_mprotect
),
1926 SCMP_A2(SCMP_CMP_MASKED_EQ
, PROT_EXEC
, PROT_EXEC
));
1930 if (shmat_syscall
> 0) {
/* Refuse shmat() calls whose flags argument (A2) includes SHM_EXEC. */
1931 r
= add_seccomp_syscall_filter(seccomp
, arch
, shmat_syscall
,
1933 SCMP_A2(SCMP_CMP_MASKED_EQ
, SHM_EXEC
, SHM_EXEC
));
1938 r
= seccomp_load(seccomp
);
1939 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
1942 log_debug_errno(r
, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1943 seccomp_arch_to_string(arch
));
1948 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
/* Restricts the system call architectures usable by the process to those contained in archs.
 *
 * archs holds architecture identifiers encoded as UINT32_TO_PTR(arch + 1). The global table
 * seccomp_local_archs is walked up to SECCOMP_LOCAL_ARCH_END; the native architecture and entries
 * already set to SECCOMP_LOCAL_ARCH_BLOCKED are not considered again. Entries are either
 * overwritten with SECCOMP_LOCAL_ARCH_BLOCKED in the table or passed to seccomp_arch_add()
 * (where -EEXIST is tolerated) -- apparently depending on the local variable block; the branch
 * itself is not visible here, confirm.
 *
 * Before loading, the SCMP_FLTATR_CTL_NNP attribute of the filter is set to 0. */
1953 int seccomp_restrict_archs(Set
*archs
) {
1954 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
1956 bool blocked_new
= false;
1958 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1961 * There are some qualifications. However the most important use is to stop processes from bypassing
1962 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1963 * in a non-native architecture. There are no holes in this use case, at least so far. */
1965 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1966 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1967 * to run a program with the restrictions applied. */
1968 seccomp
= seccomp_init(SCMP_ACT_ALLOW
);
1972 for (unsigned i
= 0; seccomp_local_archs
[i
] != SECCOMP_LOCAL_ARCH_END
; ++i
) {
1973 uint32_t arch
= seccomp_local_archs
[i
];
1975 /* See above comment, our "native" architecture is never blocked. */
1976 if (arch
== seccomp_arch_native())
1979 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1980 if (arch
== SECCOMP_LOCAL_ARCH_BLOCKED
)
1983 bool block
= !set_contains(archs
, UINT32_TO_PTR(arch
+ 1));
1985 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since
1986 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1987 * The important thing is that you can block the old 32-bit x86 syscalls.
1988 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1989 if (block
&& arch
== SCMP_ARCH_X86_64
&& seccomp_arch_native() == SCMP_ARCH_X32
)
1990 block
= !set_contains(archs
, UINT32_TO_PTR(SCMP_ARCH_X32
+ 1));
1993 seccomp_local_archs
[i
] = SECCOMP_LOCAL_ARCH_BLOCKED
;
1996 r
= seccomp_arch_add(seccomp
, arch
);
1997 if (r
< 0 && r
!= -EEXIST
)
2002 /* All architectures that will be blocked by the seccomp program were
2003 * already blocked. */
2007 r
= seccomp_attr_set(seccomp
, SCMP_FLTATR_CTL_NNP
, 0);
2011 r
= seccomp_load(seccomp
);
2012 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
2015 log_debug_errno(r
, "Failed to restrict system call architectures, skipping: %m");
/* Converts a string list of architecture names into a set of architecture identifiers.
 *
 * Each entry of l is parsed with seccomp_arch_from_string() and stored in *archs as
 * UINT32_TO_PTR(a + 1); set_ensure_put() allocates the set on demand. This is the same encoding
 * that seccomp_restrict_archs() uses for its lookups. */
2020 int parse_syscall_archs(char **l
, Set
**archs
) {
2026 STRV_FOREACH(s
, l
) {
2029 r
= seccomp_arch_from_string(*s
, &a
);
2033 r
= set_ensure_put(archs
, NULL
, UINT32_TO_PTR(a
+ 1));
/* Adds or removes one named item in the filter hashmap.
 *
 * A name starting with '@' is looked up as a system call group via syscall_filter_set_find() and
 * handed to seccomp_filter_set_add(). Any other name is resolved with
 * seccomp_syscall_resolve_name(); an unknown name is only logged at debug level.
 *
 * Entries are keyed by INT_TO_PTR(id + 1) and stored with the value INT_TO_PTR(-1); removal uses
 * the same key. NOTE(review): whether the put or the remove path runs presumably depends on the
 * add parameter -- the condition is not visible here, confirm. */
2041 int seccomp_filter_set_add_by_name(Hashmap
*filter
, bool add
, const char *name
) {
2045 if (name
[0] == '@') {
2046 const SyscallFilterSet
*more
;
2048 more
= syscall_filter_set_find(name
);
2052 return seccomp_filter_set_add(filter
, add
, more
);
2055 int id
= seccomp_syscall_resolve_name(name
);
2056 if (id
== __NR_SCMP_ERROR
) {
2057 log_debug("System call %s is not known, ignoring.", name
);
2062 return hashmap_put(filter
, INT_TO_PTR(id
+ 1), INT_TO_PTR(-1));
2064 (void) hashmap_remove(filter
, INT_TO_PTR(id
+ 1));
/* Applies seccomp_filter_set_add_by_name() to every entry of the NUL-separated string list
 * set->value, passing filter and add through unchanged. Entries may themselves be group names,
 * in which case the by-name function calls back into this one. */
2068 int seccomp_filter_set_add(Hashmap
*filter
, bool add
, const SyscallFilterSet
*set
) {
2074 NULSTR_FOREACH(i
, set
->value
) {
2075 r
= seccomp_filter_set_add_by_name(filter
, add
, i
);
/* Locks the process to the given personality value.
 *
 * Values at or above PERSONALITY_INVALID are singled out by the first check. For every local
 * architecture a default-allow filter is created in which personality() fails with EPERM
 * whenever its first argument (A0) differs from the requested value, and the filter is loaded. */
2083 int seccomp_lock_personality(unsigned long personality
) {
2087 if (personality
>= PERSONALITY_INVALID
)
2090 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2091 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2093 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2097 r
= seccomp_rule_add_exact(
2099 SCMP_ACT_ERRNO(EPERM
),
2100 SCMP_SYS(personality
),
2102 SCMP_A0(SCMP_CMP_NE
, personality
));
/* NOTE(review): the message below speaks of a scheduler rule although the rule that failed is
 * the one on personality() -- looks like a copy-paste slip from the realtime restriction code;
 * consider rewording the message. */
2104 log_debug_errno(r
, "Failed to add scheduler rule for architecture %s, skipping: %m",
2105 seccomp_arch_to_string(arch
));
2109 r
= seccomp_load(seccomp
);
2110 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
2113 log_debug_errno(r
, "Failed to enable personality lock for architecture %s, skipping: %m",
2114 seccomp_arch_to_string(arch
));
/* Prevents changes to the host and domain name.
 *
 * For every local architecture a default-allow filter is created with two rules that make
 * sethostname() and setdomainname() fail with EPERM, and the filter is loaded. Failures to add a
 * rule or to load the filter are logged at debug level, naming the architecture. */
2120 int seccomp_protect_hostname(void) {
2124 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2125 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2127 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2131 r
= seccomp_rule_add_exact(
2133 SCMP_ACT_ERRNO(EPERM
),
2134 SCMP_SYS(sethostname
),
2137 log_debug_errno(r
, "Failed to add sethostname() rule for architecture %s, skipping: %m",
2138 seccomp_arch_to_string(arch
));
2142 r
= seccomp_rule_add_exact(
2144 SCMP_ACT_ERRNO(EPERM
),
2145 SCMP_SYS(setdomainname
),
2148 log_debug_errno(r
, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
2149 seccomp_arch_to_string(arch
));
2153 r
= seccomp_load(seccomp
);
2154 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
2157 log_debug_errno(r
, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
2158 seccomp_arch_to_string(arch
));
/* Adds rules to the given filter context that refuse creating or changing file system objects
 * with the mode bits m set.
 *
 * Each rule uses SCMP_CMP_MASKED_EQ with m as both mask and value on the mode argument of the
 * respective system call, so it matches when all bits of m are present, and then fails the call
 * with EPERM. For open() and openat() the rule additionally requires O_CREAT in the flags
 * argument. openat2() is answered with ENOSYS where libseccomp knows it; fchmodat2() is filtered
 * like the other calls where libseccomp knows it and answered with ENOSYS otherwise.
 *
 * Every failed rule addition is logged at debug level with the name of the affected call. */
2164 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp
, mode_t m
) {
2165 /* Checks the mode_t parameter of the following system calls:
2167 * → chmod() + fchmod() + fchmodat() + fchmodat2()
2168 * → open() + creat() + openat()
2169 * → mkdir() + mkdirat()
2170 * → mknod() + mknodat()
2172 * Returns error if *everything* failed, and 0 otherwise.
2177 r
= seccomp_rule_add_exact(
2179 SCMP_ACT_ERRNO(EPERM
),
2182 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2184 log_debug_errno(r
, "Failed to add filter for chmod: %m");
2188 r
= seccomp_rule_add_exact(
2190 SCMP_ACT_ERRNO(EPERM
),
2193 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2195 log_debug_errno(r
, "Failed to add filter for fchmod: %m");
2199 r
= seccomp_rule_add_exact(
2201 SCMP_ACT_ERRNO(EPERM
),
2204 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2206 log_debug_errno(r
, "Failed to add filter for fchmodat: %m");
2210 #if defined(__SNR_fchmodat2)
2211 r
= seccomp_rule_add_exact(
2213 SCMP_ACT_ERRNO(EPERM
),
2214 SCMP_SYS(fchmodat2
),
2216 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2218 /* It looks like this libseccomp does not know about fchmodat2().
2219 * Pretend the fchmodat2() system call is not supported at all,
2220 * regardless of the kernel version. */
2221 r
= seccomp_rule_add_exact(
2223 SCMP_ACT_ERRNO(ENOSYS
),
2228 log_debug_errno(r
, "Failed to add filter for fchmodat2: %m");
2232 r
= seccomp_rule_add_exact(
2234 SCMP_ACT_ERRNO(EPERM
),
2237 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2239 log_debug_errno(r
, "Failed to add filter for mkdir: %m");
2243 r
= seccomp_rule_add_exact(
2245 SCMP_ACT_ERRNO(EPERM
),
2248 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2250 log_debug_errno(r
, "Failed to add filter for mkdirat: %m");
2254 r
= seccomp_rule_add_exact(
2256 SCMP_ACT_ERRNO(EPERM
),
2259 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2261 log_debug_errno(r
, "Failed to add filter for mknod: %m");
2265 r
= seccomp_rule_add_exact(
2267 SCMP_ACT_ERRNO(EPERM
),
2270 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2272 log_debug_errno(r
, "Failed to add filter for mknodat: %m");
2276 r
= seccomp_rule_add_exact(
2278 SCMP_ACT_ERRNO(EPERM
),
2281 SCMP_A1(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2282 SCMP_A2(SCMP_CMP_MASKED_EQ
, m
, m
));
2284 log_debug_errno(r
, "Failed to add filter for open: %m");
2288 r
= seccomp_rule_add_exact(
2290 SCMP_ACT_ERRNO(EPERM
),
2293 SCMP_A2(SCMP_CMP_MASKED_EQ
, O_CREAT
, O_CREAT
),
2294 SCMP_A3(SCMP_CMP_MASKED_EQ
, m
, m
));
2296 log_debug_errno(r
, "Failed to add filter for openat: %m");
2300 #if defined(__SNR_openat2)
2301 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2302 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2303 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2304 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2305 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2306 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2307 r
= seccomp_rule_add_exact(
2309 SCMP_ACT_ERRNO(ENOSYS
),
2313 log_debug_errno(r
, "Failed to add filter for openat2: %m");
2318 r
= seccomp_rule_add_exact(
2320 SCMP_ACT_ERRNO(EPERM
),
2323 SCMP_A1(SCMP_CMP_MASKED_EQ
, m
, m
));
2325 log_debug_errno(r
, "Failed to add filter for creat: %m");
/* Prevents creation of set-user-ID and set-group-ID file system objects.
 *
 * For every local architecture a default-allow filter is created and seccomp_restrict_sxid() is
 * run on it twice, once for S_ISUID and once for S_ISGID. The two results are kept in separate
 * variables (r and k) and logged independently before the filter is loaded. */
2332 int seccomp_restrict_suid_sgid(void) {
2336 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2337 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2339 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2343 r
= seccomp_restrict_sxid(seccomp
, S_ISUID
);
2345 log_debug_errno(r
, "Failed to add suid rule for architecture %s, ignoring: %m",
2346 seccomp_arch_to_string(arch
));
2348 k
= seccomp_restrict_sxid(seccomp
, S_ISGID
);
2350 log_debug_errno(k
, "Failed to add sgid rule for architecture %s, ignoring: %m",
2351 seccomp_arch_to_string(arch
));
2356 r
= seccomp_load(seccomp
);
2357 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
2360 log_debug_errno(r
, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
2361 seccomp_arch_to_string(arch
));
/* Picks the seccomp action used for killing on a filter violation: a compile-time check
 * (SCMP_ACT_KILL_PROCESS defined in the libseccomp headers) is combined with a runtime check of
 * the API level reported by seccomp_api_get(). */
2367 uint32_t scmp_act_kill_process(void) {
2369 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2370 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2371 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2372 * for single-threaded apps does the right thing. */
2374 #ifdef SCMP_ACT_KILL_PROCESS
/* NOTE(review): API level 3 is presumably the first level at which SCMP_ACT_KILL_PROCESS is
 * usable at runtime -- confirm against the libseccomp documentation. */
2375 if (seccomp_api_get() >= 3)
2376 return SCMP_ACT_KILL_PROCESS
;
2379 return SCMP_ACT_KILL
; /* same as SCMP_ACT_KILL_THREAD */
/* Splits a string of the form syscall:errno into its two parts.
 *
 * The input is searched for the first colon with strchr(). The text after it is parsed with
 * seccomp_parse_errno_or_action(); the text before it is duplicated with strndup() and handed
 * over to the caller through *name via TAKE_PTR(). */
2382 int parse_syscall_and_errno(const char *in
, char **name
, int *error
) {
2383 _cleanup_free_
char *n
= NULL
;
2392 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2393 * If errno is omitted, then error is set to -1.
2394 * Empty syscall name is not allowed.
2395 * Here, we do not check that the syscall name is valid or not.
2398 p
= strchr(in
, ':');
2400 e
= seccomp_parse_errno_or_action(p
+ 1);
2404 n
= strndup(in
, p
- in
);
2415 *name
= TAKE_PTR(n
);
/* Adds rules to the filter context that refuse opening files with the given flag.
 *
 * Two rules match when all bits of flag are set in the flags argument (A1 for the first rule,
 * A2 for the second -- open() and openat() judging by the log messages) and fail the call with
 * EINVAL. Where libseccomp knows openat2(), that call is answered with ENOSYS. */
2420 static int block_open_flag(scmp_filter_ctx seccomp
, int flag
) {
2424 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2425 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2427 r
= seccomp_rule_add_exact(
2429 SCMP_ACT_ERRNO(EINVAL
),
2432 SCMP_A1(SCMP_CMP_MASKED_EQ
, flag
, flag
));
2434 log_debug_errno(r
, "Failed to add filter for open: %m");
2438 r
= seccomp_rule_add_exact(
2440 SCMP_ACT_ERRNO(EINVAL
),
2443 SCMP_A2(SCMP_CMP_MASKED_EQ
, flag
, flag
));
2445 log_debug_errno(r
, "Failed to add filter for openat: %m");
2449 #if defined(__SNR_openat2)
2450 /* The new openat2() system call can't be filtered sensibly, see above. */
2451 r
= seccomp_rule_add_exact(
2453 SCMP_ACT_ERRNO(ENOSYS
),
2457 log_debug_errno(r
, "Failed to add filter for openat2: %m");
/* Turns the synchronization system calls into no-ops.
 *
 * For every local architecture a default-allow filter is created. Each name listed in the
 * SYSCALL_FILTER_SET_SYNC group is resolved with seccomp_syscall_resolve_name() (unknown names
 * are logged and ignored) and gets a rule with action SCMP_ACT_ERRNO(0), i.e. the call reports
 * success without being executed. For the calls taking a file descriptor (fdatasync, fsync,
 * sync_file_range, sync_file_range2, syncfs) the rule is limited to first arguments up to
 * INT_MAX. Finally block_open_flag() is applied for O_SYNC, and for O_DSYNC where it differs. */
2465 int seccomp_suppress_sync(void) {
2469 /* This behaves slightly differently from SystemCallFilter=~@sync:0, in that negative fds (which
2470 * we can determine to be invalid) are still refused with EBADF. See #34478.
2472 * Additionally, O_SYNC/O_DSYNC are masked. */
2474 SECCOMP_FOREACH_LOCAL_ARCH(arch
) {
2475 _cleanup_(seccomp_releasep
) scmp_filter_ctx seccomp
= NULL
;
2477 r
= seccomp_init_for_arch(&seccomp
, arch
, SCMP_ACT_ALLOW
);
2481 NULSTR_FOREACH(c
, syscall_filter_sets
[SYSCALL_FILTER_SET_SYNC
].value
) {
2484 id
= seccomp_syscall_resolve_name(c
);
2485 if (id
== __NR_SCMP_ERROR
) {
2486 log_debug("System call %s is not known, ignoring.", c
);
2490 if (STR_IN_SET(c
, "fdatasync", "fsync", "sync_file_range", "sync_file_range2", "syncfs"))
2491 r
= seccomp_rule_add_exact(
2493 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2496 SCMP_A0(SCMP_CMP_LE
, INT_MAX
)); /* The rule compares arguments as unsigned values. Hence, only a
2497 * non-negative fd matches the rule, while a negative fd is passed
2498 * on to the syscall (which then fails with EBADF). */
2500 r
= seccomp_rule_add_exact(
2502 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2506 log_debug_errno(r
, "Failed to add filter for system call %s, ignoring: %m", c
);
2509 (void) block_open_flag(seccomp
, O_SYNC
);
2510 #if O_DSYNC != O_SYNC
2511 (void) block_open_flag(seccomp
, O_DSYNC
);
2514 r
= seccomp_load(seccomp
);
2515 if (ERRNO_IS_NEG_SECCOMP_FATAL(r
))
2518 log_debug_errno(r
, "Failed to apply sync() suppression for architecture %s, skipping: %m",
2519 seccomp_arch_to_string(arch
));
/* Returns true if n is either the special value SECCOMP_ERROR_NUMBER_KILL or a number accepted
 * by errno_is_valid(). */
2527 bool seccomp_errno_or_action_is_valid(int n
) {
2528 return n
== SECCOMP_ERROR_NUMBER_KILL
|| errno_is_valid(n
);
/* Parses the textual form of an errno-or-action value: the literal string kill maps to
 * SECCOMP_ERROR_NUMBER_KILL, anything else is passed to parse_errno(). The comparison uses
 * streq_ptr(), so p is not dereferenced by this function before that check. */
2531 int seccomp_parse_errno_or_action(const char *p
) {
2532 if (streq_ptr(p
, "kill"))
2533 return SECCOMP_ERROR_NUMBER_KILL
;
2534 return parse_errno(p
);
/* Converts an errno-or-action value back into a string: SECCOMP_ERROR_NUMBER_KILL is handled as
 * a special case, every other value is translated with errno_to_name(). */
2537 const char* seccomp_errno_or_action_to_string(int num
) {
2538 if (num
== SECCOMP_ERROR_NUMBER_KILL
)
2540 return errno_to_name(num
);