]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #16491 from keszybz/udev-logging
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "errno-list.h"
16 #include "macro.h"
17 #include "nsflags.h"
18 #include "nulstr-util.h"
19 #include "process-util.h"
20 #include "seccomp-util.h"
21 #include "set.h"
22 #include "string-util.h"
23 #include "strv.h"
24
/* List of seccomp architecture identifiers (SCMP_ARCH_*) for the architecture this file is compiled
 * for. Exactly one of the preprocessor branches below is selected at build time (or none, for
 * targets not listed), and the array always ends with a (uint32_t) -1 entry. In branches with more
 * than one entry, the one marked "native" comes last. */
25 const uint32_t seccomp_local_archs[] = {
26
27 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
28
29 #if defined(__x86_64__) && defined(__ILP32__)
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
32 SCMP_ARCH_X32, /* native */
33 #elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
35 SCMP_ARCH_X32,
36 SCMP_ARCH_X86_64, /* native */
37 #elif defined(__i386__)
38 SCMP_ARCH_X86,
39 #elif defined(__aarch64__)
40 SCMP_ARCH_ARM,
41 SCMP_ARCH_AARCH64, /* native */
42 #elif defined(__arm__)
43 SCMP_ARCH_ARM,
44 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
48 SCMP_ARCH_MIPS,
49 SCMP_ARCH_MIPSEL, /* native */
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
54 SCMP_ARCH_MIPS64N32,
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
67 SCMP_ARCH_MIPSEL64,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64N32, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
79 SCMP_ARCH_PPC,
80 SCMP_ARCH_PPC64LE,
81 SCMP_ARCH_PPC64, /* native */
82 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86 #elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88 #elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91 #elif defined(__s390__)
92 SCMP_ARCH_S390,
93 #endif
94 (uint32_t) -1
95 };
96
97 const char* seccomp_arch_to_string(uint32_t c) {
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
102
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
105 return "native";
106 case SCMP_ARCH_X86:
107 return "x86";
108 case SCMP_ARCH_X86_64:
109 return "x86-64";
110 case SCMP_ARCH_X32:
111 return "x32";
112 case SCMP_ARCH_ARM:
113 return "arm";
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
135 return "s390";
136 case SCMP_ARCH_S390X:
137 return "s390x";
138 default:
139 return NULL;
140 }
141 }
142
143 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
183 else
184 return -EINVAL;
185
186 return 0;
187 }
188
189 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
190 scmp_filter_ctx seccomp;
191 int r;
192
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
204 if (r < 0)
205 goto finish;
206
207 r = seccomp_arch_add(seccomp, arch);
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230 finish:
231 seccomp_release(seccomp);
232 return r;
233 }
234
static bool is_basic_seccomp_available(void) {
        /* Queries the current seccomp mode; a non-negative answer from prctl() is taken to mean
         * that seccomp is available at all. */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
static bool is_seccomp_filter_available(void) {
        /* Probes filter mode by asking the kernel to install a NULL filter program. The probe only
         * counts as positive when the call fails and the failure is EFAULT. */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
bool is_seccomp_available(void) {
        /* Reports whether both the basic seccomp probe and the filter mode probe succeed. The
         * answer is computed on the first call and remembered in a static variable; -1 means
         * "not determined yet". */
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        /* The filter probe only runs if the basic probe succeeded */
        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
254
255 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
256 [SYSCALL_FILTER_SET_DEFAULT] = {
257 .name = "@default",
258 .help = "System calls that are always permitted",
259 .value =
260 "clock_getres\0"
261 "clock_getres_time64\0"
262 "clock_gettime\0"
263 "clock_gettime64\0"
264 "clock_nanosleep\0"
265 "clock_nanosleep_time64\0"
266 "execve\0"
267 "exit\0"
268 "exit_group\0"
269 "futex\0"
270 "futex_time64\0"
271 "get_robust_list\0"
272 "get_thread_area\0"
273 "getegid\0"
274 "getegid32\0"
275 "geteuid\0"
276 "geteuid32\0"
277 "getgid\0"
278 "getgid32\0"
279 "getgroups\0"
280 "getgroups32\0"
281 "getpgid\0"
282 "getpgrp\0"
283 "getpid\0"
284 "getppid\0"
285 "getresgid\0"
286 "getresgid32\0"
287 "getresuid\0"
288 "getresuid32\0"
289 "getrlimit\0" /* make sure processes can query stack size and such */
290 "getsid\0"
291 "gettid\0"
292 "gettimeofday\0"
293 "getuid\0"
294 "getuid32\0"
295 "membarrier\0"
296 "nanosleep\0"
297 "pause\0"
298 "prlimit64\0"
299 "restart_syscall\0"
300 "rseq\0"
301 "rt_sigreturn\0"
302 "sched_yield\0"
303 "set_robust_list\0"
304 "set_thread_area\0"
305 "set_tid_address\0"
306 "set_tls\0"
307 "sigreturn\0"
308 "time\0"
309 "ugetrlimit\0"
310 },
311 [SYSCALL_FILTER_SET_AIO] = {
312 .name = "@aio",
313 .help = "Asynchronous IO",
314 .value =
315 "io_cancel\0"
316 "io_destroy\0"
317 "io_getevents\0"
318 "io_pgetevents\0"
319 "io_pgetevents_time64\0"
320 "io_setup\0"
321 "io_submit\0"
322 "io_uring_enter\0"
323 "io_uring_register\0"
324 "io_uring_setup\0"
325 },
326 [SYSCALL_FILTER_SET_BASIC_IO] = {
327 .name = "@basic-io",
328 .help = "Basic IO",
329 .value =
330 "_llseek\0"
331 "close\0"
332 "dup\0"
333 "dup2\0"
334 "dup3\0"
335 "lseek\0"
336 "pread64\0"
337 "preadv\0"
338 "preadv2\0"
339 "pwrite64\0"
340 "pwritev\0"
341 "pwritev2\0"
342 "read\0"
343 "readv\0"
344 "write\0"
345 "writev\0"
346 },
347 [SYSCALL_FILTER_SET_CHOWN] = {
348 .name = "@chown",
349 .help = "Change ownership of files and directories",
350 .value =
351 "chown\0"
352 "chown32\0"
353 "fchown\0"
354 "fchown32\0"
355 "fchownat\0"
356 "lchown\0"
357 "lchown32\0"
358 },
359 [SYSCALL_FILTER_SET_CLOCK] = {
360 .name = "@clock",
361 .help = "Change the system time",
362 .value =
363 "adjtimex\0"
364 "clock_adjtime\0"
365 "clock_adjtime64\0"
366 "clock_settime\0"
367 "clock_settime64\0"
368 "settimeofday\0"
369 "stime\0"
370 },
371 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
372 .name = "@cpu-emulation",
373 .help = "System calls for CPU emulation functionality",
374 .value =
375 "modify_ldt\0"
376 "subpage_prot\0"
377 "switch_endian\0"
378 "vm86\0"
379 "vm86old\0"
380 },
381 [SYSCALL_FILTER_SET_DEBUG] = {
382 .name = "@debug",
383 .help = "Debugging, performance monitoring and tracing functionality",
384 .value =
385 "lookup_dcookie\0"
386 "perf_event_open\0"
387 "pidfd_getfd\0"
388 "ptrace\0"
389 "rtas\0"
390 #ifdef __NR_s390_runtime_instr
391 "s390_runtime_instr\0"
392 #endif
393 "sys_debug_setcontext\0"
394 },
395 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
396 .name = "@file-system",
397 .help = "File system operations",
398 .value =
399 "access\0"
400 "chdir\0"
401 "chmod\0"
402 "close\0"
403 "creat\0"
404 "faccessat\0"
405 "fallocate\0"
406 "fchdir\0"
407 "fchmod\0"
408 "fchmodat\0"
409 "fcntl\0"
410 "fcntl64\0"
411 "fgetxattr\0"
412 "flistxattr\0"
413 "fremovexattr\0"
414 "fsetxattr\0"
415 "fstat\0"
416 "fstat64\0"
417 "fstatat64\0"
418 "fstatfs\0"
419 "fstatfs64\0"
420 "ftruncate\0"
421 "ftruncate64\0"
422 "futimesat\0"
423 "getcwd\0"
424 "getdents\0"
425 "getdents64\0"
426 "getxattr\0"
427 "inotify_add_watch\0"
428 "inotify_init\0"
429 "inotify_init1\0"
430 "inotify_rm_watch\0"
431 "lgetxattr\0"
432 "link\0"
433 "linkat\0"
434 "listxattr\0"
435 "llistxattr\0"
436 "lremovexattr\0"
437 "lsetxattr\0"
438 "lstat\0"
439 "lstat64\0"
440 "mkdir\0"
441 "mkdirat\0"
442 "mknod\0"
443 "mknodat\0"
444 "mmap\0"
445 "mmap2\0"
446 "munmap\0"
447 "newfstatat\0"
448 "oldfstat\0"
449 "oldlstat\0"
450 "oldstat\0"
451 "open\0"
452 "openat\0"
453 "openat2\0"
454 "readlink\0"
455 "readlinkat\0"
456 "removexattr\0"
457 "rename\0"
458 "renameat\0"
459 "renameat2\0"
460 "rmdir\0"
461 "setxattr\0"
462 "stat\0"
463 "stat64\0"
464 "statfs\0"
465 "statfs64\0"
466 #ifdef __NR_statx
467 "statx\0"
468 #endif
469 "symlink\0"
470 "symlinkat\0"
471 "truncate\0"
472 "truncate64\0"
473 "unlink\0"
474 "unlinkat\0"
475 "utime\0"
476 "utimensat\0"
477 "utimensat_time64\0"
478 "utimes\0"
479 },
480 [SYSCALL_FILTER_SET_IO_EVENT] = {
481 .name = "@io-event",
482 .help = "Event loop system calls",
483 .value =
484 "_newselect\0"
485 "epoll_create\0"
486 "epoll_create1\0"
487 "epoll_ctl\0"
488 "epoll_ctl_old\0"
489 "epoll_pwait\0"
490 "epoll_wait\0"
491 "epoll_wait_old\0"
492 "eventfd\0"
493 "eventfd2\0"
494 "poll\0"
495 "ppoll\0"
496 "ppoll_time64\0"
497 "pselect6\0"
498 "pselect6_time64\0"
499 "select\0"
500 },
501 [SYSCALL_FILTER_SET_IPC] = {
502 .name = "@ipc",
503 .help = "SysV IPC, POSIX Message Queues or other IPC",
504 .value =
505 "ipc\0"
506 "memfd_create\0"
507 "mq_getsetattr\0"
508 "mq_notify\0"
509 "mq_open\0"
510 "mq_timedreceive\0"
511 "mq_timedreceive_time64\0"
512 "mq_timedsend\0"
513 "mq_timedsend_time64\0"
514 "mq_unlink\0"
515 "msgctl\0"
516 "msgget\0"
517 "msgrcv\0"
518 "msgsnd\0"
519 "pipe\0"
520 "pipe2\0"
521 "process_vm_readv\0"
522 "process_vm_writev\0"
523 "semctl\0"
524 "semget\0"
525 "semop\0"
526 "semtimedop\0"
527 "semtimedop_time64\0"
528 "shmat\0"
529 "shmctl\0"
530 "shmdt\0"
531 "shmget\0"
532 },
533 [SYSCALL_FILTER_SET_KEYRING] = {
534 .name = "@keyring",
535 .help = "Kernel keyring access",
536 .value =
537 "add_key\0"
538 "keyctl\0"
539 "request_key\0"
540 },
541 [SYSCALL_FILTER_SET_MEMLOCK] = {
542 .name = "@memlock",
543 .help = "Memory locking control",
544 .value =
545 "mlock\0"
546 "mlock2\0"
547 "mlockall\0"
548 "munlock\0"
549 "munlockall\0"
550 },
551 [SYSCALL_FILTER_SET_MODULE] = {
552 .name = "@module",
553 .help = "Loading and unloading of kernel modules",
554 .value =
555 "delete_module\0"
556 "finit_module\0"
557 "init_module\0"
558 },
559 [SYSCALL_FILTER_SET_MOUNT] = {
560 .name = "@mount",
561 .help = "Mounting and unmounting of file systems",
562 .value =
563 "chroot\0"
564 "fsconfig\0"
565 "fsmount\0"
566 "fsopen\0"
567 "fspick\0"
568 "mount\0"
569 "move_mount\0"
570 "open_tree\0"
571 "pivot_root\0"
572 "umount\0"
573 "umount2\0"
574 },
575 [SYSCALL_FILTER_SET_NETWORK_IO] = {
576 .name = "@network-io",
577 .help = "Network or Unix socket IO, should not be needed if not network facing",
578 .value =
579 "accept\0"
580 "accept4\0"
581 "bind\0"
582 "connect\0"
583 "getpeername\0"
584 "getsockname\0"
585 "getsockopt\0"
586 "listen\0"
587 "recv\0"
588 "recvfrom\0"
589 "recvmmsg\0"
590 "recvmmsg_time64\0"
591 "recvmsg\0"
592 "send\0"
593 "sendmmsg\0"
594 "sendmsg\0"
595 "sendto\0"
596 "setsockopt\0"
597 "shutdown\0"
598 "socket\0"
599 "socketcall\0"
600 "socketpair\0"
601 },
602 [SYSCALL_FILTER_SET_OBSOLETE] = {
603 /* some unknown even to libseccomp */
604 .name = "@obsolete",
605 .help = "Unusual, obsolete or unimplemented system calls",
606 .value =
607 "_sysctl\0"
608 "afs_syscall\0"
609 "bdflush\0"
610 "break\0"
611 "create_module\0"
612 "ftime\0"
613 "get_kernel_syms\0"
614 "getpmsg\0"
615 "gtty\0"
616 "idle\0"
617 "lock\0"
618 "mpx\0"
619 "prof\0"
620 "profil\0"
621 "putpmsg\0"
622 "query_module\0"
623 "security\0"
624 "sgetmask\0"
625 "ssetmask\0"
626 "stty\0"
627 "sysfs\0"
628 "tuxcall\0"
629 "ulimit\0"
630 "uselib\0"
631 "ustat\0"
632 "vserver\0"
633 },
634 [SYSCALL_FILTER_SET_PKEY] = {
635 .name = "@pkey",
636 .help = "System calls used for memory protection keys",
637 .value =
638 "pkey_alloc\0"
639 "pkey_free\0"
640 "pkey_mprotect\0"
641 },
642 [SYSCALL_FILTER_SET_PRIVILEGED] = {
643 .name = "@privileged",
644 .help = "All system calls which need super-user capabilities",
645 .value =
646 "@chown\0"
647 "@clock\0"
648 "@module\0"
649 "@raw-io\0"
650 "@reboot\0"
651 "@swap\0"
652 "_sysctl\0"
653 "acct\0"
654 "bpf\0"
655 "capset\0"
656 "chroot\0"
657 "fanotify_init\0"
658 "fanotify_mark\0"
659 "nfsservctl\0"
660 "open_by_handle_at\0"
661 "pivot_root\0"
662 "quotactl\0"
663 "setdomainname\0"
664 "setfsuid\0"
665 "setfsuid32\0"
666 "setgroups\0"
667 "setgroups32\0"
668 "sethostname\0"
669 "setresuid\0"
670 "setresuid32\0"
671 "setreuid\0"
672 "setreuid32\0"
673 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
674 "setuid32\0"
675 "vhangup\0"
676 },
677 [SYSCALL_FILTER_SET_PROCESS] = {
678 .name = "@process",
679 .help = "Process control, execution, namespaceing operations",
680 .value =
681 "arch_prctl\0"
682 "capget\0" /* Able to query arbitrary processes */
683 "clone\0"
684 "clone3\0"
685 "execveat\0"
686 "fork\0"
687 "getrusage\0"
688 "kill\0"
689 "pidfd_open\0"
690 "pidfd_send_signal\0"
691 "prctl\0"
692 "rt_sigqueueinfo\0"
693 "rt_tgsigqueueinfo\0"
694 "setns\0"
695 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
696 "tgkill\0"
697 "times\0"
698 "tkill\0"
699 "unshare\0"
700 "vfork\0"
701 "wait4\0"
702 "waitid\0"
703 "waitpid\0"
704 },
705 [SYSCALL_FILTER_SET_RAW_IO] = {
706 .name = "@raw-io",
707 .help = "Raw I/O port access",
708 .value =
709 "ioperm\0"
710 "iopl\0"
711 "pciconfig_iobase\0"
712 "pciconfig_read\0"
713 "pciconfig_write\0"
714 #ifdef __NR_s390_pci_mmio_read
715 "s390_pci_mmio_read\0"
716 #endif
717 #ifdef __NR_s390_pci_mmio_write
718 "s390_pci_mmio_write\0"
719 #endif
720 },
721 [SYSCALL_FILTER_SET_REBOOT] = {
722 .name = "@reboot",
723 .help = "Reboot and reboot preparation/kexec",
724 .value =
725 "kexec_file_load\0"
726 "kexec_load\0"
727 "reboot\0"
728 },
729 [SYSCALL_FILTER_SET_RESOURCES] = {
730 .name = "@resources",
731 .help = "Alter resource settings",
732 .value =
733 "ioprio_set\0"
734 "mbind\0"
735 "migrate_pages\0"
736 "move_pages\0"
737 "nice\0"
738 "sched_setaffinity\0"
739 "sched_setattr\0"
740 "sched_setparam\0"
741 "sched_setscheduler\0"
742 "set_mempolicy\0"
743 "setpriority\0"
744 "setrlimit\0"
745 },
746 [SYSCALL_FILTER_SET_SETUID] = {
747 .name = "@setuid",
748 .help = "Operations for changing user/group credentials",
749 .value =
750 "setgid\0"
751 "setgid32\0"
752 "setgroups\0"
753 "setgroups32\0"
754 "setregid\0"
755 "setregid32\0"
756 "setresgid\0"
757 "setresgid32\0"
758 "setresuid\0"
759 "setresuid32\0"
760 "setreuid\0"
761 "setreuid32\0"
762 "setuid\0"
763 "setuid32\0"
764 },
765 [SYSCALL_FILTER_SET_SIGNAL] = {
766 .name = "@signal",
767 .help = "Process signal handling",
768 .value =
769 "rt_sigaction\0"
770 "rt_sigpending\0"
771 "rt_sigprocmask\0"
772 "rt_sigsuspend\0"
773 "rt_sigtimedwait\0"
774 "rt_sigtimedwait_time64\0"
775 "sigaction\0"
776 "sigaltstack\0"
777 "signal\0"
778 "signalfd\0"
779 "signalfd4\0"
780 "sigpending\0"
781 "sigprocmask\0"
782 "sigsuspend\0"
783 },
784 [SYSCALL_FILTER_SET_SWAP] = {
785 .name = "@swap",
786 .help = "Enable/disable swap devices",
787 .value =
788 "swapoff\0"
789 "swapon\0"
790 },
791 [SYSCALL_FILTER_SET_SYNC] = {
792 .name = "@sync",
793 .help = "Synchronize files and memory to storage",
794 .value =
795 "fdatasync\0"
796 "fsync\0"
797 "msync\0"
798 "sync\0"
799 "sync_file_range\0"
800 "sync_file_range2\0"
801 "syncfs\0"
802 },
803 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
804 .name = "@system-service",
805 .help = "General system service operations",
806 .value =
807 "@aio\0"
808 "@basic-io\0"
809 "@chown\0"
810 "@default\0"
811 "@file-system\0"
812 "@io-event\0"
813 "@ipc\0"
814 "@keyring\0"
815 "@memlock\0"
816 "@network-io\0"
817 "@process\0"
818 "@resources\0"
819 "@setuid\0"
820 "@signal\0"
821 "@sync\0"
822 "@timer\0"
823 "brk\0"
824 "capget\0"
825 "capset\0"
826 "copy_file_range\0"
827 "fadvise64\0"
828 "fadvise64_64\0"
829 "flock\0"
830 "get_mempolicy\0"
831 "getcpu\0"
832 "getpriority\0"
833 "getrandom\0"
834 "ioctl\0"
835 "ioprio_get\0"
836 "kcmp\0"
837 "madvise\0"
838 "mprotect\0"
839 "mremap\0"
840 "name_to_handle_at\0"
841 "oldolduname\0"
842 "olduname\0"
843 "personality\0"
844 "readahead\0"
845 "readdir\0"
846 "remap_file_pages\0"
847 "sched_get_priority_max\0"
848 "sched_get_priority_min\0"
849 "sched_getaffinity\0"
850 "sched_getattr\0"
851 "sched_getparam\0"
852 "sched_getscheduler\0"
853 "sched_rr_get_interval\0"
854 "sched_rr_get_interval_time64\0"
855 "sched_yield\0"
856 "sendfile\0"
857 "sendfile64\0"
858 "setfsgid\0"
859 "setfsgid32\0"
860 "setfsuid\0"
861 "setfsuid32\0"
862 "setpgid\0"
863 "setsid\0"
864 "splice\0"
865 "sysinfo\0"
866 "tee\0"
867 "umask\0"
868 "uname\0"
869 "userfaultfd\0"
870 "vmsplice\0"
871 },
872 [SYSCALL_FILTER_SET_TIMER] = {
873 .name = "@timer",
874 .help = "Schedule operations by time",
875 .value =
876 "alarm\0"
877 "getitimer\0"
878 "setitimer\0"
879 "timer_create\0"
880 "timer_delete\0"
881 "timer_getoverrun\0"
882 "timer_gettime\0"
883 "timer_gettime64\0"
884 "timer_settime\0"
885 "timer_settime64\0"
886 "timerfd_create\0"
887 "timerfd_gettime\0"
888 "timerfd_gettime64\0"
889 "timerfd_settime\0"
890 "timerfd_settime64\0"
891 "times\0"
892 },
893 };
894
895 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
896 unsigned i;
897
898 if (isempty(name) || name[0] != '@')
899 return NULL;
900
901 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
902 if (streq(syscall_filter_sets[i].name, name))
903 return syscall_filter_sets + i;
904
905 return NULL;
906 }
907
908 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
909
910 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
911 assert(seccomp);
912 assert(name);
913
914 if (strv_contains(exclude, name))
915 return 0;
916
917 if (name[0] == '@') {
918 const SyscallFilterSet *other;
919
920 other = syscall_filter_set_find(name);
921 if (!other)
922 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
923 "Filter set %s is not known!",
924 name);
925
926 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
927
928 } else {
929 int id, r;
930
931 id = seccomp_syscall_resolve_name(name);
932 if (id == __NR_SCMP_ERROR) {
933 if (log_missing)
934 log_debug("System call %s is not known, ignoring.", name);
935 return 0;
936 }
937
938 r = seccomp_rule_add_exact(seccomp, action, id, 0);
939 if (r < 0) {
940 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
941 bool ignore = r == -EDOM;
942
943 if (!ignore || log_missing)
944 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
945 name, id, ignore ? ", ignoring" : "");
946 if (!ignore)
947 return r;
948 }
949
950 return 0;
951 }
952 }
953
954 static int seccomp_add_syscall_filter_set(
955 scmp_filter_ctx seccomp,
956 const SyscallFilterSet *set,
957 uint32_t action,
958 char **exclude,
959 bool log_missing) {
960
961 const char *sys;
962 int r;
963
964 assert(seccomp);
965 assert(set);
966
967 NULSTR_FOREACH(sys, set->value) {
968 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
969 if (r < 0)
970 return r;
971 }
972
973 return 0;
974 }
975
976 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
977 uint32_t arch;
978 int r;
979
980 assert(set);
981
982 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
983 * each local arch. */
984
985 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
986 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
987
988 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
989
990 r = seccomp_init_for_arch(&seccomp, arch, default_action);
991 if (r < 0)
992 return r;
993
994 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
995 if (r < 0)
996 return log_debug_errno(r, "Failed to add filter set: %m");
997
998 r = seccomp_load(seccomp);
999 if (ERRNO_IS_SECCOMP_FATAL(r))
1000 return r;
1001 if (r < 0)
1002 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1003 }
1004
1005 return 0;
1006 }
1007
1008 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1009 uint32_t arch;
1010 int r;
1011
1012 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1013 * SyscallFilterSet* table. */
1014
1015 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1016 return 0;
1017
1018 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1019 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1020 Iterator i;
1021 void *syscall_id, *val;
1022
1023 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1024
1025 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1026 if (r < 0)
1027 return r;
1028
1029 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
1030 uint32_t a = action;
1031 int id = PTR_TO_INT(syscall_id) - 1;
1032 int error = PTR_TO_INT(val);
1033
1034 if (action != SCMP_ACT_ALLOW && error >= 0)
1035 a = SCMP_ACT_ERRNO(error);
1036
1037 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1038 if (r < 0) {
1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1040 _cleanup_free_ char *n = NULL;
1041 bool ignore;
1042
1043 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1044 ignore = r == -EDOM;
1045 if (!ignore || log_missing)
1046 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1047 strna(n), id, ignore ? ", ignoring" : "");
1048 if (!ignore)
1049 return r;
1050 }
1051 }
1052
1053 r = seccomp_load(seccomp);
1054 if (ERRNO_IS_SECCOMP_FATAL(r))
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
1061 }
1062
1063 int seccomp_parse_syscall_filter(
1064 const char *name,
1065 int errno_num,
1066 Hashmap *filter,
1067 SeccompParseFlags flags,
1068 const char *unit,
1069 const char *filename,
1070 unsigned line) {
1071
1072 int r;
1073
1074 assert(name);
1075 assert(filter);
1076
1077 if (name[0] == '@') {
1078 const SyscallFilterSet *set;
1079 const char *i;
1080
1081 set = syscall_filter_set_find(name);
1082 if (!set) {
1083 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1084 return -EINVAL;
1085
1086 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1087 "Unknown system call group, ignoring: %s", name);
1088 return 0;
1089 }
1090
1091 NULSTR_FOREACH(i, set->value) {
1092 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1093 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1094 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1095 * about them. */
1096 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1097 if (r < 0)
1098 return r;
1099 }
1100 } else {
1101 int id;
1102
1103 id = seccomp_syscall_resolve_name(name);
1104 if (id == __NR_SCMP_ERROR) {
1105 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1106 return -EINVAL;
1107
1108 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1109 "Failed to parse system call, ignoring: %s", name);
1110 return 0;
1111 }
1112
1113 /* If we previously wanted to forbid a syscall and now
1114 * we want to allow it, then remove it from the list. */
1115 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
1116 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1117 if (r < 0)
1118 switch (r) {
1119 case -ENOMEM:
1120 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1121 case -EEXIST:
1122 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1123 break;
1124 default:
1125 return r;
1126 }
1127 } else
1128 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1129 }
1130
1131 return 0;
1132 }
1133
1134 int seccomp_restrict_namespaces(unsigned long retain) {
1135 uint32_t arch;
1136 int r;
1137
1138 if (DEBUG_LOGGING) {
1139 _cleanup_free_ char *s = NULL;
1140
1141 (void) namespace_flags_to_string(retain, &s);
1142 log_debug("Restricting namespace to: %s.", strna(s));
1143 }
1144
1145 /* NOOP? */
1146 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1147 return 0;
1148
1149 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1150 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1151 unsigned i;
1152
1153 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1154
1155 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1156 if (r < 0)
1157 return r;
1158
1159 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1160 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1161 * altogether. */
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EPERM),
1165 SCMP_SYS(setns),
1166 0);
1167 else
1168 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1169 * special invocation with a zero flags argument, right here. */
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EPERM),
1173 SCMP_SYS(setns),
1174 1,
1175 SCMP_A1(SCMP_CMP_EQ, 0));
1176 if (r < 0) {
1177 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 continue;
1179 }
1180
1181 for (i = 0; namespace_flag_map[i].name; i++) {
1182 unsigned long f;
1183
1184 f = namespace_flag_map[i].flag;
1185 if ((retain & f) == f) {
1186 log_debug("Permitting %s.", namespace_flag_map[i].name);
1187 continue;
1188 }
1189
1190 log_debug("Blocking %s.", namespace_flag_map[i].name);
1191
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EPERM),
1195 SCMP_SYS(unshare),
1196 1,
1197 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 break;
1201 }
1202
1203 /* On s390/s390x the first two parameters to clone are switched */
1204 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1205 r = seccomp_rule_add_exact(
1206 seccomp,
1207 SCMP_ACT_ERRNO(EPERM),
1208 SCMP_SYS(clone),
1209 1,
1210 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1211 else
1212 r = seccomp_rule_add_exact(
1213 seccomp,
1214 SCMP_ACT_ERRNO(EPERM),
1215 SCMP_SYS(clone),
1216 1,
1217 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 break;
1221 }
1222
1223 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1224 r = seccomp_rule_add_exact(
1225 seccomp,
1226 SCMP_ACT_ERRNO(EPERM),
1227 SCMP_SYS(setns),
1228 1,
1229 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1230 if (r < 0) {
1231 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1232 break;
1233 }
1234 }
1235 }
1236 if (r < 0)
1237 continue;
1238
1239 r = seccomp_load(seccomp);
1240 if (ERRNO_IS_SECCOMP_FATAL(r))
1241 return r;
1242 if (r < 0)
1243 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 }
1245
1246 return 0;
1247 }
1248
1249 int seccomp_protect_sysctl(void) {
1250 uint32_t arch;
1251 int r;
1252
1253 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1254 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1255
1256 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1257
1258 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1259 /* No _sysctl syscall */
1260 continue;
1261
1262 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1263 if (r < 0)
1264 return r;
1265
1266 r = seccomp_rule_add_exact(
1267 seccomp,
1268 SCMP_ACT_ERRNO(EPERM),
1269 SCMP_SYS(_sysctl),
1270 0);
1271 if (r < 0) {
1272 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1273 continue;
1274 }
1275
1276 r = seccomp_load(seccomp);
1277 if (ERRNO_IS_SECCOMP_FATAL(r))
1278 return r;
1279 if (r < 0)
1280 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1281 }
1282
1283 return 0;
1284 }
1285
1286 int seccomp_protect_syslog(void) {
1287 uint32_t arch;
1288 int r;
1289
1290 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1291 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1292
1293 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1294 if (r < 0)
1295 return r;
1296
1297 r = seccomp_rule_add_exact(
1298 seccomp,
1299 SCMP_ACT_ERRNO(EPERM),
1300 SCMP_SYS(syslog),
1301 0);
1302
1303 if (r < 0) {
1304 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1305 continue;
1306 }
1307
1308 r = seccomp_load(seccomp);
1309 if (ERRNO_IS_SECCOMP_FATAL(r))
1310 return r;
1311 if (r < 0)
1312 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1313 }
1314
1315 return 0;
1316 }
1317
1318 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1319 uint32_t arch;
1320 int r;
1321
1322 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1323 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1324 bool supported;
1325 Iterator i;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
1329 switch (arch) {
1330
1331 case SCMP_ARCH_X86_64:
1332 case SCMP_ARCH_X32:
1333 case SCMP_ARCH_ARM:
1334 case SCMP_ARCH_AARCH64:
1335 case SCMP_ARCH_PPC:
1336 case SCMP_ARCH_PPC64:
1337 case SCMP_ARCH_PPC64LE:
1338 case SCMP_ARCH_MIPSEL64N32:
1339 case SCMP_ARCH_MIPS64N32:
1340 case SCMP_ARCH_MIPSEL64:
1341 case SCMP_ARCH_MIPS64:
1342 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1343 supported = true;
1344 break;
1345
1346 case SCMP_ARCH_S390:
1347 case SCMP_ARCH_S390X:
1348 case SCMP_ARCH_X86:
1349 case SCMP_ARCH_MIPSEL:
1350 case SCMP_ARCH_MIPS:
1351 default:
1352 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1353 * don't know */
1354 supported = false;
1355 break;
1356 }
1357
1358 if (!supported)
1359 continue;
1360
1361 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1362 if (r < 0)
1363 return r;
1364
1365 if (allow_list) {
1366 int af, first = 0, last = 0;
1367 void *afp;
1368
1369 /* If this is an allow list, we first block the address families that are out of
1370 * range and then everything that is not in the set. First, we find the lowest and
1371 * highest address family in the set. */
1372
1373 SET_FOREACH(afp, address_families, i) {
1374 af = PTR_TO_INT(afp);
1375
1376 if (af <= 0 || af >= af_max())
1377 continue;
1378
1379 if (first == 0 || af < first)
1380 first = af;
1381
1382 if (last == 0 || af > last)
1383 last = af;
1384 }
1385
1386 assert((first == 0) == (last == 0));
1387
1388 if (first == 0) {
1389
1390 /* No entries in the valid range, block everything */
1391 r = seccomp_rule_add_exact(
1392 seccomp,
1393 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1394 SCMP_SYS(socket),
1395 0);
1396 if (r < 0) {
1397 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 continue;
1399 }
1400
1401 } else {
1402
1403 /* Block everything below the first entry */
1404 r = seccomp_rule_add_exact(
1405 seccomp,
1406 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1407 SCMP_SYS(socket),
1408 1,
1409 SCMP_A0(SCMP_CMP_LT, first));
1410 if (r < 0) {
1411 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1412 continue;
1413 }
1414
1415 /* Block everything above the last entry */
1416 r = seccomp_rule_add_exact(
1417 seccomp,
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1419 SCMP_SYS(socket),
1420 1,
1421 SCMP_A0(SCMP_CMP_GT, last));
1422 if (r < 0) {
1423 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 continue;
1425 }
1426
1427 /* Block everything between the first and last entry */
1428 for (af = 1; af < af_max(); af++) {
1429
1430 if (set_contains(address_families, INT_TO_PTR(af)))
1431 continue;
1432
1433 r = seccomp_rule_add_exact(
1434 seccomp,
1435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1436 SCMP_SYS(socket),
1437 1,
1438 SCMP_A0(SCMP_CMP_EQ, af));
1439 if (r < 0)
1440 break;
1441 }
1442 if (r < 0) {
1443 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1444 continue;
1445 }
1446 }
1447
1448 } else {
1449 void *af;
1450
1451 /* If this is a deny list, then generate one rule for each address family that are
1452 * then combined in OR checks. */
1453
1454 SET_FOREACH(af, address_families, i) {
1455
1456 r = seccomp_rule_add_exact(
1457 seccomp,
1458 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1459 SCMP_SYS(socket),
1460 1,
1461 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1462 if (r < 0)
1463 break;
1464 }
1465 if (r < 0) {
1466 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1467 continue;
1468 }
1469 }
1470
1471 r = seccomp_load(seccomp);
1472 if (ERRNO_IS_SECCOMP_FATAL(r))
1473 return r;
1474 if (r < 0)
1475 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1476 }
1477
1478 return 0;
1479 }
1480
1481 int seccomp_restrict_realtime(void) {
1482 static const int permitted_policies[] = {
1483 SCHED_OTHER,
1484 SCHED_BATCH,
1485 SCHED_IDLE,
1486 };
1487
1488 int r, max_policy = 0;
1489 uint32_t arch;
1490 unsigned i;
1491
1492 /* Determine the highest policy constant we want to allow */
1493 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1494 if (permitted_policies[i] > max_policy)
1495 max_policy = permitted_policies[i];
1496
1497 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1498 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1499 int p;
1500
1501 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1502
1503 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1504 if (r < 0)
1505 return r;
1506
1507 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1508 * allow list. */
1509 for (p = 0; p < max_policy; p++) {
1510 bool good = false;
1511
1512 /* Check if this is in the allow list. */
1513 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1514 if (permitted_policies[i] == p) {
1515 good = true;
1516 break;
1517 }
1518
1519 if (good)
1520 continue;
1521
1522 /* Deny this policy */
1523 r = seccomp_rule_add_exact(
1524 seccomp,
1525 SCMP_ACT_ERRNO(EPERM),
1526 SCMP_SYS(sched_setscheduler),
1527 1,
1528 SCMP_A1(SCMP_CMP_EQ, p));
1529 if (r < 0) {
1530 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1531 continue;
1532 }
1533 }
1534
1535 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1536 * are unsigned here, hence no need no check for < 0 values. */
1537 r = seccomp_rule_add_exact(
1538 seccomp,
1539 SCMP_ACT_ERRNO(EPERM),
1540 SCMP_SYS(sched_setscheduler),
1541 1,
1542 SCMP_A1(SCMP_CMP_GT, max_policy));
1543 if (r < 0) {
1544 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1545 continue;
1546 }
1547
1548 r = seccomp_load(seccomp);
1549 if (ERRNO_IS_SECCOMP_FATAL(r))
1550 return r;
1551 if (r < 0)
1552 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1553 }
1554
1555 return 0;
1556 }
1557
1558 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1559 uint32_t arch,
1560 int nr,
1561 unsigned arg_cnt,
1562 const struct scmp_arg_cmp arg) {
1563 int r;
1564
1565 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1566 if (r < 0) {
1567 _cleanup_free_ char *n = NULL;
1568
1569 n = seccomp_syscall_resolve_num_arch(arch, nr);
1570 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1571 strna(n),
1572 seccomp_arch_to_string(arch));
1573 }
1574
1575 return r;
1576 }
1577
/* On the architectures listed here, assert that SCMP_SYS() yields a positive number for shmget(),
 * shmat() and shmdt(). seccomp_memory_deny_write_execute() below only filters shmat() if its number is
 * positive. */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
1584
1585 int seccomp_memory_deny_write_execute(void) {
1586 uint32_t arch;
1587 unsigned loaded = 0;
1588
1589 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1590 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1591 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1592
1593 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1594
1595 switch (arch) {
1596
1597 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1598 * We ignore that here, which means there's still a way to get writable/executable
1599 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1600
1601 case SCMP_ARCH_X86:
1602 case SCMP_ARCH_S390:
1603 filter_syscall = SCMP_SYS(mmap2);
1604 block_syscall = SCMP_SYS(mmap);
1605 /* shmat multiplexed, see above */
1606 break;
1607
1608 case SCMP_ARCH_PPC:
1609 case SCMP_ARCH_PPC64:
1610 case SCMP_ARCH_PPC64LE:
1611 case SCMP_ARCH_S390X:
1612 filter_syscall = SCMP_SYS(mmap);
1613 /* shmat multiplexed, see above */
1614 break;
1615
1616 case SCMP_ARCH_ARM:
1617 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1618 shmat_syscall = SCMP_SYS(shmat);
1619 break;
1620
1621 case SCMP_ARCH_X86_64:
1622 case SCMP_ARCH_X32:
1623 case SCMP_ARCH_AARCH64:
1624 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
1625 shmat_syscall = SCMP_SYS(shmat);
1626 break;
1627
1628 /* Please add more definitions here, if you port systemd to other architectures! */
1629
1630 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
1631 #warning "Consider adding the right mmap() syscall definitions here!"
1632 #endif
1633 }
1634
1635 /* Can't filter mmap() on this arch, then skip it */
1636 if (filter_syscall == 0)
1637 continue;
1638
1639 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1640 if (r < 0)
1641 return r;
1642
1643 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1644 1,
1645 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1646 if (r < 0)
1647 continue;
1648
1649 if (block_syscall != 0) {
1650 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1651 if (r < 0)
1652 continue;
1653 }
1654
1655 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1656 1,
1657 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1658 if (r < 0)
1659 continue;
1660
1661 #ifdef __NR_pkey_mprotect
1662 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1663 1,
1664 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1665 if (r < 0)
1666 continue;
1667 #endif
1668
1669 if (shmat_syscall > 0) {
1670 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1671 1,
1672 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1673 if (r < 0)
1674 continue;
1675 }
1676
1677 r = seccomp_load(seccomp);
1678 if (ERRNO_IS_SECCOMP_FATAL(r))
1679 return r;
1680 if (r < 0)
1681 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1682 seccomp_arch_to_string(arch));
1683 loaded++;
1684 }
1685
1686 if (loaded == 0)
1687 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1688
1689 return loaded;
1690 }
1691
1692 int seccomp_restrict_archs(Set *archs) {
1693 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1694 Iterator i;
1695 void *id;
1696 int r;
1697
1698 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1699 * list.
1700 *
1701 * There are some qualifications. However the most important use is to stop processes from bypassing
1702 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1703 * in a non-native architecture. There are no holes in this use case, at least so far. */
1704
1705 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1706 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1707 * to run a program with the restrictions applied. */
1708 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1709 if (!seccomp)
1710 return -ENOMEM;
1711
1712 SET_FOREACH(id, archs, i) {
1713 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1714 if (r < 0 && r != -EEXIST)
1715 return r;
1716 }
1717
1718 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1719 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1720 * The important thing is that you can block the old 32-bit x86 syscalls.
1721 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1722
1723 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1724 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1725
1726 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1727 if (r < 0 && r != -EEXIST)
1728 return r;
1729 }
1730
1731 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1732 if (r < 0)
1733 return r;
1734
1735 r = seccomp_load(seccomp);
1736 if (ERRNO_IS_SECCOMP_FATAL(r))
1737 return r;
1738 if (r < 0)
1739 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1740
1741 return 0;
1742 }
1743
1744 int parse_syscall_archs(char **l, Set **ret_archs) {
1745 _cleanup_set_free_ Set *archs = NULL;
1746 char **s;
1747 int r;
1748
1749 assert(l);
1750 assert(ret_archs);
1751
1752 STRV_FOREACH(s, l) {
1753 uint32_t a;
1754
1755 r = seccomp_arch_from_string(*s, &a);
1756 if (r < 0)
1757 return -EINVAL;
1758
1759 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1760 if (r < 0)
1761 return -ENOMEM;
1762 }
1763
1764 *ret_archs = TAKE_PTR(archs);
1765 return 0;
1766 }
1767
1768 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1769 const char *i;
1770 int r;
1771
1772 assert(set);
1773
1774 NULSTR_FOREACH(i, set->value) {
1775
1776 if (i[0] == '@') {
1777 const SyscallFilterSet *more;
1778
1779 more = syscall_filter_set_find(i);
1780 if (!more)
1781 return -ENXIO;
1782
1783 r = seccomp_filter_set_add(filter, add, more);
1784 if (r < 0)
1785 return r;
1786 } else {
1787 int id;
1788
1789 id = seccomp_syscall_resolve_name(i);
1790 if (id == __NR_SCMP_ERROR) {
1791 log_debug("Couldn't resolve system call, ignoring: %s", i);
1792 continue;
1793 }
1794
1795 if (add) {
1796 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1797 if (r < 0)
1798 return r;
1799 } else
1800 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1801 }
1802 }
1803
1804 return 0;
1805 }
1806
1807 int seccomp_lock_personality(unsigned long personality) {
1808 uint32_t arch;
1809 int r;
1810
1811 if (personality >= PERSONALITY_INVALID)
1812 return -EINVAL;
1813
1814 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1815 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1816
1817 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1818 if (r < 0)
1819 return r;
1820
1821 r = seccomp_rule_add_exact(
1822 seccomp,
1823 SCMP_ACT_ERRNO(EPERM),
1824 SCMP_SYS(personality),
1825 1,
1826 SCMP_A0(SCMP_CMP_NE, personality));
1827 if (r < 0) {
1828 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1829 continue;
1830 }
1831
1832 r = seccomp_load(seccomp);
1833 if (ERRNO_IS_SECCOMP_FATAL(r))
1834 return r;
1835 if (r < 0)
1836 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1837 }
1838
1839 return 0;
1840 }
1841
1842 int seccomp_protect_hostname(void) {
1843 uint32_t arch;
1844 int r;
1845
1846 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1847 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1848
1849 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1850 if (r < 0)
1851 return r;
1852
1853 r = seccomp_rule_add_exact(
1854 seccomp,
1855 SCMP_ACT_ERRNO(EPERM),
1856 SCMP_SYS(sethostname),
1857 0);
1858 if (r < 0) {
1859 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1860 continue;
1861 }
1862
1863 r = seccomp_rule_add_exact(
1864 seccomp,
1865 SCMP_ACT_ERRNO(EPERM),
1866 SCMP_SYS(setdomainname),
1867 0);
1868 if (r < 0) {
1869 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1870 continue;
1871 }
1872
1873 r = seccomp_load(seccomp);
1874 if (ERRNO_IS_SECCOMP_FATAL(r))
1875 return r;
1876 if (r < 0)
1877 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1878 }
1879
1880 return 0;
1881 }
1882
1883 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1884 /* Checks the mode_t parameter of the following system calls:
1885 *
1886 * → chmod() + fchmod() + fchmodat()
1887 * → open() + creat() + openat()
1888 * → mkdir() + mkdirat()
1889 * → mknod() + mknodat()
1890 *
1891 * Returns error if *everything* failed, and 0 otherwise.
1892 */
1893 int r = 0;
1894 bool any = false;
1895
1896 r = seccomp_rule_add_exact(
1897 seccomp,
1898 SCMP_ACT_ERRNO(EPERM),
1899 SCMP_SYS(chmod),
1900 1,
1901 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1902 if (r < 0)
1903 log_debug_errno(r, "Failed to add filter for chmod: %m");
1904 else
1905 any = true;
1906
1907 r = seccomp_rule_add_exact(
1908 seccomp,
1909 SCMP_ACT_ERRNO(EPERM),
1910 SCMP_SYS(fchmod),
1911 1,
1912 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1913 if (r < 0)
1914 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1915 else
1916 any = true;
1917
1918 r = seccomp_rule_add_exact(
1919 seccomp,
1920 SCMP_ACT_ERRNO(EPERM),
1921 SCMP_SYS(fchmodat),
1922 1,
1923 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1924 if (r < 0)
1925 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1926 else
1927 any = true;
1928
1929 r = seccomp_rule_add_exact(
1930 seccomp,
1931 SCMP_ACT_ERRNO(EPERM),
1932 SCMP_SYS(mkdir),
1933 1,
1934 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1935 if (r < 0)
1936 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1937 else
1938 any = true;
1939
1940 r = seccomp_rule_add_exact(
1941 seccomp,
1942 SCMP_ACT_ERRNO(EPERM),
1943 SCMP_SYS(mkdirat),
1944 1,
1945 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1946 if (r < 0)
1947 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1948 else
1949 any = true;
1950
1951 r = seccomp_rule_add_exact(
1952 seccomp,
1953 SCMP_ACT_ERRNO(EPERM),
1954 SCMP_SYS(mknod),
1955 1,
1956 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1957 if (r < 0)
1958 log_debug_errno(r, "Failed to add filter for mknod: %m");
1959 else
1960 any = true;
1961
1962 r = seccomp_rule_add_exact(
1963 seccomp,
1964 SCMP_ACT_ERRNO(EPERM),
1965 SCMP_SYS(mknodat),
1966 1,
1967 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1968 if (r < 0)
1969 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1970 else
1971 any = true;
1972
1973 #if SCMP_SYS(open) > 0
1974 r = seccomp_rule_add_exact(
1975 seccomp,
1976 SCMP_ACT_ERRNO(EPERM),
1977 SCMP_SYS(open),
1978 2,
1979 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1980 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1981 if (r < 0)
1982 log_debug_errno(r, "Failed to add filter for open: %m");
1983 else
1984 any = true;
1985 #endif
1986
1987 r = seccomp_rule_add_exact(
1988 seccomp,
1989 SCMP_ACT_ERRNO(EPERM),
1990 SCMP_SYS(openat),
1991 2,
1992 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1993 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1994 if (r < 0)
1995 log_debug_errno(r, "Failed to add filter for openat: %m");
1996 else
1997 any = true;
1998
1999 #if defined(__SNR_openat2)
2000 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2001 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2002 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2003 * compatible with kernels that are not absolutely recent. */
2004 r = seccomp_rule_add_exact(
2005 seccomp,
2006 SCMP_ACT_ERRNO(EPERM),
2007 SCMP_SYS(openat2),
2008 0);
2009 if (r < 0)
2010 log_debug_errno(r, "Failed to add filter for openat2: %m");
2011 else
2012 any = true;
2013 #endif
2014
2015 r = seccomp_rule_add_exact(
2016 seccomp,
2017 SCMP_ACT_ERRNO(EPERM),
2018 SCMP_SYS(creat),
2019 1,
2020 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2021 if (r < 0)
2022 log_debug_errno(r, "Failed to add filter for creat: %m");
2023 else
2024 any = true;
2025
2026 return any ? 0 : r;
2027 }
2028
2029 int seccomp_restrict_suid_sgid(void) {
2030 uint32_t arch;
2031 int r, k;
2032
2033 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2034 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2035
2036 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2037 if (r < 0)
2038 return r;
2039
2040 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2041 if (r < 0)
2042 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2043
2044 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2045 if (k < 0)
2046 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2047
2048 if (r < 0 && k < 0)
2049 continue;
2050
2051 r = seccomp_load(seccomp);
2052 if (ERRNO_IS_SECCOMP_FATAL(r))
2053 return r;
2054 if (r < 0)
2055 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2056 }
2057
2058 return 0;
2059 }
2060
2061 uint32_t scmp_act_kill_process(void) {
2062
2063 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2064 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2065 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2066 * for single-threaded apps does the right thing. */
2067
2068 #ifdef SCMP_ACT_KILL_PROCESS
2069 if (seccomp_api_get() >= 3)
2070 return SCMP_ACT_KILL_PROCESS;
2071 #endif
2072
2073 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2074 }