]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
tree-wide: use set_ensure_put()
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "errno-list.h"
16 #include "macro.h"
17 #include "nsflags.h"
18 #include "nulstr-util.h"
19 #include "process-util.h"
20 #include "seccomp-util.h"
21 #include "set.h"
22 #include "string-util.h"
23 #include "strv.h"
24
25 const uint32_t seccomp_local_archs[] = {
26
27 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
28
29 #if defined(__x86_64__) && defined(__ILP32__)
30 SCMP_ARCH_X86,
31 SCMP_ARCH_X86_64,
32 SCMP_ARCH_X32, /* native */
33 #elif defined(__x86_64__) && !defined(__ILP32__)
34 SCMP_ARCH_X86,
35 SCMP_ARCH_X32,
36 SCMP_ARCH_X86_64, /* native */
37 #elif defined(__i386__)
38 SCMP_ARCH_X86,
39 #elif defined(__aarch64__)
40 SCMP_ARCH_ARM,
41 SCMP_ARCH_AARCH64, /* native */
42 #elif defined(__arm__)
43 SCMP_ARCH_ARM,
44 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
45 SCMP_ARCH_MIPSEL,
46 SCMP_ARCH_MIPS, /* native */
47 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
48 SCMP_ARCH_MIPS,
49 SCMP_ARCH_MIPSEL, /* native */
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS,
53 SCMP_ARCH_MIPSEL64N32,
54 SCMP_ARCH_MIPS64N32,
55 SCMP_ARCH_MIPSEL64,
56 SCMP_ARCH_MIPS64, /* native */
57 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL,
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64N32,
62 SCMP_ARCH_MIPS64,
63 SCMP_ARCH_MIPSEL64, /* native */
64 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS,
67 SCMP_ARCH_MIPSEL64,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64N32,
70 SCMP_ARCH_MIPS64N32, /* native */
71 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64,
76 SCMP_ARCH_MIPS64N32,
77 SCMP_ARCH_MIPSEL64N32, /* native */
78 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
79 SCMP_ARCH_PPC,
80 SCMP_ARCH_PPC64LE,
81 SCMP_ARCH_PPC64, /* native */
82 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
83 SCMP_ARCH_PPC,
84 SCMP_ARCH_PPC64,
85 SCMP_ARCH_PPC64LE, /* native */
86 #elif defined(__powerpc__)
87 SCMP_ARCH_PPC,
88 #elif defined(__s390x__)
89 SCMP_ARCH_S390,
90 SCMP_ARCH_S390X, /* native */
91 #elif defined(__s390__)
92 SCMP_ARCH_S390,
93 #endif
94 (uint32_t) -1
95 };
96
97 const char* seccomp_arch_to_string(uint32_t c) {
98 /* Maintain order used in <seccomp.h>.
99 *
100 * Names used here should be the same as those used for ConditionArchitecture=,
101 * except for "subarchitectures" like x32. */
102
103 switch(c) {
104 case SCMP_ARCH_NATIVE:
105 return "native";
106 case SCMP_ARCH_X86:
107 return "x86";
108 case SCMP_ARCH_X86_64:
109 return "x86-64";
110 case SCMP_ARCH_X32:
111 return "x32";
112 case SCMP_ARCH_ARM:
113 return "arm";
114 case SCMP_ARCH_AARCH64:
115 return "arm64";
116 case SCMP_ARCH_MIPS:
117 return "mips";
118 case SCMP_ARCH_MIPS64:
119 return "mips64";
120 case SCMP_ARCH_MIPS64N32:
121 return "mips64-n32";
122 case SCMP_ARCH_MIPSEL:
123 return "mips-le";
124 case SCMP_ARCH_MIPSEL64:
125 return "mips64-le";
126 case SCMP_ARCH_MIPSEL64N32:
127 return "mips64-le-n32";
128 case SCMP_ARCH_PPC:
129 return "ppc";
130 case SCMP_ARCH_PPC64:
131 return "ppc64";
132 case SCMP_ARCH_PPC64LE:
133 return "ppc64-le";
134 case SCMP_ARCH_S390:
135 return "s390";
136 case SCMP_ARCH_S390X:
137 return "s390x";
138 default:
139 return NULL;
140 }
141 }
142
143 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
144 if (!n)
145 return -EINVAL;
146
147 assert(ret);
148
149 if (streq(n, "native"))
150 *ret = SCMP_ARCH_NATIVE;
151 else if (streq(n, "x86"))
152 *ret = SCMP_ARCH_X86;
153 else if (streq(n, "x86-64"))
154 *ret = SCMP_ARCH_X86_64;
155 else if (streq(n, "x32"))
156 *ret = SCMP_ARCH_X32;
157 else if (streq(n, "arm"))
158 *ret = SCMP_ARCH_ARM;
159 else if (streq(n, "arm64"))
160 *ret = SCMP_ARCH_AARCH64;
161 else if (streq(n, "mips"))
162 *ret = SCMP_ARCH_MIPS;
163 else if (streq(n, "mips64"))
164 *ret = SCMP_ARCH_MIPS64;
165 else if (streq(n, "mips64-n32"))
166 *ret = SCMP_ARCH_MIPS64N32;
167 else if (streq(n, "mips-le"))
168 *ret = SCMP_ARCH_MIPSEL;
169 else if (streq(n, "mips64-le"))
170 *ret = SCMP_ARCH_MIPSEL64;
171 else if (streq(n, "mips64-le-n32"))
172 *ret = SCMP_ARCH_MIPSEL64N32;
173 else if (streq(n, "ppc"))
174 *ret = SCMP_ARCH_PPC;
175 else if (streq(n, "ppc64"))
176 *ret = SCMP_ARCH_PPC64;
177 else if (streq(n, "ppc64-le"))
178 *ret = SCMP_ARCH_PPC64LE;
179 else if (streq(n, "s390"))
180 *ret = SCMP_ARCH_S390;
181 else if (streq(n, "s390x"))
182 *ret = SCMP_ARCH_S390X;
183 else
184 return -EINVAL;
185
186 return 0;
187 }
188
189 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
190 scmp_filter_ctx seccomp;
191 int r;
192
193 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
194 * any others. Also, turns off the NNP fiddling. */
195
196 seccomp = seccomp_init(default_action);
197 if (!seccomp)
198 return -ENOMEM;
199
200 if (arch != SCMP_ARCH_NATIVE &&
201 arch != seccomp_arch_native()) {
202
203 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
204 if (r < 0)
205 goto finish;
206
207 r = seccomp_arch_add(seccomp, arch);
208 if (r < 0)
209 goto finish;
210
211 assert(seccomp_arch_exist(seccomp, arch) >= 0);
212 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
213 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
214 } else {
215 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
216 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
217 }
218
219 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
220 if (r < 0)
221 goto finish;
222
223 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
224 if (r < 0)
225 goto finish;
226
227 *ret = seccomp;
228 return 0;
229
230 finish:
231 seccomp_release(seccomp);
232 return r;
233 }
234
/* Reports whether prctl(PR_GET_SECCOMP) succeeds, i.e. returns a non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
238
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter. Only the specific outcome
 * "call failed with EFAULT" counts as available (presumably because a kernel that knows filter mode gets
 * as far as dereferencing the bad pointer); any other outcome counts as unavailable. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
243
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The answer is
 * computed on the first call and kept in a function-local static, so later calls return the stored
 * value. */
bool is_seccomp_available(void) {
        static int probe_result = -1; /* -1: not probed yet, otherwise 0 or 1 */

        if (probe_result >= 0)
                return probe_result;

        probe_result = is_basic_seccomp_available() && is_seccomp_filter_available();
        return probe_result;
}
254
255 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
256 [SYSCALL_FILTER_SET_DEFAULT] = {
257 .name = "@default",
258 .help = "System calls that are always permitted",
259 .value =
260 "clock_getres\0"
261 "clock_getres_time64\0"
262 "clock_gettime\0"
263 "clock_gettime64\0"
264 "clock_nanosleep\0"
265 "clock_nanosleep_time64\0"
266 "execve\0"
267 "exit\0"
268 "exit_group\0"
269 "futex\0"
270 "futex_time64\0"
271 "get_robust_list\0"
272 "get_thread_area\0"
273 "getegid\0"
274 "getegid32\0"
275 "geteuid\0"
276 "geteuid32\0"
277 "getgid\0"
278 "getgid32\0"
279 "getgroups\0"
280 "getgroups32\0"
281 "getpgid\0"
282 "getpgrp\0"
283 "getpid\0"
284 "getppid\0"
285 "getresgid\0"
286 "getresgid32\0"
287 "getresuid\0"
288 "getresuid32\0"
289 "getrlimit\0" /* make sure processes can query stack size and such */
290 "getsid\0"
291 "gettid\0"
292 "gettimeofday\0"
293 "getuid\0"
294 "getuid32\0"
295 "membarrier\0"
296 "nanosleep\0"
297 "pause\0"
298 "prlimit64\0"
299 "restart_syscall\0"
300 "rseq\0"
301 "rt_sigreturn\0"
302 "sched_yield\0"
303 "set_robust_list\0"
304 "set_thread_area\0"
305 "set_tid_address\0"
306 "set_tls\0"
307 "sigreturn\0"
308 "time\0"
309 "ugetrlimit\0"
310 },
311 [SYSCALL_FILTER_SET_AIO] = {
312 .name = "@aio",
313 .help = "Asynchronous IO",
314 .value =
315 "io_cancel\0"
316 "io_destroy\0"
317 "io_getevents\0"
318 "io_pgetevents\0"
319 "io_pgetevents_time64\0"
320 "io_setup\0"
321 "io_submit\0"
322 "io_uring_enter\0"
323 "io_uring_register\0"
324 "io_uring_setup\0"
325 },
326 [SYSCALL_FILTER_SET_BASIC_IO] = {
327 .name = "@basic-io",
328 .help = "Basic IO",
329 .value =
330 "_llseek\0"
331 "close\0"
332 "dup\0"
333 "dup2\0"
334 "dup3\0"
335 "lseek\0"
336 "pread64\0"
337 "preadv\0"
338 "preadv2\0"
339 "pwrite64\0"
340 "pwritev\0"
341 "pwritev2\0"
342 "read\0"
343 "readv\0"
344 "write\0"
345 "writev\0"
346 },
347 [SYSCALL_FILTER_SET_CHOWN] = {
348 .name = "@chown",
349 .help = "Change ownership of files and directories",
350 .value =
351 "chown\0"
352 "chown32\0"
353 "fchown\0"
354 "fchown32\0"
355 "fchownat\0"
356 "lchown\0"
357 "lchown32\0"
358 },
359 [SYSCALL_FILTER_SET_CLOCK] = {
360 .name = "@clock",
361 .help = "Change the system time",
362 .value =
363 "adjtimex\0"
364 "clock_adjtime\0"
365 "clock_adjtime64\0"
366 "clock_settime\0"
367 "clock_settime64\0"
368 "settimeofday\0"
369 "stime\0"
370 },
371 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
372 .name = "@cpu-emulation",
373 .help = "System calls for CPU emulation functionality",
374 .value =
375 "modify_ldt\0"
376 "subpage_prot\0"
377 "switch_endian\0"
378 "vm86\0"
379 "vm86old\0"
380 },
381 [SYSCALL_FILTER_SET_DEBUG] = {
382 .name = "@debug",
383 .help = "Debugging, performance monitoring and tracing functionality",
384 .value =
385 "lookup_dcookie\0"
386 "perf_event_open\0"
387 "pidfd_getfd\0"
388 "ptrace\0"
389 "rtas\0"
390 #ifdef __NR_s390_runtime_instr
391 "s390_runtime_instr\0"
392 #endif
393 "sys_debug_setcontext\0"
394 },
395 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
396 .name = "@file-system",
397 .help = "File system operations",
398 .value =
399 "access\0"
400 "chdir\0"
401 "chmod\0"
402 "close\0"
403 "creat\0"
404 "faccessat\0"
405 "fallocate\0"
406 "fchdir\0"
407 "fchmod\0"
408 "fchmodat\0"
409 "fcntl\0"
410 "fcntl64\0"
411 "fgetxattr\0"
412 "flistxattr\0"
413 "fremovexattr\0"
414 "fsetxattr\0"
415 "fstat\0"
416 "fstat64\0"
417 "fstatat64\0"
418 "fstatfs\0"
419 "fstatfs64\0"
420 "ftruncate\0"
421 "ftruncate64\0"
422 "futimesat\0"
423 "getcwd\0"
424 "getdents\0"
425 "getdents64\0"
426 "getxattr\0"
427 "inotify_add_watch\0"
428 "inotify_init\0"
429 "inotify_init1\0"
430 "inotify_rm_watch\0"
431 "lgetxattr\0"
432 "link\0"
433 "linkat\0"
434 "listxattr\0"
435 "llistxattr\0"
436 "lremovexattr\0"
437 "lsetxattr\0"
438 "lstat\0"
439 "lstat64\0"
440 "mkdir\0"
441 "mkdirat\0"
442 "mknod\0"
443 "mknodat\0"
444 "mmap\0"
445 "mmap2\0"
446 "munmap\0"
447 "newfstatat\0"
448 "oldfstat\0"
449 "oldlstat\0"
450 "oldstat\0"
451 "open\0"
452 "openat\0"
453 "openat2\0"
454 "readlink\0"
455 "readlinkat\0"
456 "removexattr\0"
457 "rename\0"
458 "renameat\0"
459 "renameat2\0"
460 "rmdir\0"
461 "setxattr\0"
462 "stat\0"
463 "stat64\0"
464 "statfs\0"
465 "statfs64\0"
466 #ifdef __NR_statx
467 "statx\0"
468 #endif
469 "symlink\0"
470 "symlinkat\0"
471 "truncate\0"
472 "truncate64\0"
473 "unlink\0"
474 "unlinkat\0"
475 "utime\0"
476 "utimensat\0"
477 "utimensat_time64\0"
478 "utimes\0"
479 },
480 [SYSCALL_FILTER_SET_IO_EVENT] = {
481 .name = "@io-event",
482 .help = "Event loop system calls",
483 .value =
484 "_newselect\0"
485 "epoll_create\0"
486 "epoll_create1\0"
487 "epoll_ctl\0"
488 "epoll_ctl_old\0"
489 "epoll_pwait\0"
490 "epoll_wait\0"
491 "epoll_wait_old\0"
492 "eventfd\0"
493 "eventfd2\0"
494 "poll\0"
495 "ppoll\0"
496 "ppoll_time64\0"
497 "pselect6\0"
498 "pselect6_time64\0"
499 "select\0"
500 },
501 [SYSCALL_FILTER_SET_IPC] = {
502 .name = "@ipc",
503 .help = "SysV IPC, POSIX Message Queues or other IPC",
504 .value =
505 "ipc\0"
506 "memfd_create\0"
507 "mq_getsetattr\0"
508 "mq_notify\0"
509 "mq_open\0"
510 "mq_timedreceive\0"
511 "mq_timedreceive_time64\0"
512 "mq_timedsend\0"
513 "mq_timedsend_time64\0"
514 "mq_unlink\0"
515 "msgctl\0"
516 "msgget\0"
517 "msgrcv\0"
518 "msgsnd\0"
519 "pipe\0"
520 "pipe2\0"
521 "process_vm_readv\0"
522 "process_vm_writev\0"
523 "semctl\0"
524 "semget\0"
525 "semop\0"
526 "semtimedop\0"
527 "semtimedop_time64\0"
528 "shmat\0"
529 "shmctl\0"
530 "shmdt\0"
531 "shmget\0"
532 },
533 [SYSCALL_FILTER_SET_KEYRING] = {
534 .name = "@keyring",
535 .help = "Kernel keyring access",
536 .value =
537 "add_key\0"
538 "keyctl\0"
539 "request_key\0"
540 },
541 [SYSCALL_FILTER_SET_MEMLOCK] = {
542 .name = "@memlock",
543 .help = "Memory locking control",
544 .value =
545 "mlock\0"
546 "mlock2\0"
547 "mlockall\0"
548 "munlock\0"
549 "munlockall\0"
550 },
551 [SYSCALL_FILTER_SET_MODULE] = {
552 .name = "@module",
553 .help = "Loading and unloading of kernel modules",
554 .value =
555 "delete_module\0"
556 "finit_module\0"
557 "init_module\0"
558 },
559 [SYSCALL_FILTER_SET_MOUNT] = {
560 .name = "@mount",
561 .help = "Mounting and unmounting of file systems",
562 .value =
563 "chroot\0"
564 "fsconfig\0"
565 "fsmount\0"
566 "fsopen\0"
567 "fspick\0"
568 "mount\0"
569 "move_mount\0"
570 "open_tree\0"
571 "pivot_root\0"
572 "umount\0"
573 "umount2\0"
574 },
575 [SYSCALL_FILTER_SET_NETWORK_IO] = {
576 .name = "@network-io",
577 .help = "Network or Unix socket IO, should not be needed if not network facing",
578 .value =
579 "accept\0"
580 "accept4\0"
581 "bind\0"
582 "connect\0"
583 "getpeername\0"
584 "getsockname\0"
585 "getsockopt\0"
586 "listen\0"
587 "recv\0"
588 "recvfrom\0"
589 "recvmmsg\0"
590 "recvmmsg_time64\0"
591 "recvmsg\0"
592 "send\0"
593 "sendmmsg\0"
594 "sendmsg\0"
595 "sendto\0"
596 "setsockopt\0"
597 "shutdown\0"
598 "socket\0"
599 "socketcall\0"
600 "socketpair\0"
601 },
602 [SYSCALL_FILTER_SET_OBSOLETE] = {
603 /* some unknown even to libseccomp */
604 .name = "@obsolete",
605 .help = "Unusual, obsolete or unimplemented system calls",
606 .value =
607 "_sysctl\0"
608 "afs_syscall\0"
609 "bdflush\0"
610 "break\0"
611 "create_module\0"
612 "ftime\0"
613 "get_kernel_syms\0"
614 "getpmsg\0"
615 "gtty\0"
616 "idle\0"
617 "lock\0"
618 "mpx\0"
619 "prof\0"
620 "profil\0"
621 "putpmsg\0"
622 "query_module\0"
623 "security\0"
624 "sgetmask\0"
625 "ssetmask\0"
626 "stty\0"
627 "sysfs\0"
628 "tuxcall\0"
629 "ulimit\0"
630 "uselib\0"
631 "ustat\0"
632 "vserver\0"
633 },
634 [SYSCALL_FILTER_SET_PKEY] = {
635 .name = "@pkey",
636 .help = "System calls used for memory protection keys",
637 .value =
638 "pkey_alloc\0"
639 "pkey_free\0"
640 "pkey_mprotect\0"
641 },
642 [SYSCALL_FILTER_SET_PRIVILEGED] = {
643 .name = "@privileged",
644 .help = "All system calls which need super-user capabilities",
645 .value =
646 "@chown\0"
647 "@clock\0"
648 "@module\0"
649 "@raw-io\0"
650 "@reboot\0"
651 "@swap\0"
652 "_sysctl\0"
653 "acct\0"
654 "bpf\0"
655 "capset\0"
656 "chroot\0"
657 "fanotify_init\0"
658 "fanotify_mark\0"
659 "nfsservctl\0"
660 "open_by_handle_at\0"
661 "pivot_root\0"
662 "quotactl\0"
663 "setdomainname\0"
664 "setfsuid\0"
665 "setfsuid32\0"
666 "setgroups\0"
667 "setgroups32\0"
668 "sethostname\0"
669 "setresuid\0"
670 "setresuid32\0"
671 "setreuid\0"
672 "setreuid32\0"
673 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
674 "setuid32\0"
675 "vhangup\0"
676 },
677 [SYSCALL_FILTER_SET_PROCESS] = {
678 .name = "@process",
679 .help = "Process control, execution, namespaceing operations",
680 .value =
681 "arch_prctl\0"
682 "capget\0" /* Able to query arbitrary processes */
683 "clone\0"
684 "clone3\0"
685 "execveat\0"
686 "fork\0"
687 "getrusage\0"
688 "kill\0"
689 "pidfd_open\0"
690 "pidfd_send_signal\0"
691 "prctl\0"
692 "rt_sigqueueinfo\0"
693 "rt_tgsigqueueinfo\0"
694 "setns\0"
695 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
696 "tgkill\0"
697 "times\0"
698 "tkill\0"
699 "unshare\0"
700 "vfork\0"
701 "wait4\0"
702 "waitid\0"
703 "waitpid\0"
704 },
705 [SYSCALL_FILTER_SET_RAW_IO] = {
706 .name = "@raw-io",
707 .help = "Raw I/O port access",
708 .value =
709 "ioperm\0"
710 "iopl\0"
711 "pciconfig_iobase\0"
712 "pciconfig_read\0"
713 "pciconfig_write\0"
714 #ifdef __NR_s390_pci_mmio_read
715 "s390_pci_mmio_read\0"
716 #endif
717 #ifdef __NR_s390_pci_mmio_write
718 "s390_pci_mmio_write\0"
719 #endif
720 },
721 [SYSCALL_FILTER_SET_REBOOT] = {
722 .name = "@reboot",
723 .help = "Reboot and reboot preparation/kexec",
724 .value =
725 "kexec_file_load\0"
726 "kexec_load\0"
727 "reboot\0"
728 },
729 [SYSCALL_FILTER_SET_RESOURCES] = {
730 .name = "@resources",
731 .help = "Alter resource settings",
732 .value =
733 "ioprio_set\0"
734 "mbind\0"
735 "migrate_pages\0"
736 "move_pages\0"
737 "nice\0"
738 "sched_setaffinity\0"
739 "sched_setattr\0"
740 "sched_setparam\0"
741 "sched_setscheduler\0"
742 "set_mempolicy\0"
743 "setpriority\0"
744 "setrlimit\0"
745 },
746 [SYSCALL_FILTER_SET_SETUID] = {
747 .name = "@setuid",
748 .help = "Operations for changing user/group credentials",
749 .value =
750 "setgid\0"
751 "setgid32\0"
752 "setgroups\0"
753 "setgroups32\0"
754 "setregid\0"
755 "setregid32\0"
756 "setresgid\0"
757 "setresgid32\0"
758 "setresuid\0"
759 "setresuid32\0"
760 "setreuid\0"
761 "setreuid32\0"
762 "setuid\0"
763 "setuid32\0"
764 },
765 [SYSCALL_FILTER_SET_SIGNAL] = {
766 .name = "@signal",
767 .help = "Process signal handling",
768 .value =
769 "rt_sigaction\0"
770 "rt_sigpending\0"
771 "rt_sigprocmask\0"
772 "rt_sigsuspend\0"
773 "rt_sigtimedwait\0"
774 "rt_sigtimedwait_time64\0"
775 "sigaction\0"
776 "sigaltstack\0"
777 "signal\0"
778 "signalfd\0"
779 "signalfd4\0"
780 "sigpending\0"
781 "sigprocmask\0"
782 "sigsuspend\0"
783 },
784 [SYSCALL_FILTER_SET_SWAP] = {
785 .name = "@swap",
786 .help = "Enable/disable swap devices",
787 .value =
788 "swapoff\0"
789 "swapon\0"
790 },
791 [SYSCALL_FILTER_SET_SYNC] = {
792 .name = "@sync",
793 .help = "Synchronize files and memory to storage",
794 .value =
795 "fdatasync\0"
796 "fsync\0"
797 "msync\0"
798 "sync\0"
799 "sync_file_range\0"
800 "sync_file_range2\0"
801 "syncfs\0"
802 },
803 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
804 .name = "@system-service",
805 .help = "General system service operations",
806 .value =
807 "@aio\0"
808 "@basic-io\0"
809 "@chown\0"
810 "@default\0"
811 "@file-system\0"
812 "@io-event\0"
813 "@ipc\0"
814 "@keyring\0"
815 "@memlock\0"
816 "@network-io\0"
817 "@process\0"
818 "@resources\0"
819 "@setuid\0"
820 "@signal\0"
821 "@sync\0"
822 "@timer\0"
823 "brk\0"
824 "capget\0"
825 "capset\0"
826 "copy_file_range\0"
827 "fadvise64\0"
828 "fadvise64_64\0"
829 "flock\0"
830 "get_mempolicy\0"
831 "getcpu\0"
832 "getpriority\0"
833 "getrandom\0"
834 "ioctl\0"
835 "ioprio_get\0"
836 "kcmp\0"
837 "madvise\0"
838 "mprotect\0"
839 "mremap\0"
840 "name_to_handle_at\0"
841 "oldolduname\0"
842 "olduname\0"
843 "personality\0"
844 "readahead\0"
845 "readdir\0"
846 "remap_file_pages\0"
847 "sched_get_priority_max\0"
848 "sched_get_priority_min\0"
849 "sched_getaffinity\0"
850 "sched_getattr\0"
851 "sched_getparam\0"
852 "sched_getscheduler\0"
853 "sched_rr_get_interval\0"
854 "sched_rr_get_interval_time64\0"
855 "sched_yield\0"
856 "sendfile\0"
857 "sendfile64\0"
858 "setfsgid\0"
859 "setfsgid32\0"
860 "setfsuid\0"
861 "setfsuid32\0"
862 "setpgid\0"
863 "setsid\0"
864 "splice\0"
865 "sysinfo\0"
866 "tee\0"
867 "umask\0"
868 "uname\0"
869 "userfaultfd\0"
870 "vmsplice\0"
871 },
872 [SYSCALL_FILTER_SET_TIMER] = {
873 .name = "@timer",
874 .help = "Schedule operations by time",
875 .value =
876 "alarm\0"
877 "getitimer\0"
878 "setitimer\0"
879 "timer_create\0"
880 "timer_delete\0"
881 "timer_getoverrun\0"
882 "timer_gettime\0"
883 "timer_gettime64\0"
884 "timer_settime\0"
885 "timer_settime64\0"
886 "timerfd_create\0"
887 "timerfd_gettime\0"
888 "timerfd_gettime64\0"
889 "timerfd_settime\0"
890 "timerfd_settime64\0"
891 "times\0"
892 },
893 };
894
895 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
896 unsigned i;
897
898 if (isempty(name) || name[0] != '@')
899 return NULL;
900
901 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
902 if (streq(syscall_filter_sets[i].name, name))
903 return syscall_filter_sets + i;
904
905 return NULL;
906 }
907
908 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
909
910 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
911 assert(seccomp);
912 assert(name);
913
914 if (strv_contains(exclude, name))
915 return 0;
916
917 if (name[0] == '@') {
918 const SyscallFilterSet *other;
919
920 other = syscall_filter_set_find(name);
921 if (!other)
922 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
923 "Filter set %s is not known!",
924 name);
925
926 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
927
928 } else {
929 int id, r;
930
931 id = seccomp_syscall_resolve_name(name);
932 if (id == __NR_SCMP_ERROR) {
933 if (log_missing)
934 log_debug("System call %s is not known, ignoring.", name);
935 return 0;
936 }
937
938 r = seccomp_rule_add_exact(seccomp, action, id, 0);
939 if (r < 0) {
940 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
941 bool ignore = r == -EDOM;
942
943 if (!ignore || log_missing)
944 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
945 name, id, ignore ? ", ignoring" : "");
946 if (!ignore)
947 return r;
948 }
949
950 return 0;
951 }
952 }
953
954 static int seccomp_add_syscall_filter_set(
955 scmp_filter_ctx seccomp,
956 const SyscallFilterSet *set,
957 uint32_t action,
958 char **exclude,
959 bool log_missing) {
960
961 const char *sys;
962 int r;
963
964 assert(seccomp);
965 assert(set);
966
967 NULSTR_FOREACH(sys, set->value) {
968 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
969 if (r < 0)
970 return r;
971 }
972
973 return 0;
974 }
975
976 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
977 uint32_t arch;
978 int r;
979
980 assert(set);
981
982 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
983 * each local arch. */
984
985 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
986 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
987
988 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
989
990 r = seccomp_init_for_arch(&seccomp, arch, default_action);
991 if (r < 0)
992 return r;
993
994 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
995 if (r < 0)
996 return log_debug_errno(r, "Failed to add filter set: %m");
997
998 r = seccomp_load(seccomp);
999 if (ERRNO_IS_SECCOMP_FATAL(r))
1000 return r;
1001 if (r < 0)
1002 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1003 }
1004
1005 return 0;
1006 }
1007
1008 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1009 uint32_t arch;
1010 int r;
1011
1012 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1013 * SyscallFilterSet* table. */
1014
1015 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1016 return 0;
1017
1018 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1019 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1020 Iterator i;
1021 void *syscall_id, *val;
1022
1023 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1024
1025 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1026 if (r < 0)
1027 return r;
1028
1029 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
1030 uint32_t a = action;
1031 int id = PTR_TO_INT(syscall_id) - 1;
1032 int error = PTR_TO_INT(val);
1033
1034 if (action != SCMP_ACT_ALLOW && error >= 0)
1035 a = SCMP_ACT_ERRNO(error);
1036
1037 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1038 if (r < 0) {
1039 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1040 _cleanup_free_ char *n = NULL;
1041 bool ignore;
1042
1043 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1044 ignore = r == -EDOM;
1045 if (!ignore || log_missing)
1046 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1047 strna(n), id, ignore ? ", ignoring" : "");
1048 if (!ignore)
1049 return r;
1050 }
1051 }
1052
1053 r = seccomp_load(seccomp);
1054 if (ERRNO_IS_SECCOMP_FATAL(r))
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
1061 }
1062
1063 int seccomp_parse_syscall_filter(
1064 const char *name,
1065 int errno_num,
1066 Hashmap *filter,
1067 SeccompParseFlags flags,
1068 const char *unit,
1069 const char *filename,
1070 unsigned line) {
1071
1072 int r;
1073
1074 assert(name);
1075 assert(filter);
1076
1077 if (name[0] == '@') {
1078 const SyscallFilterSet *set;
1079 const char *i;
1080
1081 set = syscall_filter_set_find(name);
1082 if (!set) {
1083 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1084 return -EINVAL;
1085
1086 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1087 "Unknown system call group, ignoring: %s", name);
1088 return 0;
1089 }
1090
1091 NULSTR_FOREACH(i, set->value) {
1092 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1093 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1094 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1095 * about them. */
1096 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1097 if (r < 0)
1098 return r;
1099 }
1100 } else {
1101 int id;
1102
1103 id = seccomp_syscall_resolve_name(name);
1104 if (id == __NR_SCMP_ERROR) {
1105 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1106 return -EINVAL;
1107
1108 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1109 "Failed to parse system call, ignoring: %s", name);
1110 return 0;
1111 }
1112
1113 /* If we previously wanted to forbid a syscall and now
1114 * we want to allow it, then remove it from the list. */
1115 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1116 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1117 if (r < 0)
1118 switch (r) {
1119 case -ENOMEM:
1120 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1121 case -EEXIST:
1122 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1123 break;
1124 default:
1125 return r;
1126 }
1127 } else
1128 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1129 }
1130
1131 return 0;
1132 }
1133
1134 int seccomp_restrict_namespaces(unsigned long retain) {
1135 uint32_t arch;
1136 int r;
1137
1138 if (DEBUG_LOGGING) {
1139 _cleanup_free_ char *s = NULL;
1140
1141 (void) namespace_flags_to_string(retain, &s);
1142 log_debug("Restricting namespace to: %s.", strna(s));
1143 }
1144
1145 /* NOOP? */
1146 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1147 return 0;
1148
1149 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1150 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1151 unsigned i;
1152
1153 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1154
1155 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1156 if (r < 0)
1157 return r;
1158
1159 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1160 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1161 * altogether. */
1162 r = seccomp_rule_add_exact(
1163 seccomp,
1164 SCMP_ACT_ERRNO(EPERM),
1165 SCMP_SYS(setns),
1166 0);
1167 else
1168 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1169 * special invocation with a zero flags argument, right here. */
1170 r = seccomp_rule_add_exact(
1171 seccomp,
1172 SCMP_ACT_ERRNO(EPERM),
1173 SCMP_SYS(setns),
1174 1,
1175 SCMP_A1(SCMP_CMP_EQ, 0));
1176 if (r < 0) {
1177 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1178 continue;
1179 }
1180
1181 for (i = 0; namespace_flag_map[i].name; i++) {
1182 unsigned long f;
1183
1184 f = namespace_flag_map[i].flag;
1185 if ((retain & f) == f) {
1186 log_debug("Permitting %s.", namespace_flag_map[i].name);
1187 continue;
1188 }
1189
1190 log_debug("Blocking %s.", namespace_flag_map[i].name);
1191
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EPERM),
1195 SCMP_SYS(unshare),
1196 1,
1197 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 break;
1201 }
1202
1203 /* On s390/s390x the first two parameters to clone are switched */
1204 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1205 r = seccomp_rule_add_exact(
1206 seccomp,
1207 SCMP_ACT_ERRNO(EPERM),
1208 SCMP_SYS(clone),
1209 1,
1210 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1211 else
1212 r = seccomp_rule_add_exact(
1213 seccomp,
1214 SCMP_ACT_ERRNO(EPERM),
1215 SCMP_SYS(clone),
1216 1,
1217 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 break;
1221 }
1222
1223 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1224 r = seccomp_rule_add_exact(
1225 seccomp,
1226 SCMP_ACT_ERRNO(EPERM),
1227 SCMP_SYS(setns),
1228 1,
1229 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1230 if (r < 0) {
1231 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1232 break;
1233 }
1234 }
1235 }
1236 if (r < 0)
1237 continue;
1238
1239 r = seccomp_load(seccomp);
1240 if (ERRNO_IS_SECCOMP_FATAL(r))
1241 return r;
1242 if (r < 0)
1243 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 }
1245
1246 return 0;
1247 }
1248
1249 int seccomp_protect_sysctl(void) {
1250 uint32_t arch;
1251 int r;
1252
1253 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1254 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1255
1256 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1257
1258 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1259 /* No _sysctl syscall */
1260 continue;
1261
1262 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1263 if (r < 0)
1264 return r;
1265
1266 r = seccomp_rule_add_exact(
1267 seccomp,
1268 SCMP_ACT_ERRNO(EPERM),
1269 SCMP_SYS(_sysctl),
1270 0);
1271 if (r < 0) {
1272 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1273 continue;
1274 }
1275
1276 r = seccomp_load(seccomp);
1277 if (ERRNO_IS_SECCOMP_FATAL(r))
1278 return r;
1279 if (r < 0)
1280 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1281 }
1282
1283 return 0;
1284 }
1285
1286 int seccomp_protect_syslog(void) {
1287 uint32_t arch;
1288 int r;
1289
1290 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1291 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1292
1293 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1294 if (r < 0)
1295 return r;
1296
1297 r = seccomp_rule_add_exact(
1298 seccomp,
1299 SCMP_ACT_ERRNO(EPERM),
1300 SCMP_SYS(syslog),
1301 0);
1302
1303 if (r < 0) {
1304 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1305 continue;
1306 }
1307
1308 r = seccomp_load(seccomp);
1309 if (ERRNO_IS_SECCOMP_FATAL(r))
1310 return r;
1311 if (r < 0)
1312 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1313 }
1314
1315 return 0;
1316 }
1317
1318 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1319 uint32_t arch;
1320 int r;
1321
1322 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1323 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1324 bool supported;
1325 Iterator i;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
1329 switch (arch) {
1330
1331 case SCMP_ARCH_X86_64:
1332 case SCMP_ARCH_X32:
1333 case SCMP_ARCH_ARM:
1334 case SCMP_ARCH_AARCH64:
1335 case SCMP_ARCH_PPC:
1336 case SCMP_ARCH_PPC64:
1337 case SCMP_ARCH_PPC64LE:
1338 case SCMP_ARCH_MIPSEL64N32:
1339 case SCMP_ARCH_MIPS64N32:
1340 case SCMP_ARCH_MIPSEL64:
1341 case SCMP_ARCH_MIPS64:
1342 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1343 supported = true;
1344 break;
1345
1346 case SCMP_ARCH_S390:
1347 case SCMP_ARCH_S390X:
1348 case SCMP_ARCH_X86:
1349 case SCMP_ARCH_MIPSEL:
1350 case SCMP_ARCH_MIPS:
1351 default:
1352 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1353 * don't know */
1354 supported = false;
1355 break;
1356 }
1357
1358 if (!supported)
1359 continue;
1360
1361 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1362 if (r < 0)
1363 return r;
1364
1365 if (whitelist) {
1366 int af, first = 0, last = 0;
1367 void *afp;
1368
1369 /* If this is a whitelist, we first block the address families that are out of range and then
1370 * everything that is not in the set. First, we find the lowest and highest address family in
1371 * the set. */
1372
1373 SET_FOREACH(afp, address_families, i) {
1374 af = PTR_TO_INT(afp);
1375
1376 if (af <= 0 || af >= af_max())
1377 continue;
1378
1379 if (first == 0 || af < first)
1380 first = af;
1381
1382 if (last == 0 || af > last)
1383 last = af;
1384 }
1385
1386 assert((first == 0) == (last == 0));
1387
1388 if (first == 0) {
1389
1390 /* No entries in the valid range, block everything */
1391 r = seccomp_rule_add_exact(
1392 seccomp,
1393 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1394 SCMP_SYS(socket),
1395 0);
1396 if (r < 0) {
1397 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 continue;
1399 }
1400
1401 } else {
1402
1403 /* Block everything below the first entry */
1404 r = seccomp_rule_add_exact(
1405 seccomp,
1406 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1407 SCMP_SYS(socket),
1408 1,
1409 SCMP_A0(SCMP_CMP_LT, first));
1410 if (r < 0) {
1411 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1412 continue;
1413 }
1414
1415 /* Block everything above the last entry */
1416 r = seccomp_rule_add_exact(
1417 seccomp,
1418 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1419 SCMP_SYS(socket),
1420 1,
1421 SCMP_A0(SCMP_CMP_GT, last));
1422 if (r < 0) {
1423 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1424 continue;
1425 }
1426
1427 /* Block everything between the first and last entry */
1428 for (af = 1; af < af_max(); af++) {
1429
1430 if (set_contains(address_families, INT_TO_PTR(af)))
1431 continue;
1432
1433 r = seccomp_rule_add_exact(
1434 seccomp,
1435 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1436 SCMP_SYS(socket),
1437 1,
1438 SCMP_A0(SCMP_CMP_EQ, af));
1439 if (r < 0)
1440 break;
1441 }
1442 if (r < 0) {
1443 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1444 continue;
1445 }
1446 }
1447
1448 } else {
1449 void *af;
1450
1451 /* If this is a blacklist, then generate one rule for
1452 * each address family that are then combined in OR
1453 * checks. */
1454
1455 SET_FOREACH(af, address_families, i) {
1456
1457 r = seccomp_rule_add_exact(
1458 seccomp,
1459 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1460 SCMP_SYS(socket),
1461 1,
1462 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1463 if (r < 0)
1464 break;
1465 }
1466 if (r < 0) {
1467 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1468 continue;
1469 }
1470 }
1471
1472 r = seccomp_load(seccomp);
1473 if (ERRNO_IS_SECCOMP_FATAL(r))
1474 return r;
1475 if (r < 0)
1476 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 }
1478
1479 return 0;
1480 }
1481
1482 int seccomp_restrict_realtime(void) {
1483 static const int permitted_policies[] = {
1484 SCHED_OTHER,
1485 SCHED_BATCH,
1486 SCHED_IDLE,
1487 };
1488
1489 int r, max_policy = 0;
1490 uint32_t arch;
1491 unsigned i;
1492
1493 /* Determine the highest policy constant we want to allow */
1494 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1495 if (permitted_policies[i] > max_policy)
1496 max_policy = permitted_policies[i];
1497
1498 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1499 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1500 int p;
1501
1502 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1503
1504 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1505 if (r < 0)
1506 return r;
1507
1508 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1509 * whitelist. */
1510 for (p = 0; p < max_policy; p++) {
1511 bool good = false;
1512
1513 /* Check if this is in the whitelist. */
1514 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1515 if (permitted_policies[i] == p) {
1516 good = true;
1517 break;
1518 }
1519
1520 if (good)
1521 continue;
1522
1523 /* Deny this policy */
1524 r = seccomp_rule_add_exact(
1525 seccomp,
1526 SCMP_ACT_ERRNO(EPERM),
1527 SCMP_SYS(sched_setscheduler),
1528 1,
1529 SCMP_A1(SCMP_CMP_EQ, p));
1530 if (r < 0) {
1531 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1532 continue;
1533 }
1534 }
1535
1536 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1537 * unsigned here, hence no need no check for < 0 values. */
1538 r = seccomp_rule_add_exact(
1539 seccomp,
1540 SCMP_ACT_ERRNO(EPERM),
1541 SCMP_SYS(sched_setscheduler),
1542 1,
1543 SCMP_A1(SCMP_CMP_GT, max_policy));
1544 if (r < 0) {
1545 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1546 continue;
1547 }
1548
1549 r = seccomp_load(seccomp);
1550 if (ERRNO_IS_SECCOMP_FATAL(r))
1551 return r;
1552 if (r < 0)
1553 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1554 }
1555
1556 return 0;
1557 }
1558
1559 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1560 uint32_t arch,
1561 int nr,
1562 unsigned arg_cnt,
1563 const struct scmp_arg_cmp arg) {
1564 int r;
1565
1566 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1567 if (r < 0) {
1568 _cleanup_free_ char *n = NULL;
1569
1570 n = seccomp_syscall_resolve_num_arch(arch, nr);
1571 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1572 strna(n),
1573 seccomp_arch_to_string(arch));
1574 }
1575
1576 return r;
1577 }
1578
/* For known architectures, check that syscalls are indeed defined or not.
 *
 * This is a build-time check: when compiling for x86-64, arm or aarch64, the build breaks unless
 * SCMP_SYS() yields a positive number for each of shmget(), shmat() and shmdt(). */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
1585
1586 int seccomp_memory_deny_write_execute(void) {
1587 uint32_t arch;
1588 unsigned loaded = 0;
1589
1590 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1591 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1592 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1593
1594 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1595
1596 switch (arch) {
1597
1598 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1599 * We ignore that here, which means there's still a way to get writable/executable
1600 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1601
1602 case SCMP_ARCH_X86:
1603 case SCMP_ARCH_S390:
1604 filter_syscall = SCMP_SYS(mmap2);
1605 block_syscall = SCMP_SYS(mmap);
1606 /* shmat multiplexed, see above */
1607 break;
1608
1609 case SCMP_ARCH_PPC:
1610 case SCMP_ARCH_PPC64:
1611 case SCMP_ARCH_PPC64LE:
1612 case SCMP_ARCH_S390X:
1613 filter_syscall = SCMP_SYS(mmap);
1614 /* shmat multiplexed, see above */
1615 break;
1616
1617 case SCMP_ARCH_ARM:
1618 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1619 shmat_syscall = SCMP_SYS(shmat);
1620 break;
1621
1622 case SCMP_ARCH_X86_64:
1623 case SCMP_ARCH_X32:
1624 case SCMP_ARCH_AARCH64:
1625 filter_syscall = SCMP_SYS(mmap); /* amd64, x32 and arm64 have only mmap */
1626 shmat_syscall = SCMP_SYS(shmat);
1627 break;
1628
1629 /* Please add more definitions here, if you port systemd to other architectures! */
1630
1631 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__)
1632 #warning "Consider adding the right mmap() syscall definitions here!"
1633 #endif
1634 }
1635
1636 /* Can't filter mmap() on this arch, then skip it */
1637 if (filter_syscall == 0)
1638 continue;
1639
1640 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1641 if (r < 0)
1642 return r;
1643
1644 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1645 1,
1646 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1647 if (r < 0)
1648 continue;
1649
1650 if (block_syscall != 0) {
1651 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1652 if (r < 0)
1653 continue;
1654 }
1655
1656 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1657 1,
1658 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1659 if (r < 0)
1660 continue;
1661
1662 #ifdef __NR_pkey_mprotect
1663 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1664 1,
1665 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1666 if (r < 0)
1667 continue;
1668 #endif
1669
1670 if (shmat_syscall > 0) {
1671 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1672 1,
1673 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1674 if (r < 0)
1675 continue;
1676 }
1677
1678 r = seccomp_load(seccomp);
1679 if (ERRNO_IS_SECCOMP_FATAL(r))
1680 return r;
1681 if (r < 0)
1682 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1683 seccomp_arch_to_string(arch));
1684 loaded++;
1685 }
1686
1687 if (loaded == 0)
1688 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1689
1690 return loaded;
1691 }
1692
1693 int seccomp_restrict_archs(Set *archs) {
1694 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1695 Iterator i;
1696 void *id;
1697 int r;
1698
1699 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1700 * list.
1701 *
1702 * There are some qualifications. However the most important use is to stop processes from bypassing
1703 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1704 * in a non-native architecture. There are no holes in this use case, at least so far. */
1705
1706 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1707 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1708 * to run a program with the restrictions applied. */
1709 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1710 if (!seccomp)
1711 return -ENOMEM;
1712
1713 SET_FOREACH(id, archs, i) {
1714 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1715 if (r < 0 && r != -EEXIST)
1716 return r;
1717 }
1718
1719 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1720 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1721 * The important thing is that you can block the old 32-bit x86 syscalls.
1722 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1723
1724 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1725 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1726
1727 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1728 if (r < 0 && r != -EEXIST)
1729 return r;
1730 }
1731
1732 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1733 if (r < 0)
1734 return r;
1735
1736 r = seccomp_load(seccomp);
1737 if (ERRNO_IS_SECCOMP_FATAL(r))
1738 return r;
1739 if (r < 0)
1740 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1741
1742 return 0;
1743 }
1744
1745 int parse_syscall_archs(char **l, Set **ret_archs) {
1746 _cleanup_set_free_ Set *archs = NULL;
1747 char **s;
1748 int r;
1749
1750 assert(l);
1751 assert(ret_archs);
1752
1753 STRV_FOREACH(s, l) {
1754 uint32_t a;
1755
1756 r = seccomp_arch_from_string(*s, &a);
1757 if (r < 0)
1758 return -EINVAL;
1759
1760 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1761 if (r < 0)
1762 return -ENOMEM;
1763 }
1764
1765 *ret_archs = TAKE_PTR(archs);
1766 return 0;
1767 }
1768
1769 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1770 const char *i;
1771 int r;
1772
1773 assert(set);
1774
1775 NULSTR_FOREACH(i, set->value) {
1776
1777 if (i[0] == '@') {
1778 const SyscallFilterSet *more;
1779
1780 more = syscall_filter_set_find(i);
1781 if (!more)
1782 return -ENXIO;
1783
1784 r = seccomp_filter_set_add(filter, add, more);
1785 if (r < 0)
1786 return r;
1787 } else {
1788 int id;
1789
1790 id = seccomp_syscall_resolve_name(i);
1791 if (id == __NR_SCMP_ERROR) {
1792 log_debug("Couldn't resolve system call, ignoring: %s", i);
1793 continue;
1794 }
1795
1796 if (add) {
1797 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1798 if (r < 0)
1799 return r;
1800 } else
1801 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1802 }
1803 }
1804
1805 return 0;
1806 }
1807
1808 int seccomp_lock_personality(unsigned long personality) {
1809 uint32_t arch;
1810 int r;
1811
1812 if (personality >= PERSONALITY_INVALID)
1813 return -EINVAL;
1814
1815 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1816 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1817
1818 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1819 if (r < 0)
1820 return r;
1821
1822 r = seccomp_rule_add_exact(
1823 seccomp,
1824 SCMP_ACT_ERRNO(EPERM),
1825 SCMP_SYS(personality),
1826 1,
1827 SCMP_A0(SCMP_CMP_NE, personality));
1828 if (r < 0) {
1829 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1830 continue;
1831 }
1832
1833 r = seccomp_load(seccomp);
1834 if (ERRNO_IS_SECCOMP_FATAL(r))
1835 return r;
1836 if (r < 0)
1837 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1838 }
1839
1840 return 0;
1841 }
1842
1843 int seccomp_protect_hostname(void) {
1844 uint32_t arch;
1845 int r;
1846
1847 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1848 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1849
1850 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1851 if (r < 0)
1852 return r;
1853
1854 r = seccomp_rule_add_exact(
1855 seccomp,
1856 SCMP_ACT_ERRNO(EPERM),
1857 SCMP_SYS(sethostname),
1858 0);
1859 if (r < 0) {
1860 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1861 continue;
1862 }
1863
1864 r = seccomp_rule_add_exact(
1865 seccomp,
1866 SCMP_ACT_ERRNO(EPERM),
1867 SCMP_SYS(setdomainname),
1868 0);
1869 if (r < 0) {
1870 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1871 continue;
1872 }
1873
1874 r = seccomp_load(seccomp);
1875 if (ERRNO_IS_SECCOMP_FATAL(r))
1876 return r;
1877 if (r < 0)
1878 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1879 }
1880
1881 return 0;
1882 }
1883
1884 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1885 /* Checks the mode_t parameter of the following system calls:
1886 *
1887 * → chmod() + fchmod() + fchmodat()
1888 * → open() + creat() + openat()
1889 * → mkdir() + mkdirat()
1890 * → mknod() + mknodat()
1891 *
1892 * Returns error if *everything* failed, and 0 otherwise.
1893 */
1894 int r = 0;
1895 bool any = false;
1896
1897 r = seccomp_rule_add_exact(
1898 seccomp,
1899 SCMP_ACT_ERRNO(EPERM),
1900 SCMP_SYS(chmod),
1901 1,
1902 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1903 if (r < 0)
1904 log_debug_errno(r, "Failed to add filter for chmod: %m");
1905 else
1906 any = true;
1907
1908 r = seccomp_rule_add_exact(
1909 seccomp,
1910 SCMP_ACT_ERRNO(EPERM),
1911 SCMP_SYS(fchmod),
1912 1,
1913 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1914 if (r < 0)
1915 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1916 else
1917 any = true;
1918
1919 r = seccomp_rule_add_exact(
1920 seccomp,
1921 SCMP_ACT_ERRNO(EPERM),
1922 SCMP_SYS(fchmodat),
1923 1,
1924 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1925 if (r < 0)
1926 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1927 else
1928 any = true;
1929
1930 r = seccomp_rule_add_exact(
1931 seccomp,
1932 SCMP_ACT_ERRNO(EPERM),
1933 SCMP_SYS(mkdir),
1934 1,
1935 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1936 if (r < 0)
1937 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1938 else
1939 any = true;
1940
1941 r = seccomp_rule_add_exact(
1942 seccomp,
1943 SCMP_ACT_ERRNO(EPERM),
1944 SCMP_SYS(mkdirat),
1945 1,
1946 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1947 if (r < 0)
1948 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1949 else
1950 any = true;
1951
1952 r = seccomp_rule_add_exact(
1953 seccomp,
1954 SCMP_ACT_ERRNO(EPERM),
1955 SCMP_SYS(mknod),
1956 1,
1957 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1958 if (r < 0)
1959 log_debug_errno(r, "Failed to add filter for mknod: %m");
1960 else
1961 any = true;
1962
1963 r = seccomp_rule_add_exact(
1964 seccomp,
1965 SCMP_ACT_ERRNO(EPERM),
1966 SCMP_SYS(mknodat),
1967 1,
1968 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1969 if (r < 0)
1970 log_debug_errno(r, "Failed to add filter for mknodat: %m");
1971 else
1972 any = true;
1973
1974 #if SCMP_SYS(open) > 0
1975 r = seccomp_rule_add_exact(
1976 seccomp,
1977 SCMP_ACT_ERRNO(EPERM),
1978 SCMP_SYS(open),
1979 2,
1980 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1981 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1982 if (r < 0)
1983 log_debug_errno(r, "Failed to add filter for open: %m");
1984 else
1985 any = true;
1986 #endif
1987
1988 r = seccomp_rule_add_exact(
1989 seccomp,
1990 SCMP_ACT_ERRNO(EPERM),
1991 SCMP_SYS(openat),
1992 2,
1993 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
1994 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
1995 if (r < 0)
1996 log_debug_errno(r, "Failed to add filter for openat: %m");
1997 else
1998 any = true;
1999
2000 #if defined(__SNR_openat2)
2001 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2002 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2003 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2004 * compatible with kernels that are not absolutely recent. */
2005 r = seccomp_rule_add_exact(
2006 seccomp,
2007 SCMP_ACT_ERRNO(EPERM),
2008 SCMP_SYS(openat2),
2009 0);
2010 if (r < 0)
2011 log_debug_errno(r, "Failed to add filter for openat2: %m");
2012 else
2013 any = true;
2014 #endif
2015
2016 r = seccomp_rule_add_exact(
2017 seccomp,
2018 SCMP_ACT_ERRNO(EPERM),
2019 SCMP_SYS(creat),
2020 1,
2021 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2022 if (r < 0)
2023 log_debug_errno(r, "Failed to add filter for creat: %m");
2024 else
2025 any = true;
2026
2027 return any ? 0 : r;
2028 }
2029
2030 int seccomp_restrict_suid_sgid(void) {
2031 uint32_t arch;
2032 int r, k;
2033
2034 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2035 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2036
2037 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2038 if (r < 0)
2039 return r;
2040
2041 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2042 if (r < 0)
2043 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2044
2045 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2046 if (k < 0)
2047 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2048
2049 if (r < 0 && k < 0)
2050 continue;
2051
2052 r = seccomp_load(seccomp);
2053 if (ERRNO_IS_SECCOMP_FATAL(r))
2054 return r;
2055 if (r < 0)
2056 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2057 }
2058
2059 return 0;
2060 }
2061
2062 uint32_t scmp_act_kill_process(void) {
2063
2064 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2065 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2066 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2067 * for single-threaded apps does the right thing. */
2068
2069 #ifdef SCMP_ACT_KILL_PROCESS
2070 if (seccomp_api_get() >= 3)
2071 return SCMP_ACT_KILL_PROCESS;
2072 #endif
2073
2074 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2075 }