]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
exec: Add kill action to system call filters
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "env-util.h"
16 #include "errno-list.h"
17 #include "macro.h"
18 #include "nsflags.h"
19 #include "nulstr-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
22 #include "set.h"
23 #include "string-util.h"
24 #include "strv.h"
25
/* Compile-time list of seccomp architecture constants relevant to the ABI this file is built for. Exactly
 * one preprocessor branch below contributes entries (none, if the build target matches no branch), and the
 * list is always terminated by the (uint32_t) -1 sentinel.
 *
 * NOTE(review): presumably this is the array walked by SECCOMP_FOREACH_LOCAL_ARCH() — confirm in
 * seccomp-util.h. */
const uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
        SCMP_ARCH_X86,
        SCMP_ARCH_X86_64,
        SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
        SCMP_ARCH_X86,
        SCMP_ARCH_X32,
        SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
        SCMP_ARCH_X86,
#elif defined(__aarch64__)
        SCMP_ARCH_ARM,
        SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
        SCMP_ARCH_ARM,
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
        SCMP_ARCH_MIPSEL,
        SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
        SCMP_ARCH_MIPS,
        SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
        SCMP_ARCH_MIPSEL,
        SCMP_ARCH_MIPS,
        SCMP_ARCH_MIPSEL64N32,
        SCMP_ARCH_MIPS64N32,
        SCMP_ARCH_MIPSEL64,
        SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
        SCMP_ARCH_MIPS,
        SCMP_ARCH_MIPSEL,
        SCMP_ARCH_MIPS64N32,
        SCMP_ARCH_MIPSEL64N32,
        SCMP_ARCH_MIPS64,
        SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
        SCMP_ARCH_MIPSEL,
        SCMP_ARCH_MIPS,
        SCMP_ARCH_MIPSEL64,
        SCMP_ARCH_MIPS64,
        SCMP_ARCH_MIPSEL64N32,
        SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
        SCMP_ARCH_MIPS,
        SCMP_ARCH_MIPSEL,
        SCMP_ARCH_MIPS64,
        SCMP_ARCH_MIPSEL64,
        SCMP_ARCH_MIPS64N32,
        SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
        SCMP_ARCH_PPC,
        SCMP_ARCH_PPC64LE,
        SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
        SCMP_ARCH_PPC,
        SCMP_ARCH_PPC64,
        SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
        SCMP_ARCH_PPC,
#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
        SCMP_ARCH_RISCV64,
#elif defined(__s390x__)
        SCMP_ARCH_S390,
        SCMP_ARCH_S390X,       /* native */
#elif defined(__s390__)
        SCMP_ARCH_S390,
#endif
        /* End-of-list sentinel; also the only entry when no branch above matched. */
        (uint32_t) -1
};
99
100 const char* seccomp_arch_to_string(uint32_t c) {
101 /* Maintain order used in <seccomp.h>.
102 *
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
105
106 switch(c) {
107 case SCMP_ARCH_NATIVE:
108 return "native";
109 case SCMP_ARCH_X86:
110 return "x86";
111 case SCMP_ARCH_X86_64:
112 return "x86-64";
113 case SCMP_ARCH_X32:
114 return "x32";
115 case SCMP_ARCH_ARM:
116 return "arm";
117 case SCMP_ARCH_AARCH64:
118 return "arm64";
119 case SCMP_ARCH_MIPS:
120 return "mips";
121 case SCMP_ARCH_MIPS64:
122 return "mips64";
123 case SCMP_ARCH_MIPS64N32:
124 return "mips64-n32";
125 case SCMP_ARCH_MIPSEL:
126 return "mips-le";
127 case SCMP_ARCH_MIPSEL64:
128 return "mips64-le";
129 case SCMP_ARCH_MIPSEL64N32:
130 return "mips64-le-n32";
131 case SCMP_ARCH_PPC:
132 return "ppc";
133 case SCMP_ARCH_PPC64:
134 return "ppc64";
135 case SCMP_ARCH_PPC64LE:
136 return "ppc64-le";
137 #ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64:
139 return "riscv64";
140 #endif
141 case SCMP_ARCH_S390:
142 return "s390";
143 case SCMP_ARCH_S390X:
144 return "s390x";
145 default:
146 return NULL;
147 }
148 }
149
150 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
151 if (!n)
152 return -EINVAL;
153
154 assert(ret);
155
156 if (streq(n, "native"))
157 *ret = SCMP_ARCH_NATIVE;
158 else if (streq(n, "x86"))
159 *ret = SCMP_ARCH_X86;
160 else if (streq(n, "x86-64"))
161 *ret = SCMP_ARCH_X86_64;
162 else if (streq(n, "x32"))
163 *ret = SCMP_ARCH_X32;
164 else if (streq(n, "arm"))
165 *ret = SCMP_ARCH_ARM;
166 else if (streq(n, "arm64"))
167 *ret = SCMP_ARCH_AARCH64;
168 else if (streq(n, "mips"))
169 *ret = SCMP_ARCH_MIPS;
170 else if (streq(n, "mips64"))
171 *ret = SCMP_ARCH_MIPS64;
172 else if (streq(n, "mips64-n32"))
173 *ret = SCMP_ARCH_MIPS64N32;
174 else if (streq(n, "mips-le"))
175 *ret = SCMP_ARCH_MIPSEL;
176 else if (streq(n, "mips64-le"))
177 *ret = SCMP_ARCH_MIPSEL64;
178 else if (streq(n, "mips64-le-n32"))
179 *ret = SCMP_ARCH_MIPSEL64N32;
180 else if (streq(n, "ppc"))
181 *ret = SCMP_ARCH_PPC;
182 else if (streq(n, "ppc64"))
183 *ret = SCMP_ARCH_PPC64;
184 else if (streq(n, "ppc64-le"))
185 *ret = SCMP_ARCH_PPC64LE;
186 #ifdef SCMP_ARCH_RISCV64
187 else if (streq(n, "riscv64"))
188 *ret = SCMP_ARCH_RISCV64;
189 #endif
190 else if (streq(n, "s390"))
191 *ret = SCMP_ARCH_S390;
192 else if (streq(n, "s390x"))
193 *ret = SCMP_ARCH_S390X;
194 else
195 return -EINVAL;
196
197 return 0;
198 }
199
200 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
201 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
202 int r;
203
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
206
207 seccomp = seccomp_init(default_action);
208 if (!seccomp)
209 return -ENOMEM;
210
211 if (arch != SCMP_ARCH_NATIVE &&
212 arch != seccomp_arch_native()) {
213
214 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
215 if (r < 0)
216 return r;
217
218 r = seccomp_arch_add(seccomp, arch);
219 if (r < 0)
220 return r;
221
222 assert(seccomp_arch_exist(seccomp, arch) >= 0);
223 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
224 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
225 } else {
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
228 }
229
230 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
231 if (r < 0)
232 return r;
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
235 if (r < 0)
236 return r;
237
238 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
241 if (r < 0)
242 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
243 }
244 #endif
245
246 *ret = TAKE_PTR(seccomp);
247 return 0;
248 }
249
static bool is_basic_seccomp_available(void) {
        /* Probe with PR_GET_SECCOMP: any non-negative result counts as "basic seccomp is there". */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
static bool is_seccomp_filter_available(void) {
        /* Probe for filter mode by requesting it with a NULL filter program. Filter mode is reported
         * as available only if that request fails with EFAULT specifically.
         * NOTE(review): presumably EFAULT means the kernel accepted the mode and only tripped over the
         * NULL pointer — confirm against prctl(2). */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
bool is_seccomp_available(void) {
        /* Reports whether both the basic seccomp facility and seccomp filter mode are usable. The two
         * probes run on the first call only; the answer is remembered in a function-local static
         * (-1 = not determined yet). */
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
269
270 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
271 [SYSCALL_FILTER_SET_DEFAULT] = {
272 .name = "@default",
273 .help = "System calls that are always permitted",
274 .value =
275 "clock_getres\0"
276 "clock_getres_time64\0"
277 "clock_gettime\0"
278 "clock_gettime64\0"
279 "clock_nanosleep\0"
280 "clock_nanosleep_time64\0"
281 "execve\0"
282 "exit\0"
283 "exit_group\0"
284 "futex\0"
285 "futex_time64\0"
286 "get_robust_list\0"
287 "get_thread_area\0"
288 "getegid\0"
289 "getegid32\0"
290 "geteuid\0"
291 "geteuid32\0"
292 "getgid\0"
293 "getgid32\0"
294 "getgroups\0"
295 "getgroups32\0"
296 "getpgid\0"
297 "getpgrp\0"
298 "getpid\0"
299 "getppid\0"
300 "getresgid\0"
301 "getresgid32\0"
302 "getresuid\0"
303 "getresuid32\0"
304 "getrlimit\0" /* make sure processes can query stack size and such */
305 "getsid\0"
306 "gettid\0"
307 "gettimeofday\0"
308 "getuid\0"
309 "getuid32\0"
310 "membarrier\0"
311 "nanosleep\0"
312 "pause\0"
313 "prlimit64\0"
314 "restart_syscall\0"
315 "rseq\0"
316 "rt_sigreturn\0"
317 "sched_yield\0"
318 "set_robust_list\0"
319 "set_thread_area\0"
320 "set_tid_address\0"
321 "set_tls\0"
322 "sigreturn\0"
323 "time\0"
324 "ugetrlimit\0"
325 },
326 [SYSCALL_FILTER_SET_AIO] = {
327 .name = "@aio",
328 .help = "Asynchronous IO",
329 .value =
330 "io_cancel\0"
331 "io_destroy\0"
332 "io_getevents\0"
333 "io_pgetevents\0"
334 "io_pgetevents_time64\0"
335 "io_setup\0"
336 "io_submit\0"
337 "io_uring_enter\0"
338 "io_uring_register\0"
339 "io_uring_setup\0"
340 },
341 [SYSCALL_FILTER_SET_BASIC_IO] = {
342 .name = "@basic-io",
343 .help = "Basic IO",
344 .value =
345 "_llseek\0"
346 "close\0"
347 "dup\0"
348 "dup2\0"
349 "dup3\0"
350 "lseek\0"
351 "pread64\0"
352 "preadv\0"
353 "preadv2\0"
354 "pwrite64\0"
355 "pwritev\0"
356 "pwritev2\0"
357 "read\0"
358 "readv\0"
359 "write\0"
360 "writev\0"
361 },
362 [SYSCALL_FILTER_SET_CHOWN] = {
363 .name = "@chown",
364 .help = "Change ownership of files and directories",
365 .value =
366 "chown\0"
367 "chown32\0"
368 "fchown\0"
369 "fchown32\0"
370 "fchownat\0"
371 "lchown\0"
372 "lchown32\0"
373 },
374 [SYSCALL_FILTER_SET_CLOCK] = {
375 .name = "@clock",
376 .help = "Change the system time",
377 .value =
378 "adjtimex\0"
379 "clock_adjtime\0"
380 "clock_adjtime64\0"
381 "clock_settime\0"
382 "clock_settime64\0"
383 "settimeofday\0"
384 "stime\0"
385 },
386 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
387 .name = "@cpu-emulation",
388 .help = "System calls for CPU emulation functionality",
389 .value =
390 "modify_ldt\0"
391 "subpage_prot\0"
392 "switch_endian\0"
393 "vm86\0"
394 "vm86old\0"
395 },
396 [SYSCALL_FILTER_SET_DEBUG] = {
397 .name = "@debug",
398 .help = "Debugging, performance monitoring and tracing functionality",
399 .value =
400 "lookup_dcookie\0"
401 "perf_event_open\0"
402 "pidfd_getfd\0"
403 "ptrace\0"
404 "rtas\0"
405 #if defined __s390__ || defined __s390x__
406 "s390_runtime_instr\0"
407 #endif
408 "sys_debug_setcontext\0"
409 },
410 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
411 .name = "@file-system",
412 .help = "File system operations",
413 .value =
414 "access\0"
415 "chdir\0"
416 "chmod\0"
417 "close\0"
418 "creat\0"
419 "faccessat\0"
420 "faccessat2\0"
421 "fallocate\0"
422 "fchdir\0"
423 "fchmod\0"
424 "fchmodat\0"
425 "fcntl\0"
426 "fcntl64\0"
427 "fgetxattr\0"
428 "flistxattr\0"
429 "fremovexattr\0"
430 "fsetxattr\0"
431 "fstat\0"
432 "fstat64\0"
433 "fstatat64\0"
434 "fstatfs\0"
435 "fstatfs64\0"
436 "ftruncate\0"
437 "ftruncate64\0"
438 "futimesat\0"
439 "getcwd\0"
440 "getdents\0"
441 "getdents64\0"
442 "getxattr\0"
443 "inotify_add_watch\0"
444 "inotify_init\0"
445 "inotify_init1\0"
446 "inotify_rm_watch\0"
447 "lgetxattr\0"
448 "link\0"
449 "linkat\0"
450 "listxattr\0"
451 "llistxattr\0"
452 "lremovexattr\0"
453 "lsetxattr\0"
454 "lstat\0"
455 "lstat64\0"
456 "mkdir\0"
457 "mkdirat\0"
458 "mknod\0"
459 "mknodat\0"
460 "mmap\0"
461 "mmap2\0"
462 "munmap\0"
463 "newfstatat\0"
464 "oldfstat\0"
465 "oldlstat\0"
466 "oldstat\0"
467 "open\0"
468 "openat\0"
469 "openat2\0"
470 "readlink\0"
471 "readlinkat\0"
472 "removexattr\0"
473 "rename\0"
474 "renameat\0"
475 "renameat2\0"
476 "rmdir\0"
477 "setxattr\0"
478 "stat\0"
479 "stat64\0"
480 "statfs\0"
481 "statfs64\0"
482 "statx\0"
483 "symlink\0"
484 "symlinkat\0"
485 "truncate\0"
486 "truncate64\0"
487 "unlink\0"
488 "unlinkat\0"
489 "utime\0"
490 "utimensat\0"
491 "utimensat_time64\0"
492 "utimes\0"
493 },
494 [SYSCALL_FILTER_SET_IO_EVENT] = {
495 .name = "@io-event",
496 .help = "Event loop system calls",
497 .value =
498 "_newselect\0"
499 "epoll_create\0"
500 "epoll_create1\0"
501 "epoll_ctl\0"
502 "epoll_ctl_old\0"
503 "epoll_pwait\0"
504 "epoll_wait\0"
505 "epoll_wait_old\0"
506 "eventfd\0"
507 "eventfd2\0"
508 "poll\0"
509 "ppoll\0"
510 "ppoll_time64\0"
511 "pselect6\0"
512 "pselect6_time64\0"
513 "select\0"
514 },
515 [SYSCALL_FILTER_SET_IPC] = {
516 .name = "@ipc",
517 .help = "SysV IPC, POSIX Message Queues or other IPC",
518 .value =
519 "ipc\0"
520 "memfd_create\0"
521 "mq_getsetattr\0"
522 "mq_notify\0"
523 "mq_open\0"
524 "mq_timedreceive\0"
525 "mq_timedreceive_time64\0"
526 "mq_timedsend\0"
527 "mq_timedsend_time64\0"
528 "mq_unlink\0"
529 "msgctl\0"
530 "msgget\0"
531 "msgrcv\0"
532 "msgsnd\0"
533 "pipe\0"
534 "pipe2\0"
535 "process_vm_readv\0"
536 "process_vm_writev\0"
537 "semctl\0"
538 "semget\0"
539 "semop\0"
540 "semtimedop\0"
541 "semtimedop_time64\0"
542 "shmat\0"
543 "shmctl\0"
544 "shmdt\0"
545 "shmget\0"
546 },
547 [SYSCALL_FILTER_SET_KEYRING] = {
548 .name = "@keyring",
549 .help = "Kernel keyring access",
550 .value =
551 "add_key\0"
552 "keyctl\0"
553 "request_key\0"
554 },
555 [SYSCALL_FILTER_SET_MEMLOCK] = {
556 .name = "@memlock",
557 .help = "Memory locking control",
558 .value =
559 "mlock\0"
560 "mlock2\0"
561 "mlockall\0"
562 "munlock\0"
563 "munlockall\0"
564 },
565 [SYSCALL_FILTER_SET_MODULE] = {
566 .name = "@module",
567 .help = "Loading and unloading of kernel modules",
568 .value =
569 "delete_module\0"
570 "finit_module\0"
571 "init_module\0"
572 },
573 [SYSCALL_FILTER_SET_MOUNT] = {
574 .name = "@mount",
575 .help = "Mounting and unmounting of file systems",
576 .value =
577 "chroot\0"
578 "fsconfig\0"
579 "fsmount\0"
580 "fsopen\0"
581 "fspick\0"
582 "mount\0"
583 "move_mount\0"
584 "open_tree\0"
585 "pivot_root\0"
586 "umount\0"
587 "umount2\0"
588 },
589 [SYSCALL_FILTER_SET_NETWORK_IO] = {
590 .name = "@network-io",
591 .help = "Network or Unix socket IO, should not be needed if not network facing",
592 .value =
593 "accept\0"
594 "accept4\0"
595 "bind\0"
596 "connect\0"
597 "getpeername\0"
598 "getsockname\0"
599 "getsockopt\0"
600 "listen\0"
601 "recv\0"
602 "recvfrom\0"
603 "recvmmsg\0"
604 "recvmmsg_time64\0"
605 "recvmsg\0"
606 "send\0"
607 "sendmmsg\0"
608 "sendmsg\0"
609 "sendto\0"
610 "setsockopt\0"
611 "shutdown\0"
612 "socket\0"
613 "socketcall\0"
614 "socketpair\0"
615 },
616 [SYSCALL_FILTER_SET_OBSOLETE] = {
617 /* some unknown even to libseccomp */
618 .name = "@obsolete",
619 .help = "Unusual, obsolete or unimplemented system calls",
620 .value =
621 "_sysctl\0"
622 "afs_syscall\0"
623 "bdflush\0"
624 "break\0"
625 "create_module\0"
626 "ftime\0"
627 "get_kernel_syms\0"
628 "getpmsg\0"
629 "gtty\0"
630 "idle\0"
631 "lock\0"
632 "mpx\0"
633 "prof\0"
634 "profil\0"
635 "putpmsg\0"
636 "query_module\0"
637 "security\0"
638 "sgetmask\0"
639 "ssetmask\0"
640 "stty\0"
641 "sysfs\0"
642 "tuxcall\0"
643 "ulimit\0"
644 "uselib\0"
645 "ustat\0"
646 "vserver\0"
647 },
648 [SYSCALL_FILTER_SET_PKEY] = {
649 .name = "@pkey",
650 .help = "System calls used for memory protection keys",
651 .value =
652 "pkey_alloc\0"
653 "pkey_free\0"
654 "pkey_mprotect\0"
655 },
656 [SYSCALL_FILTER_SET_PRIVILEGED] = {
657 .name = "@privileged",
658 .help = "All system calls which need super-user capabilities",
659 .value =
660 "@chown\0"
661 "@clock\0"
662 "@module\0"
663 "@raw-io\0"
664 "@reboot\0"
665 "@swap\0"
666 "_sysctl\0"
667 "acct\0"
668 "bpf\0"
669 "capset\0"
670 "chroot\0"
671 "fanotify_init\0"
672 "fanotify_mark\0"
673 "nfsservctl\0"
674 "open_by_handle_at\0"
675 "pivot_root\0"
676 "quotactl\0"
677 "setdomainname\0"
678 "setfsuid\0"
679 "setfsuid32\0"
680 "setgroups\0"
681 "setgroups32\0"
682 "sethostname\0"
683 "setresuid\0"
684 "setresuid32\0"
685 "setreuid\0"
686 "setreuid32\0"
687 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
688 "setuid32\0"
689 "vhangup\0"
690 },
691 [SYSCALL_FILTER_SET_PROCESS] = {
692 .name = "@process",
693 .help = "Process control, execution, namespaceing operations",
694 .value =
695 "arch_prctl\0"
696 "capget\0" /* Able to query arbitrary processes */
697 "clone\0"
698 "clone3\0"
699 "execveat\0"
700 "fork\0"
701 "getrusage\0"
702 "kill\0"
703 "pidfd_open\0"
704 "pidfd_send_signal\0"
705 "prctl\0"
706 "rt_sigqueueinfo\0"
707 "rt_tgsigqueueinfo\0"
708 "setns\0"
709 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
710 "tgkill\0"
711 "times\0"
712 "tkill\0"
713 "unshare\0"
714 "vfork\0"
715 "wait4\0"
716 "waitid\0"
717 "waitpid\0"
718 },
719 [SYSCALL_FILTER_SET_RAW_IO] = {
720 .name = "@raw-io",
721 .help = "Raw I/O port access",
722 .value =
723 "ioperm\0"
724 "iopl\0"
725 "pciconfig_iobase\0"
726 "pciconfig_read\0"
727 "pciconfig_write\0"
728 #if defined __s390__ || defined __s390x__
729 "s390_pci_mmio_read\0"
730 "s390_pci_mmio_write\0"
731 #endif
732 },
733 [SYSCALL_FILTER_SET_REBOOT] = {
734 .name = "@reboot",
735 .help = "Reboot and reboot preparation/kexec",
736 .value =
737 "kexec_file_load\0"
738 "kexec_load\0"
739 "reboot\0"
740 },
741 [SYSCALL_FILTER_SET_RESOURCES] = {
742 .name = "@resources",
743 .help = "Alter resource settings",
744 .value =
745 "ioprio_set\0"
746 "mbind\0"
747 "migrate_pages\0"
748 "move_pages\0"
749 "nice\0"
750 "sched_setaffinity\0"
751 "sched_setattr\0"
752 "sched_setparam\0"
753 "sched_setscheduler\0"
754 "set_mempolicy\0"
755 "setpriority\0"
756 "setrlimit\0"
757 },
758 [SYSCALL_FILTER_SET_SETUID] = {
759 .name = "@setuid",
760 .help = "Operations for changing user/group credentials",
761 .value =
762 "setgid\0"
763 "setgid32\0"
764 "setgroups\0"
765 "setgroups32\0"
766 "setregid\0"
767 "setregid32\0"
768 "setresgid\0"
769 "setresgid32\0"
770 "setresuid\0"
771 "setresuid32\0"
772 "setreuid\0"
773 "setreuid32\0"
774 "setuid\0"
775 "setuid32\0"
776 },
777 [SYSCALL_FILTER_SET_SIGNAL] = {
778 .name = "@signal",
779 .help = "Process signal handling",
780 .value =
781 "rt_sigaction\0"
782 "rt_sigpending\0"
783 "rt_sigprocmask\0"
784 "rt_sigsuspend\0"
785 "rt_sigtimedwait\0"
786 "rt_sigtimedwait_time64\0"
787 "sigaction\0"
788 "sigaltstack\0"
789 "signal\0"
790 "signalfd\0"
791 "signalfd4\0"
792 "sigpending\0"
793 "sigprocmask\0"
794 "sigsuspend\0"
795 },
796 [SYSCALL_FILTER_SET_SWAP] = {
797 .name = "@swap",
798 .help = "Enable/disable swap devices",
799 .value =
800 "swapoff\0"
801 "swapon\0"
802 },
803 [SYSCALL_FILTER_SET_SYNC] = {
804 .name = "@sync",
805 .help = "Synchronize files and memory to storage",
806 .value =
807 "fdatasync\0"
808 "fsync\0"
809 "msync\0"
810 "sync\0"
811 "sync_file_range\0"
812 "sync_file_range2\0"
813 "syncfs\0"
814 },
815 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
816 .name = "@system-service",
817 .help = "General system service operations",
818 .value =
819 "@aio\0"
820 "@basic-io\0"
821 "@chown\0"
822 "@default\0"
823 "@file-system\0"
824 "@io-event\0"
825 "@ipc\0"
826 "@keyring\0"
827 "@memlock\0"
828 "@network-io\0"
829 "@process\0"
830 "@resources\0"
831 "@setuid\0"
832 "@signal\0"
833 "@sync\0"
834 "@timer\0"
835 "brk\0"
836 "capget\0"
837 "capset\0"
838 "copy_file_range\0"
839 "fadvise64\0"
840 "fadvise64_64\0"
841 "flock\0"
842 "get_mempolicy\0"
843 "getcpu\0"
844 "getpriority\0"
845 "getrandom\0"
846 "ioctl\0"
847 "ioprio_get\0"
848 "kcmp\0"
849 "madvise\0"
850 "mprotect\0"
851 "mremap\0"
852 "name_to_handle_at\0"
853 "oldolduname\0"
854 "olduname\0"
855 "personality\0"
856 "readahead\0"
857 "readdir\0"
858 "remap_file_pages\0"
859 "sched_get_priority_max\0"
860 "sched_get_priority_min\0"
861 "sched_getaffinity\0"
862 "sched_getattr\0"
863 "sched_getparam\0"
864 "sched_getscheduler\0"
865 "sched_rr_get_interval\0"
866 "sched_rr_get_interval_time64\0"
867 "sched_yield\0"
868 "sendfile\0"
869 "sendfile64\0"
870 "setfsgid\0"
871 "setfsgid32\0"
872 "setfsuid\0"
873 "setfsuid32\0"
874 "setpgid\0"
875 "setsid\0"
876 "splice\0"
877 "sysinfo\0"
878 "tee\0"
879 "umask\0"
880 "uname\0"
881 "userfaultfd\0"
882 "vmsplice\0"
883 },
884 [SYSCALL_FILTER_SET_TIMER] = {
885 .name = "@timer",
886 .help = "Schedule operations by time",
887 .value =
888 "alarm\0"
889 "getitimer\0"
890 "setitimer\0"
891 "timer_create\0"
892 "timer_delete\0"
893 "timer_getoverrun\0"
894 "timer_gettime\0"
895 "timer_gettime64\0"
896 "timer_settime\0"
897 "timer_settime64\0"
898 "timerfd_create\0"
899 "timerfd_gettime\0"
900 "timerfd_gettime64\0"
901 "timerfd_settime\0"
902 "timerfd_settime64\0"
903 "times\0"
904 },
905 [SYSCALL_FILTER_SET_KNOWN] = {
906 .name = "@known",
907 .help = "All known syscalls declared in the kernel",
908 .value =
909 #include "syscall-list.h"
910 },
911 };
912
913 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
914 if (isempty(name) || name[0] != '@')
915 return NULL;
916
917 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
918 if (streq(syscall_filter_sets[i].name, name))
919 return syscall_filter_sets + i;
920
921 return NULL;
922 }
923
/* Forward declaration: add_syscall_filter_set() and seccomp_add_syscall_filter_item() call each other
 * (a group's item list may itself name further "@" groups), so this prototype has to be visible before
 * seccomp_add_syscall_filter_item() is defined. */
static int add_syscall_filter_set(
                scmp_filter_ctx seccomp,
                const SyscallFilterSet *set,
                uint32_t action,
                char **exclude,
                bool log_missing,
                char ***added);
931
932 int seccomp_add_syscall_filter_item(
933 scmp_filter_ctx *seccomp,
934 const char *name,
935 uint32_t action,
936 char **exclude,
937 bool log_missing,
938 char ***added) {
939
940 assert(seccomp);
941 assert(name);
942
943 if (strv_contains(exclude, name))
944 return 0;
945
946 /* Any syscalls that are handled are added to the *added strv. The pointer
947 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
948
949 if (name[0] == '@') {
950 const SyscallFilterSet *other;
951
952 other = syscall_filter_set_find(name);
953 if (!other)
954 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
955 "Filter set %s is not known!",
956 name);
957
958 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
959
960 } else {
961 int id, r;
962
963 id = seccomp_syscall_resolve_name(name);
964 if (id == __NR_SCMP_ERROR) {
965 if (log_missing)
966 log_debug("System call %s is not known, ignoring.", name);
967 return 0;
968 }
969
970 r = seccomp_rule_add_exact(seccomp, action, id, 0);
971 if (r < 0) {
972 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
973 bool ignore = r == -EDOM;
974
975 if (!ignore || log_missing)
976 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
977 name, id, ignore ? ", ignoring" : "");
978 if (!ignore)
979 return r;
980 }
981
982 if (added) {
983 r = strv_extend(added, name);
984 if (r < 0)
985 return r;
986 }
987
988 return 0;
989 }
990 }
991
992 static int add_syscall_filter_set(
993 scmp_filter_ctx seccomp,
994 const SyscallFilterSet *set,
995 uint32_t action,
996 char **exclude,
997 bool log_missing,
998 char ***added) {
999
1000 const char *sys;
1001 int r;
1002
1003 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1004
1005 assert(seccomp);
1006 assert(set);
1007
1008 NULSTR_FOREACH(sys, set->value) {
1009 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1010 if (r < 0)
1011 return r;
1012 }
1013
1014 return 0;
1015 }
1016
1017 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1018 uint32_t arch;
1019 int r;
1020
1021 assert(set);
1022
1023 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1024 * each local arch. */
1025
1026 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1027 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1028
1029 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1030
1031 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1032 if (r < 0)
1033 return r;
1034
1035 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1036 if (r < 0)
1037 return log_debug_errno(r, "Failed to add filter set: %m");
1038
1039 r = seccomp_load(seccomp);
1040 if (ERRNO_IS_SECCOMP_FATAL(r))
1041 return r;
1042 if (r < 0)
1043 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1044 }
1045
1046 return 0;
1047 }
1048
1049 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1050 uint32_t arch;
1051 int r;
1052
1053 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1054 * SyscallFilterSet* table. */
1055
1056 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1057 return 0;
1058
1059 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1060 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1061 void *syscall_id, *val;
1062
1063 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1064
1065 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1066 if (r < 0)
1067 return r;
1068
1069 HASHMAP_FOREACH_KEY(val, syscall_id, set) {
1070 uint32_t a = action;
1071 int id = PTR_TO_INT(syscall_id) - 1;
1072 int error = PTR_TO_INT(val);
1073
1074 if (error == SECCOMP_ERROR_NUMBER_KILL)
1075 a = scmp_act_kill_process();
1076 else if (action != SCMP_ACT_ALLOW && error >= 0)
1077 a = SCMP_ACT_ERRNO(error);
1078
1079 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1080 if (r < 0) {
1081 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1082 _cleanup_free_ char *n = NULL;
1083 bool ignore;
1084
1085 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1086 ignore = r == -EDOM;
1087 if (!ignore || log_missing)
1088 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1089 strna(n), id, ignore ? ", ignoring" : "");
1090 if (!ignore)
1091 return r;
1092 }
1093 }
1094
1095 r = seccomp_load(seccomp);
1096 if (ERRNO_IS_SECCOMP_FATAL(r))
1097 return r;
1098 if (r < 0)
1099 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1100 }
1101
1102 return 0;
1103 }
1104
1105 int seccomp_parse_syscall_filter(
1106 const char *name,
1107 int errno_num,
1108 Hashmap *filter,
1109 SeccompParseFlags flags,
1110 const char *unit,
1111 const char *filename,
1112 unsigned line) {
1113
1114 int r;
1115
1116 assert(name);
1117 assert(filter);
1118
1119 if (name[0] == '@') {
1120 const SyscallFilterSet *set;
1121 const char *i;
1122
1123 set = syscall_filter_set_find(name);
1124 if (!set) {
1125 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1126 return -EINVAL;
1127
1128 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1129 "Unknown system call group, ignoring: %s", name);
1130 return 0;
1131 }
1132
1133 NULSTR_FOREACH(i, set->value) {
1134 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1135 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1136 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1137 * about them. */
1138 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1139 if (r < 0)
1140 return r;
1141 }
1142 } else {
1143 int id;
1144
1145 id = seccomp_syscall_resolve_name(name);
1146 if (id == __NR_SCMP_ERROR) {
1147 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1148 return -EINVAL;
1149
1150 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1151 "Failed to parse system call, ignoring: %s", name);
1152 return 0;
1153 }
1154
1155 /* If we previously wanted to forbid a syscall and now
1156 * we want to allow it, then remove it from the list. */
1157 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
1158 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1159 if (r < 0)
1160 switch (r) {
1161 case -ENOMEM:
1162 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1163 case -EEXIST:
1164 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1165 break;
1166 default:
1167 return r;
1168 }
1169 } else
1170 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1171 }
1172
1173 return 0;
1174 }
1175
1176 int seccomp_restrict_namespaces(unsigned long retain) {
1177 uint32_t arch;
1178 int r;
1179
1180 if (DEBUG_LOGGING) {
1181 _cleanup_free_ char *s = NULL;
1182
1183 (void) namespace_flags_to_string(retain, &s);
1184 log_debug("Restricting namespace to: %s.", strna(s));
1185 }
1186
1187 /* NOOP? */
1188 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1189 return 0;
1190
1191 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1192 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1193
1194 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1195
1196 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1197 if (r < 0)
1198 return r;
1199
1200 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1201 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1202 * altogether. */
1203 r = seccomp_rule_add_exact(
1204 seccomp,
1205 SCMP_ACT_ERRNO(EPERM),
1206 SCMP_SYS(setns),
1207 0);
1208 else
1209 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1210 * special invocation with a zero flags argument, right here. */
1211 r = seccomp_rule_add_exact(
1212 seccomp,
1213 SCMP_ACT_ERRNO(EPERM),
1214 SCMP_SYS(setns),
1215 1,
1216 SCMP_A1(SCMP_CMP_EQ, 0));
1217 if (r < 0) {
1218 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1219 continue;
1220 }
1221
1222 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1223 unsigned long f;
1224
1225 f = namespace_flag_map[i].flag;
1226 if ((retain & f) == f) {
1227 log_debug("Permitting %s.", namespace_flag_map[i].name);
1228 continue;
1229 }
1230
1231 log_debug("Blocking %s.", namespace_flag_map[i].name);
1232
1233 r = seccomp_rule_add_exact(
1234 seccomp,
1235 SCMP_ACT_ERRNO(EPERM),
1236 SCMP_SYS(unshare),
1237 1,
1238 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1239 if (r < 0) {
1240 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1241 break;
1242 }
1243
1244 /* On s390/s390x the first two parameters to clone are switched */
1245 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1246 r = seccomp_rule_add_exact(
1247 seccomp,
1248 SCMP_ACT_ERRNO(EPERM),
1249 SCMP_SYS(clone),
1250 1,
1251 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1252 else
1253 r = seccomp_rule_add_exact(
1254 seccomp,
1255 SCMP_ACT_ERRNO(EPERM),
1256 SCMP_SYS(clone),
1257 1,
1258 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1259 if (r < 0) {
1260 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1261 break;
1262 }
1263
1264 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1265 r = seccomp_rule_add_exact(
1266 seccomp,
1267 SCMP_ACT_ERRNO(EPERM),
1268 SCMP_SYS(setns),
1269 1,
1270 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1271 if (r < 0) {
1272 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1273 break;
1274 }
1275 }
1276 }
1277 if (r < 0)
1278 continue;
1279
1280 r = seccomp_load(seccomp);
1281 if (ERRNO_IS_SECCOMP_FATAL(r))
1282 return r;
1283 if (r < 0)
1284 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1285 }
1286
1287 return 0;
1288 }
1289
1290 int seccomp_protect_sysctl(void) {
1291 uint32_t arch;
1292 int r;
1293
1294 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1295 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1296
1297 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1298
1299 if (IN_SET(arch,
1300 SCMP_ARCH_AARCH64,
1301 #ifdef SCMP_ARCH_RISCV64
1302 SCMP_ARCH_RISCV64,
1303 #endif
1304 SCMP_ARCH_X32
1305 ))
1306 /* No _sysctl syscall */
1307 continue;
1308
1309 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1310 if (r < 0)
1311 return r;
1312
1313 r = seccomp_rule_add_exact(
1314 seccomp,
1315 SCMP_ACT_ERRNO(EPERM),
1316 SCMP_SYS(_sysctl),
1317 0);
1318 if (r < 0) {
1319 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1320 continue;
1321 }
1322
1323 r = seccomp_load(seccomp);
1324 if (ERRNO_IS_SECCOMP_FATAL(r))
1325 return r;
1326 if (r < 0)
1327 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1328 }
1329
1330 return 0;
1331 }
1332
1333 int seccomp_protect_syslog(void) {
1334 uint32_t arch;
1335 int r;
1336
1337 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1338 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1339
1340 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1341 if (r < 0)
1342 return r;
1343
1344 r = seccomp_rule_add_exact(
1345 seccomp,
1346 SCMP_ACT_ERRNO(EPERM),
1347 SCMP_SYS(syslog),
1348 0);
1349
1350 if (r < 0) {
1351 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1352 continue;
1353 }
1354
1355 r = seccomp_load(seccomp);
1356 if (ERRNO_IS_SECCOMP_FATAL(r))
1357 return r;
1358 if (r < 0)
1359 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1360 }
1361
1362 return 0;
1363 }
1364
1365 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1366 uint32_t arch;
1367 int r;
1368
1369 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1370 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1371 bool supported;
1372
1373 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1374
1375 switch (arch) {
1376
1377 case SCMP_ARCH_X86_64:
1378 case SCMP_ARCH_X32:
1379 case SCMP_ARCH_ARM:
1380 case SCMP_ARCH_AARCH64:
1381 case SCMP_ARCH_PPC:
1382 case SCMP_ARCH_PPC64:
1383 case SCMP_ARCH_PPC64LE:
1384 case SCMP_ARCH_MIPSEL64N32:
1385 case SCMP_ARCH_MIPS64N32:
1386 case SCMP_ARCH_MIPSEL64:
1387 case SCMP_ARCH_MIPS64:
1388 #ifdef SCMP_ARCH_RISCV64
1389 case SCMP_ARCH_RISCV64:
1390 #endif
1391 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1392 supported = true;
1393 break;
1394
1395 case SCMP_ARCH_S390:
1396 case SCMP_ARCH_S390X:
1397 case SCMP_ARCH_X86:
1398 case SCMP_ARCH_MIPSEL:
1399 case SCMP_ARCH_MIPS:
1400 default:
1401 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1402 * don't know */
1403 supported = false;
1404 break;
1405 }
1406
1407 if (!supported)
1408 continue;
1409
1410 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1411 if (r < 0)
1412 return r;
1413
1414 if (allow_list) {
1415 int first = 0, last = 0;
1416 void *afp;
1417
1418 /* If this is an allow list, we first block the address families that are out of
1419 * range and then everything that is not in the set. First, we find the lowest and
1420 * highest address family in the set. */
1421
1422 SET_FOREACH(afp, address_families) {
1423 int af = PTR_TO_INT(afp);
1424
1425 if (af <= 0 || af >= af_max())
1426 continue;
1427
1428 if (first == 0 || af < first)
1429 first = af;
1430
1431 if (last == 0 || af > last)
1432 last = af;
1433 }
1434
1435 assert((first == 0) == (last == 0));
1436
1437 if (first == 0) {
1438
1439 /* No entries in the valid range, block everything */
1440 r = seccomp_rule_add_exact(
1441 seccomp,
1442 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1443 SCMP_SYS(socket),
1444 0);
1445 if (r < 0) {
1446 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1447 continue;
1448 }
1449
1450 } else {
1451
1452 /* Block everything below the first entry */
1453 r = seccomp_rule_add_exact(
1454 seccomp,
1455 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1456 SCMP_SYS(socket),
1457 1,
1458 SCMP_A0(SCMP_CMP_LT, first));
1459 if (r < 0) {
1460 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1461 continue;
1462 }
1463
1464 /* Block everything above the last entry */
1465 r = seccomp_rule_add_exact(
1466 seccomp,
1467 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1468 SCMP_SYS(socket),
1469 1,
1470 SCMP_A0(SCMP_CMP_GT, last));
1471 if (r < 0) {
1472 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1473 continue;
1474 }
1475
1476 /* Block everything between the first and last entry */
1477 for (int af = 1; af < af_max(); af++) {
1478
1479 if (set_contains(address_families, INT_TO_PTR(af)))
1480 continue;
1481
1482 r = seccomp_rule_add_exact(
1483 seccomp,
1484 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1485 SCMP_SYS(socket),
1486 1,
1487 SCMP_A0(SCMP_CMP_EQ, af));
1488 if (r < 0)
1489 break;
1490 }
1491 if (r < 0) {
1492 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1493 continue;
1494 }
1495 }
1496
1497 } else {
1498 void *af;
1499
1500 /* If this is a deny list, then generate one rule for each address family that are
1501 * then combined in OR checks. */
1502
1503 SET_FOREACH(af, address_families) {
1504 r = seccomp_rule_add_exact(
1505 seccomp,
1506 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1507 SCMP_SYS(socket),
1508 1,
1509 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1510 if (r < 0)
1511 break;
1512 }
1513 if (r < 0) {
1514 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1515 continue;
1516 }
1517 }
1518
1519 r = seccomp_load(seccomp);
1520 if (ERRNO_IS_SECCOMP_FATAL(r))
1521 return r;
1522 if (r < 0)
1523 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1524 }
1525
1526 return 0;
1527 }
1528
1529 int seccomp_restrict_realtime(void) {
1530 static const int permitted_policies[] = {
1531 SCHED_OTHER,
1532 SCHED_BATCH,
1533 SCHED_IDLE,
1534 };
1535
1536 int r, max_policy = 0;
1537 uint32_t arch;
1538 unsigned i;
1539
1540 /* Determine the highest policy constant we want to allow */
1541 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1542 if (permitted_policies[i] > max_policy)
1543 max_policy = permitted_policies[i];
1544
1545 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1546 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1547 int p;
1548
1549 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1550
1551 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1552 if (r < 0)
1553 return r;
1554
1555 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1556 * allow list. */
1557 for (p = 0; p < max_policy; p++) {
1558 bool good = false;
1559
1560 /* Check if this is in the allow list. */
1561 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1562 if (permitted_policies[i] == p) {
1563 good = true;
1564 break;
1565 }
1566
1567 if (good)
1568 continue;
1569
1570 /* Deny this policy */
1571 r = seccomp_rule_add_exact(
1572 seccomp,
1573 SCMP_ACT_ERRNO(EPERM),
1574 SCMP_SYS(sched_setscheduler),
1575 1,
1576 SCMP_A1(SCMP_CMP_EQ, p));
1577 if (r < 0) {
1578 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1579 continue;
1580 }
1581 }
1582
1583 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1584 * are unsigned here, hence no need no check for < 0 values. */
1585 r = seccomp_rule_add_exact(
1586 seccomp,
1587 SCMP_ACT_ERRNO(EPERM),
1588 SCMP_SYS(sched_setscheduler),
1589 1,
1590 SCMP_A1(SCMP_CMP_GT, max_policy));
1591 if (r < 0) {
1592 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1593 continue;
1594 }
1595
1596 r = seccomp_load(seccomp);
1597 if (ERRNO_IS_SECCOMP_FATAL(r))
1598 return r;
1599 if (r < 0)
1600 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1601 }
1602
1603 return 0;
1604 }
1605
1606 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1607 uint32_t arch,
1608 int nr,
1609 unsigned arg_cnt,
1610 const struct scmp_arg_cmp arg) {
1611 int r;
1612
1613 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1614 if (r < 0) {
1615 _cleanup_free_ char *n = NULL;
1616
1617 n = seccomp_syscall_resolve_num_arch(arch, nr);
1618 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1619 strna(n),
1620 seccomp_arch_to_string(arch));
1621 }
1622
1623 return r;
1624 }
1625
1626 /* For known architectures, check that syscalls are indeed defined or not. */
1627 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1628 assert_cc(SCMP_SYS(shmget) > 0);
1629 assert_cc(SCMP_SYS(shmat) > 0);
1630 assert_cc(SCMP_SYS(shmdt) > 0);
1631 #endif
1632
1633 int seccomp_memory_deny_write_execute(void) {
1634 uint32_t arch;
1635 unsigned loaded = 0;
1636
1637 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1638 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1639 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1640
1641 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1642
1643 switch (arch) {
1644
1645 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1646 * We ignore that here, which means there's still a way to get writable/executable
1647 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1648
1649 case SCMP_ARCH_X86:
1650 case SCMP_ARCH_S390:
1651 filter_syscall = SCMP_SYS(mmap2);
1652 block_syscall = SCMP_SYS(mmap);
1653 /* shmat multiplexed, see above */
1654 break;
1655
1656 case SCMP_ARCH_PPC:
1657 case SCMP_ARCH_PPC64:
1658 case SCMP_ARCH_PPC64LE:
1659 case SCMP_ARCH_S390X:
1660 filter_syscall = SCMP_SYS(mmap);
1661 /* shmat multiplexed, see above */
1662 break;
1663
1664 case SCMP_ARCH_ARM:
1665 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1666 shmat_syscall = SCMP_SYS(shmat);
1667 break;
1668
1669 case SCMP_ARCH_X86_64:
1670 case SCMP_ARCH_X32:
1671 case SCMP_ARCH_AARCH64:
1672 #ifdef SCMP_ARCH_RISCV64
1673 case SCMP_ARCH_RISCV64:
1674 #endif
1675 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1676 shmat_syscall = SCMP_SYS(shmat);
1677 break;
1678
1679 /* Please add more definitions here, if you port systemd to other architectures! */
1680
1681 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1682 #warning "Consider adding the right mmap() syscall definitions here!"
1683 #endif
1684 }
1685
1686 /* Can't filter mmap() on this arch, then skip it */
1687 if (filter_syscall == 0)
1688 continue;
1689
1690 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1691 if (r < 0)
1692 return r;
1693
1694 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1695 1,
1696 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1697 if (r < 0)
1698 continue;
1699
1700 if (block_syscall != 0) {
1701 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1702 if (r < 0)
1703 continue;
1704 }
1705
1706 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1707 1,
1708 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1709 if (r < 0)
1710 continue;
1711
1712 #ifdef __NR_pkey_mprotect
1713 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1714 1,
1715 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1716 if (r < 0)
1717 continue;
1718 #endif
1719
1720 if (shmat_syscall > 0) {
1721 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1722 1,
1723 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1724 if (r < 0)
1725 continue;
1726 }
1727
1728 r = seccomp_load(seccomp);
1729 if (ERRNO_IS_SECCOMP_FATAL(r))
1730 return r;
1731 if (r < 0)
1732 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1733 seccomp_arch_to_string(arch));
1734 loaded++;
1735 }
1736
1737 if (loaded == 0)
1738 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1739
1740 return loaded;
1741 }
1742
1743 int seccomp_restrict_archs(Set *archs) {
1744 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1745 void *id;
1746 int r;
1747
1748 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1749 * list.
1750 *
1751 * There are some qualifications. However the most important use is to stop processes from bypassing
1752 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1753 * in a non-native architecture. There are no holes in this use case, at least so far. */
1754
1755 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1756 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1757 * to run a program with the restrictions applied. */
1758 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1759 if (!seccomp)
1760 return -ENOMEM;
1761
1762 SET_FOREACH(id, archs) {
1763 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1764 if (r < 0 && r != -EEXIST)
1765 return r;
1766 }
1767
1768 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1769 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1770 * The important thing is that you can block the old 32-bit x86 syscalls.
1771 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1772
1773 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1774 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1775
1776 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1777 if (r < 0 && r != -EEXIST)
1778 return r;
1779 }
1780
1781 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1782 if (r < 0)
1783 return r;
1784
1785 r = seccomp_load(seccomp);
1786 if (ERRNO_IS_SECCOMP_FATAL(r))
1787 return r;
1788 if (r < 0)
1789 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1790
1791 return 0;
1792 }
1793
1794 int parse_syscall_archs(char **l, Set **ret_archs) {
1795 _cleanup_set_free_ Set *archs = NULL;
1796 char **s;
1797 int r;
1798
1799 assert(l);
1800 assert(ret_archs);
1801
1802 STRV_FOREACH(s, l) {
1803 uint32_t a;
1804
1805 r = seccomp_arch_from_string(*s, &a);
1806 if (r < 0)
1807 return -EINVAL;
1808
1809 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1810 if (r < 0)
1811 return -ENOMEM;
1812 }
1813
1814 *ret_archs = TAKE_PTR(archs);
1815 return 0;
1816 }
1817
1818 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1819 const char *i;
1820 int r;
1821
1822 assert(set);
1823
1824 NULSTR_FOREACH(i, set->value) {
1825
1826 if (i[0] == '@') {
1827 const SyscallFilterSet *more;
1828
1829 more = syscall_filter_set_find(i);
1830 if (!more)
1831 return -ENXIO;
1832
1833 r = seccomp_filter_set_add(filter, add, more);
1834 if (r < 0)
1835 return r;
1836 } else {
1837 int id;
1838
1839 id = seccomp_syscall_resolve_name(i);
1840 if (id == __NR_SCMP_ERROR) {
1841 log_debug("Couldn't resolve system call, ignoring: %s", i);
1842 continue;
1843 }
1844
1845 if (add) {
1846 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1847 if (r < 0)
1848 return r;
1849 } else
1850 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1851 }
1852 }
1853
1854 return 0;
1855 }
1856
1857 int seccomp_lock_personality(unsigned long personality) {
1858 uint32_t arch;
1859 int r;
1860
1861 if (personality >= PERSONALITY_INVALID)
1862 return -EINVAL;
1863
1864 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1865 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1866
1867 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1868 if (r < 0)
1869 return r;
1870
1871 r = seccomp_rule_add_exact(
1872 seccomp,
1873 SCMP_ACT_ERRNO(EPERM),
1874 SCMP_SYS(personality),
1875 1,
1876 SCMP_A0(SCMP_CMP_NE, personality));
1877 if (r < 0) {
1878 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1879 continue;
1880 }
1881
1882 r = seccomp_load(seccomp);
1883 if (ERRNO_IS_SECCOMP_FATAL(r))
1884 return r;
1885 if (r < 0)
1886 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1887 }
1888
1889 return 0;
1890 }
1891
1892 int seccomp_protect_hostname(void) {
1893 uint32_t arch;
1894 int r;
1895
1896 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1897 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1898
1899 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1900 if (r < 0)
1901 return r;
1902
1903 r = seccomp_rule_add_exact(
1904 seccomp,
1905 SCMP_ACT_ERRNO(EPERM),
1906 SCMP_SYS(sethostname),
1907 0);
1908 if (r < 0) {
1909 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1910 continue;
1911 }
1912
1913 r = seccomp_rule_add_exact(
1914 seccomp,
1915 SCMP_ACT_ERRNO(EPERM),
1916 SCMP_SYS(setdomainname),
1917 0);
1918 if (r < 0) {
1919 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1920 continue;
1921 }
1922
1923 r = seccomp_load(seccomp);
1924 if (ERRNO_IS_SECCOMP_FATAL(r))
1925 return r;
1926 if (r < 0)
1927 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1928 }
1929
1930 return 0;
1931 }
1932
1933 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1934 /* Checks the mode_t parameter of the following system calls:
1935 *
1936 * → chmod() + fchmod() + fchmodat()
1937 * → open() + creat() + openat()
1938 * → mkdir() + mkdirat()
1939 * → mknod() + mknodat()
1940 *
1941 * Returns error if *everything* failed, and 0 otherwise.
1942 */
1943 int r;
1944 bool any = false;
1945
1946 r = seccomp_rule_add_exact(
1947 seccomp,
1948 SCMP_ACT_ERRNO(EPERM),
1949 SCMP_SYS(chmod),
1950 1,
1951 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1952 if (r < 0)
1953 log_debug_errno(r, "Failed to add filter for chmod: %m");
1954 else
1955 any = true;
1956
1957 r = seccomp_rule_add_exact(
1958 seccomp,
1959 SCMP_ACT_ERRNO(EPERM),
1960 SCMP_SYS(fchmod),
1961 1,
1962 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1963 if (r < 0)
1964 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1965 else
1966 any = true;
1967
1968 r = seccomp_rule_add_exact(
1969 seccomp,
1970 SCMP_ACT_ERRNO(EPERM),
1971 SCMP_SYS(fchmodat),
1972 1,
1973 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1974 if (r < 0)
1975 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1976 else
1977 any = true;
1978
1979 r = seccomp_rule_add_exact(
1980 seccomp,
1981 SCMP_ACT_ERRNO(EPERM),
1982 SCMP_SYS(mkdir),
1983 1,
1984 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1985 if (r < 0)
1986 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1987 else
1988 any = true;
1989
1990 r = seccomp_rule_add_exact(
1991 seccomp,
1992 SCMP_ACT_ERRNO(EPERM),
1993 SCMP_SYS(mkdirat),
1994 1,
1995 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1996 if (r < 0)
1997 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
1998 else
1999 any = true;
2000
2001 r = seccomp_rule_add_exact(
2002 seccomp,
2003 SCMP_ACT_ERRNO(EPERM),
2004 SCMP_SYS(mknod),
2005 1,
2006 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2007 if (r < 0)
2008 log_debug_errno(r, "Failed to add filter for mknod: %m");
2009 else
2010 any = true;
2011
2012 r = seccomp_rule_add_exact(
2013 seccomp,
2014 SCMP_ACT_ERRNO(EPERM),
2015 SCMP_SYS(mknodat),
2016 1,
2017 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2018 if (r < 0)
2019 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2020 else
2021 any = true;
2022
2023 #if SCMP_SYS(open) > 0
2024 r = seccomp_rule_add_exact(
2025 seccomp,
2026 SCMP_ACT_ERRNO(EPERM),
2027 SCMP_SYS(open),
2028 2,
2029 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2030 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2031 if (r < 0)
2032 log_debug_errno(r, "Failed to add filter for open: %m");
2033 else
2034 any = true;
2035 #endif
2036
2037 r = seccomp_rule_add_exact(
2038 seccomp,
2039 SCMP_ACT_ERRNO(EPERM),
2040 SCMP_SYS(openat),
2041 2,
2042 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2043 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2044 if (r < 0)
2045 log_debug_errno(r, "Failed to add filter for openat: %m");
2046 else
2047 any = true;
2048
2049 #if defined(__SNR_openat2)
2050 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2051 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2052 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2053 * compatible with kernels that are not absolutely recent. */
2054 r = seccomp_rule_add_exact(
2055 seccomp,
2056 SCMP_ACT_ERRNO(EPERM),
2057 SCMP_SYS(openat2),
2058 0);
2059 if (r < 0)
2060 log_debug_errno(r, "Failed to add filter for openat2: %m");
2061 else
2062 any = true;
2063 #endif
2064
2065 r = seccomp_rule_add_exact(
2066 seccomp,
2067 SCMP_ACT_ERRNO(EPERM),
2068 SCMP_SYS(creat),
2069 1,
2070 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2071 if (r < 0)
2072 log_debug_errno(r, "Failed to add filter for creat: %m");
2073 else
2074 any = true;
2075
2076 return any ? 0 : r;
2077 }
2078
2079 int seccomp_restrict_suid_sgid(void) {
2080 uint32_t arch;
2081 int r, k;
2082
2083 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2084 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2085
2086 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2087 if (r < 0)
2088 return r;
2089
2090 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2091 if (r < 0)
2092 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2093
2094 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2095 if (k < 0)
2096 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2097
2098 if (r < 0 && k < 0)
2099 continue;
2100
2101 r = seccomp_load(seccomp);
2102 if (ERRNO_IS_SECCOMP_FATAL(r))
2103 return r;
2104 if (r < 0)
2105 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2106 }
2107
2108 return 0;
2109 }
2110
2111 uint32_t scmp_act_kill_process(void) {
2112
2113 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2114 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2115 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2116 * for single-threaded apps does the right thing. */
2117
2118 #ifdef SCMP_ACT_KILL_PROCESS
2119 if (seccomp_api_get() >= 3)
2120 return SCMP_ACT_KILL_PROCESS;
2121 #endif
2122
2123 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2124 }