]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
ethtool: add several new link modes
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "env-util.h"
16 #include "errno-list.h"
17 #include "macro.h"
18 #include "nsflags.h"
19 #include "nulstr-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
22 #include "set.h"
23 #include "string-util.h"
24 #include "strv.h"
25
26 const uint32_t seccomp_local_archs[] = {
27
28 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
29
30 #if defined(__x86_64__) && defined(__ILP32__)
31 SCMP_ARCH_X86,
32 SCMP_ARCH_X86_64,
33 SCMP_ARCH_X32, /* native */
34 #elif defined(__x86_64__) && !defined(__ILP32__)
35 SCMP_ARCH_X86,
36 SCMP_ARCH_X32,
37 SCMP_ARCH_X86_64, /* native */
38 #elif defined(__i386__)
39 SCMP_ARCH_X86,
40 #elif defined(__aarch64__)
41 SCMP_ARCH_ARM,
42 SCMP_ARCH_AARCH64, /* native */
43 #elif defined(__arm__)
44 SCMP_ARCH_ARM,
45 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPSEL,
47 SCMP_ARCH_MIPS, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
49 SCMP_ARCH_MIPS,
50 SCMP_ARCH_MIPSEL, /* native */
51 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL64N32,
55 SCMP_ARCH_MIPS64N32,
56 SCMP_ARCH_MIPSEL64,
57 SCMP_ARCH_MIPS64, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPS,
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS64N32,
62 SCMP_ARCH_MIPSEL64N32,
63 SCMP_ARCH_MIPS64,
64 SCMP_ARCH_MIPSEL64, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64,
69 SCMP_ARCH_MIPS64,
70 SCMP_ARCH_MIPSEL64N32,
71 SCMP_ARCH_MIPS64N32, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64,
76 SCMP_ARCH_MIPSEL64,
77 SCMP_ARCH_MIPS64N32,
78 SCMP_ARCH_MIPSEL64N32, /* native */
79 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
80 SCMP_ARCH_PPC,
81 SCMP_ARCH_PPC64LE,
82 SCMP_ARCH_PPC64, /* native */
83 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64,
86 SCMP_ARCH_PPC64LE, /* native */
87 #elif defined(__powerpc__)
88 SCMP_ARCH_PPC,
89 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
90 SCMP_ARCH_RISCV64,
91 #elif defined(__s390x__)
92 SCMP_ARCH_S390,
93 SCMP_ARCH_S390X, /* native */
94 #elif defined(__s390__)
95 SCMP_ARCH_S390,
96 #endif
97 (uint32_t) -1
98 };
99
100 const char* seccomp_arch_to_string(uint32_t c) {
101 /* Maintain order used in <seccomp.h>.
102 *
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
105
106 switch(c) {
107 case SCMP_ARCH_NATIVE:
108 return "native";
109 case SCMP_ARCH_X86:
110 return "x86";
111 case SCMP_ARCH_X86_64:
112 return "x86-64";
113 case SCMP_ARCH_X32:
114 return "x32";
115 case SCMP_ARCH_ARM:
116 return "arm";
117 case SCMP_ARCH_AARCH64:
118 return "arm64";
119 case SCMP_ARCH_MIPS:
120 return "mips";
121 case SCMP_ARCH_MIPS64:
122 return "mips64";
123 case SCMP_ARCH_MIPS64N32:
124 return "mips64-n32";
125 case SCMP_ARCH_MIPSEL:
126 return "mips-le";
127 case SCMP_ARCH_MIPSEL64:
128 return "mips64-le";
129 case SCMP_ARCH_MIPSEL64N32:
130 return "mips64-le-n32";
131 case SCMP_ARCH_PPC:
132 return "ppc";
133 case SCMP_ARCH_PPC64:
134 return "ppc64";
135 case SCMP_ARCH_PPC64LE:
136 return "ppc64-le";
137 #ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64:
139 return "riscv64";
140 #endif
141 case SCMP_ARCH_S390:
142 return "s390";
143 case SCMP_ARCH_S390X:
144 return "s390x";
145 default:
146 return NULL;
147 }
148 }
149
150 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
151 if (!n)
152 return -EINVAL;
153
154 assert(ret);
155
156 if (streq(n, "native"))
157 *ret = SCMP_ARCH_NATIVE;
158 else if (streq(n, "x86"))
159 *ret = SCMP_ARCH_X86;
160 else if (streq(n, "x86-64"))
161 *ret = SCMP_ARCH_X86_64;
162 else if (streq(n, "x32"))
163 *ret = SCMP_ARCH_X32;
164 else if (streq(n, "arm"))
165 *ret = SCMP_ARCH_ARM;
166 else if (streq(n, "arm64"))
167 *ret = SCMP_ARCH_AARCH64;
168 else if (streq(n, "mips"))
169 *ret = SCMP_ARCH_MIPS;
170 else if (streq(n, "mips64"))
171 *ret = SCMP_ARCH_MIPS64;
172 else if (streq(n, "mips64-n32"))
173 *ret = SCMP_ARCH_MIPS64N32;
174 else if (streq(n, "mips-le"))
175 *ret = SCMP_ARCH_MIPSEL;
176 else if (streq(n, "mips64-le"))
177 *ret = SCMP_ARCH_MIPSEL64;
178 else if (streq(n, "mips64-le-n32"))
179 *ret = SCMP_ARCH_MIPSEL64N32;
180 else if (streq(n, "ppc"))
181 *ret = SCMP_ARCH_PPC;
182 else if (streq(n, "ppc64"))
183 *ret = SCMP_ARCH_PPC64;
184 else if (streq(n, "ppc64-le"))
185 *ret = SCMP_ARCH_PPC64LE;
186 #ifdef SCMP_ARCH_RISCV64
187 else if (streq(n, "riscv64"))
188 *ret = SCMP_ARCH_RISCV64;
189 #endif
190 else if (streq(n, "s390"))
191 *ret = SCMP_ARCH_S390;
192 else if (streq(n, "s390x"))
193 *ret = SCMP_ARCH_S390X;
194 else
195 return -EINVAL;
196
197 return 0;
198 }
199
200 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
201 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
202 int r;
203
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
206
207 seccomp = seccomp_init(default_action);
208 if (!seccomp)
209 return -ENOMEM;
210
211 if (arch != SCMP_ARCH_NATIVE &&
212 arch != seccomp_arch_native()) {
213
214 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
215 if (r < 0)
216 return r;
217
218 r = seccomp_arch_add(seccomp, arch);
219 if (r < 0)
220 return r;
221
222 assert(seccomp_arch_exist(seccomp, arch) >= 0);
223 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
224 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
225 } else {
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
228 }
229
230 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
231 if (r < 0)
232 return r;
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
235 if (r < 0)
236 return r;
237
238 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
241 if (r < 0)
242 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
243 }
244 #endif
245
246 *ret = TAKE_PTR(seccomp);
247 return 0;
248 }
249
/* True if prctl(PR_GET_SECCOMP) does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}

/* True if installing a NULL filter program via prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER) fails with
 * EFAULT specifically. Any other outcome is treated as "filter mode not available". */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}

/* Reports whether both checks above succeed. The probe is done on the first call only; the result is
 * remembered in a function-local static and returned as-is by all later calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached_enabled;
}
269
270 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
271 [SYSCALL_FILTER_SET_DEFAULT] = {
272 .name = "@default",
273 .help = "System calls that are always permitted",
274 .value =
275 "clock_getres\0"
276 "clock_getres_time64\0"
277 "clock_gettime\0"
278 "clock_gettime64\0"
279 "clock_nanosleep\0"
280 "clock_nanosleep_time64\0"
281 "execve\0"
282 "exit\0"
283 "exit_group\0"
284 "futex\0"
285 "futex_time64\0"
286 "get_robust_list\0"
287 "get_thread_area\0"
288 "getegid\0"
289 "getegid32\0"
290 "geteuid\0"
291 "geteuid32\0"
292 "getgid\0"
293 "getgid32\0"
294 "getgroups\0"
295 "getgroups32\0"
296 "getpgid\0"
297 "getpgrp\0"
298 "getpid\0"
299 "getppid\0"
300 "getresgid\0"
301 "getresgid32\0"
302 "getresuid\0"
303 "getresuid32\0"
304 "getrlimit\0" /* make sure processes can query stack size and such */
305 "getsid\0"
306 "gettid\0"
307 "gettimeofday\0"
308 "getuid\0"
309 "getuid32\0"
310 "membarrier\0"
311 "nanosleep\0"
312 "pause\0"
313 "prlimit64\0"
314 "restart_syscall\0"
315 "rseq\0"
316 "rt_sigreturn\0"
317 "sched_yield\0"
318 "set_robust_list\0"
319 "set_thread_area\0"
320 "set_tid_address\0"
321 "set_tls\0"
322 "sigreturn\0"
323 "time\0"
324 "ugetrlimit\0"
325 },
326 [SYSCALL_FILTER_SET_AIO] = {
327 .name = "@aio",
328 .help = "Asynchronous IO",
329 .value =
330 "io_cancel\0"
331 "io_destroy\0"
332 "io_getevents\0"
333 "io_pgetevents\0"
334 "io_pgetevents_time64\0"
335 "io_setup\0"
336 "io_submit\0"
337 "io_uring_enter\0"
338 "io_uring_register\0"
339 "io_uring_setup\0"
340 },
341 [SYSCALL_FILTER_SET_BASIC_IO] = {
342 .name = "@basic-io",
343 .help = "Basic IO",
344 .value =
345 "_llseek\0"
346 "close\0"
347 "dup\0"
348 "dup2\0"
349 "dup3\0"
350 "lseek\0"
351 "pread64\0"
352 "preadv\0"
353 "preadv2\0"
354 "pwrite64\0"
355 "pwritev\0"
356 "pwritev2\0"
357 "read\0"
358 "readv\0"
359 "write\0"
360 "writev\0"
361 },
362 [SYSCALL_FILTER_SET_CHOWN] = {
363 .name = "@chown",
364 .help = "Change ownership of files and directories",
365 .value =
366 "chown\0"
367 "chown32\0"
368 "fchown\0"
369 "fchown32\0"
370 "fchownat\0"
371 "lchown\0"
372 "lchown32\0"
373 },
374 [SYSCALL_FILTER_SET_CLOCK] = {
375 .name = "@clock",
376 .help = "Change the system time",
377 .value =
378 "adjtimex\0"
379 "clock_adjtime\0"
380 "clock_adjtime64\0"
381 "clock_settime\0"
382 "clock_settime64\0"
383 "settimeofday\0"
384 "stime\0"
385 },
386 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
387 .name = "@cpu-emulation",
388 .help = "System calls for CPU emulation functionality",
389 .value =
390 "modify_ldt\0"
391 "subpage_prot\0"
392 "switch_endian\0"
393 "vm86\0"
394 "vm86old\0"
395 },
396 [SYSCALL_FILTER_SET_DEBUG] = {
397 .name = "@debug",
398 .help = "Debugging, performance monitoring and tracing functionality",
399 .value =
400 "lookup_dcookie\0"
401 "perf_event_open\0"
402 "pidfd_getfd\0"
403 "ptrace\0"
404 "rtas\0"
405 #if defined __s390__ || defined __s390x__
406 "s390_runtime_instr\0"
407 #endif
408 "sys_debug_setcontext\0"
409 },
410 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
411 .name = "@file-system",
412 .help = "File system operations",
413 .value =
414 "access\0"
415 "chdir\0"
416 "chmod\0"
417 "close\0"
418 "creat\0"
419 "faccessat\0"
420 "faccessat2\0"
421 "fallocate\0"
422 "fchdir\0"
423 "fchmod\0"
424 "fchmodat\0"
425 "fcntl\0"
426 "fcntl64\0"
427 "fgetxattr\0"
428 "flistxattr\0"
429 "fremovexattr\0"
430 "fsetxattr\0"
431 "fstat\0"
432 "fstat64\0"
433 "fstatat64\0"
434 "fstatfs\0"
435 "fstatfs64\0"
436 "ftruncate\0"
437 "ftruncate64\0"
438 "futimesat\0"
439 "getcwd\0"
440 "getdents\0"
441 "getdents64\0"
442 "getxattr\0"
443 "inotify_add_watch\0"
444 "inotify_init\0"
445 "inotify_init1\0"
446 "inotify_rm_watch\0"
447 "lgetxattr\0"
448 "link\0"
449 "linkat\0"
450 "listxattr\0"
451 "llistxattr\0"
452 "lremovexattr\0"
453 "lsetxattr\0"
454 "lstat\0"
455 "lstat64\0"
456 "mkdir\0"
457 "mkdirat\0"
458 "mknod\0"
459 "mknodat\0"
460 "mmap\0"
461 "mmap2\0"
462 "munmap\0"
463 "newfstatat\0"
464 "oldfstat\0"
465 "oldlstat\0"
466 "oldstat\0"
467 "open\0"
468 "openat\0"
469 "openat2\0"
470 "readlink\0"
471 "readlinkat\0"
472 "removexattr\0"
473 "rename\0"
474 "renameat\0"
475 "renameat2\0"
476 "rmdir\0"
477 "setxattr\0"
478 "stat\0"
479 "stat64\0"
480 "statfs\0"
481 "statfs64\0"
482 "statx\0"
483 "symlink\0"
484 "symlinkat\0"
485 "truncate\0"
486 "truncate64\0"
487 "unlink\0"
488 "unlinkat\0"
489 "utime\0"
490 "utimensat\0"
491 "utimensat_time64\0"
492 "utimes\0"
493 },
494 [SYSCALL_FILTER_SET_IO_EVENT] = {
495 .name = "@io-event",
496 .help = "Event loop system calls",
497 .value =
498 "_newselect\0"
499 "epoll_create\0"
500 "epoll_create1\0"
501 "epoll_ctl\0"
502 "epoll_ctl_old\0"
503 "epoll_pwait\0"
504 "epoll_wait\0"
505 "epoll_wait_old\0"
506 "eventfd\0"
507 "eventfd2\0"
508 "poll\0"
509 "ppoll\0"
510 "ppoll_time64\0"
511 "pselect6\0"
512 "pselect6_time64\0"
513 "select\0"
514 },
515 [SYSCALL_FILTER_SET_IPC] = {
516 .name = "@ipc",
517 .help = "SysV IPC, POSIX Message Queues or other IPC",
518 .value =
519 "ipc\0"
520 "memfd_create\0"
521 "mq_getsetattr\0"
522 "mq_notify\0"
523 "mq_open\0"
524 "mq_timedreceive\0"
525 "mq_timedreceive_time64\0"
526 "mq_timedsend\0"
527 "mq_timedsend_time64\0"
528 "mq_unlink\0"
529 "msgctl\0"
530 "msgget\0"
531 "msgrcv\0"
532 "msgsnd\0"
533 "pipe\0"
534 "pipe2\0"
535 "process_vm_readv\0"
536 "process_vm_writev\0"
537 "semctl\0"
538 "semget\0"
539 "semop\0"
540 "semtimedop\0"
541 "semtimedop_time64\0"
542 "shmat\0"
543 "shmctl\0"
544 "shmdt\0"
545 "shmget\0"
546 },
547 [SYSCALL_FILTER_SET_KEYRING] = {
548 .name = "@keyring",
549 .help = "Kernel keyring access",
550 .value =
551 "add_key\0"
552 "keyctl\0"
553 "request_key\0"
554 },
555 [SYSCALL_FILTER_SET_MEMLOCK] = {
556 .name = "@memlock",
557 .help = "Memory locking control",
558 .value =
559 "mlock\0"
560 "mlock2\0"
561 "mlockall\0"
562 "munlock\0"
563 "munlockall\0"
564 },
565 [SYSCALL_FILTER_SET_MODULE] = {
566 .name = "@module",
567 .help = "Loading and unloading of kernel modules",
568 .value =
569 "delete_module\0"
570 "finit_module\0"
571 "init_module\0"
572 },
573 [SYSCALL_FILTER_SET_MOUNT] = {
574 .name = "@mount",
575 .help = "Mounting and unmounting of file systems",
576 .value =
577 "chroot\0"
578 "fsconfig\0"
579 "fsmount\0"
580 "fsopen\0"
581 "fspick\0"
582 "mount\0"
583 "move_mount\0"
584 "open_tree\0"
585 "pivot_root\0"
586 "umount\0"
587 "umount2\0"
588 },
589 [SYSCALL_FILTER_SET_NETWORK_IO] = {
590 .name = "@network-io",
591 .help = "Network or Unix socket IO, should not be needed if not network facing",
592 .value =
593 "accept\0"
594 "accept4\0"
595 "bind\0"
596 "connect\0"
597 "getpeername\0"
598 "getsockname\0"
599 "getsockopt\0"
600 "listen\0"
601 "recv\0"
602 "recvfrom\0"
603 "recvmmsg\0"
604 "recvmmsg_time64\0"
605 "recvmsg\0"
606 "send\0"
607 "sendmmsg\0"
608 "sendmsg\0"
609 "sendto\0"
610 "setsockopt\0"
611 "shutdown\0"
612 "socket\0"
613 "socketcall\0"
614 "socketpair\0"
615 },
616 [SYSCALL_FILTER_SET_OBSOLETE] = {
617 /* some unknown even to libseccomp */
618 .name = "@obsolete",
619 .help = "Unusual, obsolete or unimplemented system calls",
620 .value =
621 "_sysctl\0"
622 "afs_syscall\0"
623 "bdflush\0"
624 "break\0"
625 "create_module\0"
626 "ftime\0"
627 "get_kernel_syms\0"
628 "getpmsg\0"
629 "gtty\0"
630 "idle\0"
631 "lock\0"
632 "mpx\0"
633 "prof\0"
634 "profil\0"
635 "putpmsg\0"
636 "query_module\0"
637 "security\0"
638 "sgetmask\0"
639 "ssetmask\0"
640 "stty\0"
641 "sysfs\0"
642 "tuxcall\0"
643 "ulimit\0"
644 "uselib\0"
645 "ustat\0"
646 "vserver\0"
647 },
648 [SYSCALL_FILTER_SET_PKEY] = {
649 .name = "@pkey",
650 .help = "System calls used for memory protection keys",
651 .value =
652 "pkey_alloc\0"
653 "pkey_free\0"
654 "pkey_mprotect\0"
655 },
656 [SYSCALL_FILTER_SET_PRIVILEGED] = {
657 .name = "@privileged",
658 .help = "All system calls which need super-user capabilities",
659 .value =
660 "@chown\0"
661 "@clock\0"
662 "@module\0"
663 "@raw-io\0"
664 "@reboot\0"
665 "@swap\0"
666 "_sysctl\0"
667 "acct\0"
668 "bpf\0"
669 "capset\0"
670 "chroot\0"
671 "fanotify_init\0"
672 "fanotify_mark\0"
673 "nfsservctl\0"
674 "open_by_handle_at\0"
675 "pivot_root\0"
676 "quotactl\0"
677 "setdomainname\0"
678 "setfsuid\0"
679 "setfsuid32\0"
680 "setgroups\0"
681 "setgroups32\0"
682 "sethostname\0"
683 "setresuid\0"
684 "setresuid32\0"
685 "setreuid\0"
686 "setreuid32\0"
687 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
688 "setuid32\0"
689 "vhangup\0"
690 },
691 [SYSCALL_FILTER_SET_PROCESS] = {
692 .name = "@process",
693 .help = "Process control, execution, namespaceing operations",
694 .value =
695 "arch_prctl\0"
696 "capget\0" /* Able to query arbitrary processes */
697 "clone\0"
698 "clone3\0"
699 "execveat\0"
700 "fork\0"
701 "getrusage\0"
702 "kill\0"
703 "pidfd_open\0"
704 "pidfd_send_signal\0"
705 "prctl\0"
706 "rt_sigqueueinfo\0"
707 "rt_tgsigqueueinfo\0"
708 "setns\0"
709 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
710 "tgkill\0"
711 "times\0"
712 "tkill\0"
713 "unshare\0"
714 "vfork\0"
715 "wait4\0"
716 "waitid\0"
717 "waitpid\0"
718 },
719 [SYSCALL_FILTER_SET_RAW_IO] = {
720 .name = "@raw-io",
721 .help = "Raw I/O port access",
722 .value =
723 "ioperm\0"
724 "iopl\0"
725 "pciconfig_iobase\0"
726 "pciconfig_read\0"
727 "pciconfig_write\0"
728 #if defined __s390__ || defined __s390x__
729 "s390_pci_mmio_read\0"
730 "s390_pci_mmio_write\0"
731 #endif
732 },
733 [SYSCALL_FILTER_SET_REBOOT] = {
734 .name = "@reboot",
735 .help = "Reboot and reboot preparation/kexec",
736 .value =
737 "kexec_file_load\0"
738 "kexec_load\0"
739 "reboot\0"
740 },
741 [SYSCALL_FILTER_SET_RESOURCES] = {
742 .name = "@resources",
743 .help = "Alter resource settings",
744 .value =
745 "ioprio_set\0"
746 "mbind\0"
747 "migrate_pages\0"
748 "move_pages\0"
749 "nice\0"
750 "sched_setaffinity\0"
751 "sched_setattr\0"
752 "sched_setparam\0"
753 "sched_setscheduler\0"
754 "set_mempolicy\0"
755 "setpriority\0"
756 "setrlimit\0"
757 },
758 [SYSCALL_FILTER_SET_SETUID] = {
759 .name = "@setuid",
760 .help = "Operations for changing user/group credentials",
761 .value =
762 "setgid\0"
763 "setgid32\0"
764 "setgroups\0"
765 "setgroups32\0"
766 "setregid\0"
767 "setregid32\0"
768 "setresgid\0"
769 "setresgid32\0"
770 "setresuid\0"
771 "setresuid32\0"
772 "setreuid\0"
773 "setreuid32\0"
774 "setuid\0"
775 "setuid32\0"
776 },
777 [SYSCALL_FILTER_SET_SIGNAL] = {
778 .name = "@signal",
779 .help = "Process signal handling",
780 .value =
781 "rt_sigaction\0"
782 "rt_sigpending\0"
783 "rt_sigprocmask\0"
784 "rt_sigsuspend\0"
785 "rt_sigtimedwait\0"
786 "rt_sigtimedwait_time64\0"
787 "sigaction\0"
788 "sigaltstack\0"
789 "signal\0"
790 "signalfd\0"
791 "signalfd4\0"
792 "sigpending\0"
793 "sigprocmask\0"
794 "sigsuspend\0"
795 },
796 [SYSCALL_FILTER_SET_SWAP] = {
797 .name = "@swap",
798 .help = "Enable/disable swap devices",
799 .value =
800 "swapoff\0"
801 "swapon\0"
802 },
803 [SYSCALL_FILTER_SET_SYNC] = {
804 .name = "@sync",
805 .help = "Synchronize files and memory to storage",
806 .value =
807 "fdatasync\0"
808 "fsync\0"
809 "msync\0"
810 "sync\0"
811 "sync_file_range\0"
812 "sync_file_range2\0"
813 "syncfs\0"
814 },
815 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
816 .name = "@system-service",
817 .help = "General system service operations",
818 .value =
819 "@aio\0"
820 "@basic-io\0"
821 "@chown\0"
822 "@default\0"
823 "@file-system\0"
824 "@io-event\0"
825 "@ipc\0"
826 "@keyring\0"
827 "@memlock\0"
828 "@network-io\0"
829 "@process\0"
830 "@resources\0"
831 "@setuid\0"
832 "@signal\0"
833 "@sync\0"
834 "@timer\0"
835 "brk\0"
836 "capget\0"
837 "capset\0"
838 "copy_file_range\0"
839 "fadvise64\0"
840 "fadvise64_64\0"
841 "flock\0"
842 "get_mempolicy\0"
843 "getcpu\0"
844 "getpriority\0"
845 "getrandom\0"
846 "ioctl\0"
847 "ioprio_get\0"
848 "kcmp\0"
849 "madvise\0"
850 "mprotect\0"
851 "mremap\0"
852 "name_to_handle_at\0"
853 "oldolduname\0"
854 "olduname\0"
855 "personality\0"
856 "readahead\0"
857 "readdir\0"
858 "remap_file_pages\0"
859 "sched_get_priority_max\0"
860 "sched_get_priority_min\0"
861 "sched_getaffinity\0"
862 "sched_getattr\0"
863 "sched_getparam\0"
864 "sched_getscheduler\0"
865 "sched_rr_get_interval\0"
866 "sched_rr_get_interval_time64\0"
867 "sched_yield\0"
868 "sendfile\0"
869 "sendfile64\0"
870 "setfsgid\0"
871 "setfsgid32\0"
872 "setfsuid\0"
873 "setfsuid32\0"
874 "setpgid\0"
875 "setsid\0"
876 "splice\0"
877 "sysinfo\0"
878 "tee\0"
879 "umask\0"
880 "uname\0"
881 "userfaultfd\0"
882 "vmsplice\0"
883 },
884 [SYSCALL_FILTER_SET_TIMER] = {
885 .name = "@timer",
886 .help = "Schedule operations by time",
887 .value =
888 "alarm\0"
889 "getitimer\0"
890 "setitimer\0"
891 "timer_create\0"
892 "timer_delete\0"
893 "timer_getoverrun\0"
894 "timer_gettime\0"
895 "timer_gettime64\0"
896 "timer_settime\0"
897 "timer_settime64\0"
898 "timerfd_create\0"
899 "timerfd_gettime\0"
900 "timerfd_gettime64\0"
901 "timerfd_settime\0"
902 "timerfd_settime64\0"
903 "times\0"
904 },
905 [SYSCALL_FILTER_SET_KNOWN] = {
906 .name = "@known",
907 .help = "All known syscalls declared in the kernel",
908 .value =
909 #include "syscall-list.h"
910 },
911 };
912
913 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
914 if (isempty(name) || name[0] != '@')
915 return NULL;
916
917 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
918 if (streq(syscall_filter_sets[i].name, name))
919 return syscall_filter_sets + i;
920
921 return NULL;
922 }
923
924 static int add_syscall_filter_set(
925 scmp_filter_ctx seccomp,
926 const SyscallFilterSet *set,
927 uint32_t action,
928 char **exclude,
929 bool log_missing,
930 char ***added);
931
932 int seccomp_add_syscall_filter_item(
933 scmp_filter_ctx *seccomp,
934 const char *name,
935 uint32_t action,
936 char **exclude,
937 bool log_missing,
938 char ***added) {
939
940 assert(seccomp);
941 assert(name);
942
943 if (strv_contains(exclude, name))
944 return 0;
945
946 /* Any syscalls that are handled are added to the *added strv. The pointer
947 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
948
949 if (name[0] == '@') {
950 const SyscallFilterSet *other;
951
952 other = syscall_filter_set_find(name);
953 if (!other)
954 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
955 "Filter set %s is not known!",
956 name);
957
958 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
959
960 } else {
961 int id, r;
962
963 id = seccomp_syscall_resolve_name(name);
964 if (id == __NR_SCMP_ERROR) {
965 if (log_missing)
966 log_debug("System call %s is not known, ignoring.", name);
967 return 0;
968 }
969
970 r = seccomp_rule_add_exact(seccomp, action, id, 0);
971 if (r < 0) {
972 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
973 bool ignore = r == -EDOM;
974
975 if (!ignore || log_missing)
976 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
977 name, id, ignore ? ", ignoring" : "");
978 if (!ignore)
979 return r;
980 }
981
982 if (added) {
983 r = strv_extend(added, name);
984 if (r < 0)
985 return r;
986 }
987
988 return 0;
989 }
990 }
991
992 static int add_syscall_filter_set(
993 scmp_filter_ctx seccomp,
994 const SyscallFilterSet *set,
995 uint32_t action,
996 char **exclude,
997 bool log_missing,
998 char ***added) {
999
1000 const char *sys;
1001 int r;
1002
1003 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1004
1005 assert(seccomp);
1006 assert(set);
1007
1008 NULSTR_FOREACH(sys, set->value) {
1009 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1010 if (r < 0)
1011 return r;
1012 }
1013
1014 return 0;
1015 }
1016
1017 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1018 uint32_t arch;
1019 int r;
1020
1021 assert(set);
1022
1023 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1024 * each local arch. */
1025
1026 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1027 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1028
1029 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1030
1031 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1032 if (r < 0)
1033 return r;
1034
1035 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1036 if (r < 0)
1037 return log_debug_errno(r, "Failed to add filter set: %m");
1038
1039 r = seccomp_load(seccomp);
1040 if (ERRNO_IS_SECCOMP_FATAL(r))
1041 return r;
1042 if (r < 0)
1043 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1044 }
1045
1046 return 0;
1047 }
1048
1049 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1050 uint32_t arch;
1051 int r;
1052
1053 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1054 * SyscallFilterSet* table. */
1055
1056 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1057 return 0;
1058
1059 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1060 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1061 void *syscall_id, *val;
1062
1063 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1064
1065 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1066 if (r < 0)
1067 return r;
1068
1069 HASHMAP_FOREACH_KEY(val, syscall_id, set) {
1070 uint32_t a = action;
1071 int id = PTR_TO_INT(syscall_id) - 1;
1072 int error = PTR_TO_INT(val);
1073
1074 if (error == SECCOMP_ERROR_NUMBER_KILL)
1075 a = scmp_act_kill_process();
1076 #ifdef SCMP_ACT_LOG
1077 else if (action == SCMP_ACT_LOG)
1078 a = SCMP_ACT_LOG;
1079 #endif
1080 else if (action != SCMP_ACT_ALLOW && error >= 0)
1081 a = SCMP_ACT_ERRNO(error);
1082
1083 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1084 if (r < 0) {
1085 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1086 _cleanup_free_ char *n = NULL;
1087 bool ignore;
1088
1089 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1090 ignore = r == -EDOM;
1091 if (!ignore || log_missing)
1092 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1093 strna(n), id, ignore ? ", ignoring" : "");
1094 if (!ignore)
1095 return r;
1096 }
1097 }
1098
1099 r = seccomp_load(seccomp);
1100 if (ERRNO_IS_SECCOMP_FATAL(r))
1101 return r;
1102 if (r < 0)
1103 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1104 }
1105
1106 return 0;
1107 }
1108
1109 int seccomp_parse_syscall_filter(
1110 const char *name,
1111 int errno_num,
1112 Hashmap *filter,
1113 SeccompParseFlags flags,
1114 const char *unit,
1115 const char *filename,
1116 unsigned line) {
1117
1118 int r;
1119
1120 assert(name);
1121 assert(filter);
1122
1123 if (name[0] == '@') {
1124 const SyscallFilterSet *set;
1125 const char *i;
1126
1127 set = syscall_filter_set_find(name);
1128 if (!set) {
1129 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1130 return -EINVAL;
1131
1132 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1133 "Unknown system call group, ignoring: %s", name);
1134 return 0;
1135 }
1136
1137 NULSTR_FOREACH(i, set->value) {
1138 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1139 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1140 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1141 * about them. */
1142 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1143 if (r < 0)
1144 return r;
1145 }
1146 } else {
1147 int id;
1148
1149 id = seccomp_syscall_resolve_name(name);
1150 if (id == __NR_SCMP_ERROR) {
1151 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1152 return -EINVAL;
1153
1154 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1155 "Failed to parse system call, ignoring: %s", name);
1156 return 0;
1157 }
1158
1159 /* If we previously wanted to forbid a syscall and now
1160 * we want to allow it, then remove it from the list. */
1161 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
1162 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1163 if (r < 0)
1164 switch (r) {
1165 case -ENOMEM:
1166 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1167 case -EEXIST:
1168 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1169 break;
1170 default:
1171 return r;
1172 }
1173 } else
1174 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1175 }
1176
1177 return 0;
1178 }
1179
1180 int seccomp_restrict_namespaces(unsigned long retain) {
1181 uint32_t arch;
1182 int r;
1183
1184 if (DEBUG_LOGGING) {
1185 _cleanup_free_ char *s = NULL;
1186
1187 (void) namespace_flags_to_string(retain, &s);
1188 log_debug("Restricting namespace to: %s.", strna(s));
1189 }
1190
1191 /* NOOP? */
1192 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1193 return 0;
1194
1195 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1196 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1197
1198 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1199
1200 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1201 if (r < 0)
1202 return r;
1203
1204 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1205 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1206 * altogether. */
1207 r = seccomp_rule_add_exact(
1208 seccomp,
1209 SCMP_ACT_ERRNO(EPERM),
1210 SCMP_SYS(setns),
1211 0);
1212 else
1213 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1214 * special invocation with a zero flags argument, right here. */
1215 r = seccomp_rule_add_exact(
1216 seccomp,
1217 SCMP_ACT_ERRNO(EPERM),
1218 SCMP_SYS(setns),
1219 1,
1220 SCMP_A1(SCMP_CMP_EQ, 0));
1221 if (r < 0) {
1222 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1223 continue;
1224 }
1225
1226 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1227 unsigned long f;
1228
1229 f = namespace_flag_map[i].flag;
1230 if ((retain & f) == f) {
1231 log_debug("Permitting %s.", namespace_flag_map[i].name);
1232 continue;
1233 }
1234
1235 log_debug("Blocking %s.", namespace_flag_map[i].name);
1236
1237 r = seccomp_rule_add_exact(
1238 seccomp,
1239 SCMP_ACT_ERRNO(EPERM),
1240 SCMP_SYS(unshare),
1241 1,
1242 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1243 if (r < 0) {
1244 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1245 break;
1246 }
1247
1248 /* On s390/s390x the first two parameters to clone are switched */
1249 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1250 r = seccomp_rule_add_exact(
1251 seccomp,
1252 SCMP_ACT_ERRNO(EPERM),
1253 SCMP_SYS(clone),
1254 1,
1255 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1256 else
1257 r = seccomp_rule_add_exact(
1258 seccomp,
1259 SCMP_ACT_ERRNO(EPERM),
1260 SCMP_SYS(clone),
1261 1,
1262 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1263 if (r < 0) {
1264 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1265 break;
1266 }
1267
1268 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1269 r = seccomp_rule_add_exact(
1270 seccomp,
1271 SCMP_ACT_ERRNO(EPERM),
1272 SCMP_SYS(setns),
1273 1,
1274 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1275 if (r < 0) {
1276 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1277 break;
1278 }
1279 }
1280 }
1281 if (r < 0)
1282 continue;
1283
1284 r = seccomp_load(seccomp);
1285 if (ERRNO_IS_SECCOMP_FATAL(r))
1286 return r;
1287 if (r < 0)
1288 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1289 }
1290
1291 return 0;
1292 }
1293
1294 int seccomp_protect_sysctl(void) {
1295 uint32_t arch;
1296 int r;
1297
1298 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1299 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1300
1301 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1302
1303 if (IN_SET(arch,
1304 SCMP_ARCH_AARCH64,
1305 #ifdef SCMP_ARCH_RISCV64
1306 SCMP_ARCH_RISCV64,
1307 #endif
1308 SCMP_ARCH_X32
1309 ))
1310 /* No _sysctl syscall */
1311 continue;
1312
1313 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1314 if (r < 0)
1315 return r;
1316
1317 r = seccomp_rule_add_exact(
1318 seccomp,
1319 SCMP_ACT_ERRNO(EPERM),
1320 SCMP_SYS(_sysctl),
1321 0);
1322 if (r < 0) {
1323 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1324 continue;
1325 }
1326
1327 r = seccomp_load(seccomp);
1328 if (ERRNO_IS_SECCOMP_FATAL(r))
1329 return r;
1330 if (r < 0)
1331 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1332 }
1333
1334 return 0;
1335 }
1336
1337 int seccomp_protect_syslog(void) {
1338 uint32_t arch;
1339 int r;
1340
1341 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1342 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1343
1344 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1345 if (r < 0)
1346 return r;
1347
1348 r = seccomp_rule_add_exact(
1349 seccomp,
1350 SCMP_ACT_ERRNO(EPERM),
1351 SCMP_SYS(syslog),
1352 0);
1353
1354 if (r < 0) {
1355 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1356 continue;
1357 }
1358
1359 r = seccomp_load(seccomp);
1360 if (ERRNO_IS_SECCOMP_FATAL(r))
1361 return r;
1362 if (r < 0)
1363 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1364 }
1365
1366 return 0;
1367 }
1368
1369 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1370 uint32_t arch;
1371 int r;
1372
1373 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1374 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1375 bool supported;
1376
1377 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1378
1379 switch (arch) {
1380
1381 case SCMP_ARCH_X86_64:
1382 case SCMP_ARCH_X32:
1383 case SCMP_ARCH_ARM:
1384 case SCMP_ARCH_AARCH64:
1385 case SCMP_ARCH_PPC:
1386 case SCMP_ARCH_PPC64:
1387 case SCMP_ARCH_PPC64LE:
1388 case SCMP_ARCH_MIPSEL64N32:
1389 case SCMP_ARCH_MIPS64N32:
1390 case SCMP_ARCH_MIPSEL64:
1391 case SCMP_ARCH_MIPS64:
1392 #ifdef SCMP_ARCH_RISCV64
1393 case SCMP_ARCH_RISCV64:
1394 #endif
1395 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1396 supported = true;
1397 break;
1398
1399 case SCMP_ARCH_S390:
1400 case SCMP_ARCH_S390X:
1401 case SCMP_ARCH_X86:
1402 case SCMP_ARCH_MIPSEL:
1403 case SCMP_ARCH_MIPS:
1404 default:
1405 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1406 * don't know */
1407 supported = false;
1408 break;
1409 }
1410
1411 if (!supported)
1412 continue;
1413
1414 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1415 if (r < 0)
1416 return r;
1417
1418 if (allow_list) {
1419 int first = 0, last = 0;
1420 void *afp;
1421
1422 /* If this is an allow list, we first block the address families that are out of
1423 * range and then everything that is not in the set. First, we find the lowest and
1424 * highest address family in the set. */
1425
1426 SET_FOREACH(afp, address_families) {
1427 int af = PTR_TO_INT(afp);
1428
1429 if (af <= 0 || af >= af_max())
1430 continue;
1431
1432 if (first == 0 || af < first)
1433 first = af;
1434
1435 if (last == 0 || af > last)
1436 last = af;
1437 }
1438
1439 assert((first == 0) == (last == 0));
1440
1441 if (first == 0) {
1442
1443 /* No entries in the valid range, block everything */
1444 r = seccomp_rule_add_exact(
1445 seccomp,
1446 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1447 SCMP_SYS(socket),
1448 0);
1449 if (r < 0) {
1450 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1451 continue;
1452 }
1453
1454 } else {
1455
1456 /* Block everything below the first entry */
1457 r = seccomp_rule_add_exact(
1458 seccomp,
1459 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1460 SCMP_SYS(socket),
1461 1,
1462 SCMP_A0(SCMP_CMP_LT, first));
1463 if (r < 0) {
1464 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1465 continue;
1466 }
1467
1468 /* Block everything above the last entry */
1469 r = seccomp_rule_add_exact(
1470 seccomp,
1471 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1472 SCMP_SYS(socket),
1473 1,
1474 SCMP_A0(SCMP_CMP_GT, last));
1475 if (r < 0) {
1476 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 continue;
1478 }
1479
1480 /* Block everything between the first and last entry */
1481 for (int af = 1; af < af_max(); af++) {
1482
1483 if (set_contains(address_families, INT_TO_PTR(af)))
1484 continue;
1485
1486 r = seccomp_rule_add_exact(
1487 seccomp,
1488 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1489 SCMP_SYS(socket),
1490 1,
1491 SCMP_A0(SCMP_CMP_EQ, af));
1492 if (r < 0)
1493 break;
1494 }
1495 if (r < 0) {
1496 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1497 continue;
1498 }
1499 }
1500
1501 } else {
1502 void *af;
1503
1504 /* If this is a deny list, then generate one rule for each address family that are
1505 * then combined in OR checks. */
1506
1507 SET_FOREACH(af, address_families) {
1508 r = seccomp_rule_add_exact(
1509 seccomp,
1510 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1511 SCMP_SYS(socket),
1512 1,
1513 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1514 if (r < 0)
1515 break;
1516 }
1517 if (r < 0) {
1518 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1519 continue;
1520 }
1521 }
1522
1523 r = seccomp_load(seccomp);
1524 if (ERRNO_IS_SECCOMP_FATAL(r))
1525 return r;
1526 if (r < 0)
1527 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1528 }
1529
1530 return 0;
1531 }
1532
1533 int seccomp_restrict_realtime(void) {
1534 static const int permitted_policies[] = {
1535 SCHED_OTHER,
1536 SCHED_BATCH,
1537 SCHED_IDLE,
1538 };
1539
1540 int r, max_policy = 0;
1541 uint32_t arch;
1542 unsigned i;
1543
1544 /* Determine the highest policy constant we want to allow */
1545 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1546 if (permitted_policies[i] > max_policy)
1547 max_policy = permitted_policies[i];
1548
1549 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1550 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1551 int p;
1552
1553 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1554
1555 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1556 if (r < 0)
1557 return r;
1558
1559 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1560 * allow list. */
1561 for (p = 0; p < max_policy; p++) {
1562 bool good = false;
1563
1564 /* Check if this is in the allow list. */
1565 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1566 if (permitted_policies[i] == p) {
1567 good = true;
1568 break;
1569 }
1570
1571 if (good)
1572 continue;
1573
1574 /* Deny this policy */
1575 r = seccomp_rule_add_exact(
1576 seccomp,
1577 SCMP_ACT_ERRNO(EPERM),
1578 SCMP_SYS(sched_setscheduler),
1579 1,
1580 SCMP_A1(SCMP_CMP_EQ, p));
1581 if (r < 0) {
1582 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1583 continue;
1584 }
1585 }
1586
1587 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1588 * are unsigned here, hence no need no check for < 0 values. */
1589 r = seccomp_rule_add_exact(
1590 seccomp,
1591 SCMP_ACT_ERRNO(EPERM),
1592 SCMP_SYS(sched_setscheduler),
1593 1,
1594 SCMP_A1(SCMP_CMP_GT, max_policy));
1595 if (r < 0) {
1596 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1597 continue;
1598 }
1599
1600 r = seccomp_load(seccomp);
1601 if (ERRNO_IS_SECCOMP_FATAL(r))
1602 return r;
1603 if (r < 0)
1604 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1605 }
1606
1607 return 0;
1608 }
1609
1610 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1611 uint32_t arch,
1612 int nr,
1613 unsigned arg_cnt,
1614 const struct scmp_arg_cmp arg) {
1615 int r;
1616
1617 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1618 if (r < 0) {
1619 _cleanup_free_ char *n = NULL;
1620
1621 n = seccomp_syscall_resolve_num_arch(arch, nr);
1622 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1623 strna(n),
1624 seccomp_arch_to_string(arch));
1625 }
1626
1627 return r;
1628 }
1629
1630 /* For known architectures, check that syscalls are indeed defined or not. */
1631 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1632 assert_cc(SCMP_SYS(shmget) > 0);
1633 assert_cc(SCMP_SYS(shmat) > 0);
1634 assert_cc(SCMP_SYS(shmdt) > 0);
1635 #endif
1636
1637 int seccomp_memory_deny_write_execute(void) {
1638 uint32_t arch;
1639 unsigned loaded = 0;
1640
1641 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1642 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1643 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1644
1645 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1646
1647 switch (arch) {
1648
1649 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1650 * We ignore that here, which means there's still a way to get writable/executable
1651 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1652
1653 case SCMP_ARCH_X86:
1654 case SCMP_ARCH_S390:
1655 filter_syscall = SCMP_SYS(mmap2);
1656 block_syscall = SCMP_SYS(mmap);
1657 /* shmat multiplexed, see above */
1658 break;
1659
1660 case SCMP_ARCH_PPC:
1661 case SCMP_ARCH_PPC64:
1662 case SCMP_ARCH_PPC64LE:
1663 case SCMP_ARCH_S390X:
1664 filter_syscall = SCMP_SYS(mmap);
1665 /* shmat multiplexed, see above */
1666 break;
1667
1668 case SCMP_ARCH_ARM:
1669 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1670 shmat_syscall = SCMP_SYS(shmat);
1671 break;
1672
1673 case SCMP_ARCH_X86_64:
1674 case SCMP_ARCH_X32:
1675 case SCMP_ARCH_AARCH64:
1676 #ifdef SCMP_ARCH_RISCV64
1677 case SCMP_ARCH_RISCV64:
1678 #endif
1679 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1680 shmat_syscall = SCMP_SYS(shmat);
1681 break;
1682
1683 /* Please add more definitions here, if you port systemd to other architectures! */
1684
1685 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1686 #warning "Consider adding the right mmap() syscall definitions here!"
1687 #endif
1688 }
1689
1690 /* Can't filter mmap() on this arch, then skip it */
1691 if (filter_syscall == 0)
1692 continue;
1693
1694 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1695 if (r < 0)
1696 return r;
1697
1698 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1699 1,
1700 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1701 if (r < 0)
1702 continue;
1703
1704 if (block_syscall != 0) {
1705 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1706 if (r < 0)
1707 continue;
1708 }
1709
1710 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1711 1,
1712 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1713 if (r < 0)
1714 continue;
1715
1716 #ifdef __NR_pkey_mprotect
1717 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1718 1,
1719 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1720 if (r < 0)
1721 continue;
1722 #endif
1723
1724 if (shmat_syscall > 0) {
1725 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1726 1,
1727 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1728 if (r < 0)
1729 continue;
1730 }
1731
1732 r = seccomp_load(seccomp);
1733 if (ERRNO_IS_SECCOMP_FATAL(r))
1734 return r;
1735 if (r < 0)
1736 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1737 seccomp_arch_to_string(arch));
1738 loaded++;
1739 }
1740
1741 if (loaded == 0)
1742 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1743
1744 return loaded;
1745 }
1746
1747 int seccomp_restrict_archs(Set *archs) {
1748 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1749 void *id;
1750 int r;
1751
1752 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1753 * list.
1754 *
1755 * There are some qualifications. However the most important use is to stop processes from bypassing
1756 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1757 * in a non-native architecture. There are no holes in this use case, at least so far. */
1758
1759 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1760 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1761 * to run a program with the restrictions applied. */
1762 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1763 if (!seccomp)
1764 return -ENOMEM;
1765
1766 SET_FOREACH(id, archs) {
1767 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1768 if (r < 0 && r != -EEXIST)
1769 return r;
1770 }
1771
1772 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1773 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1774 * The important thing is that you can block the old 32-bit x86 syscalls.
1775 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1776
1777 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1778 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1779
1780 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1781 if (r < 0 && r != -EEXIST)
1782 return r;
1783 }
1784
1785 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1786 if (r < 0)
1787 return r;
1788
1789 r = seccomp_load(seccomp);
1790 if (ERRNO_IS_SECCOMP_FATAL(r))
1791 return r;
1792 if (r < 0)
1793 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1794
1795 return 0;
1796 }
1797
1798 int parse_syscall_archs(char **l, Set **ret_archs) {
1799 _cleanup_set_free_ Set *archs = NULL;
1800 char **s;
1801 int r;
1802
1803 assert(l);
1804 assert(ret_archs);
1805
1806 STRV_FOREACH(s, l) {
1807 uint32_t a;
1808
1809 r = seccomp_arch_from_string(*s, &a);
1810 if (r < 0)
1811 return -EINVAL;
1812
1813 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1814 if (r < 0)
1815 return -ENOMEM;
1816 }
1817
1818 *ret_archs = TAKE_PTR(archs);
1819 return 0;
1820 }
1821
1822 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1823 const char *i;
1824 int r;
1825
1826 assert(set);
1827
1828 NULSTR_FOREACH(i, set->value) {
1829
1830 if (i[0] == '@') {
1831 const SyscallFilterSet *more;
1832
1833 more = syscall_filter_set_find(i);
1834 if (!more)
1835 return -ENXIO;
1836
1837 r = seccomp_filter_set_add(filter, add, more);
1838 if (r < 0)
1839 return r;
1840 } else {
1841 int id;
1842
1843 id = seccomp_syscall_resolve_name(i);
1844 if (id == __NR_SCMP_ERROR) {
1845 log_debug("Couldn't resolve system call, ignoring: %s", i);
1846 continue;
1847 }
1848
1849 if (add) {
1850 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1851 if (r < 0)
1852 return r;
1853 } else
1854 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1855 }
1856 }
1857
1858 return 0;
1859 }
1860
1861 int seccomp_lock_personality(unsigned long personality) {
1862 uint32_t arch;
1863 int r;
1864
1865 if (personality >= PERSONALITY_INVALID)
1866 return -EINVAL;
1867
1868 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1869 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1870
1871 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1872 if (r < 0)
1873 return r;
1874
1875 r = seccomp_rule_add_exact(
1876 seccomp,
1877 SCMP_ACT_ERRNO(EPERM),
1878 SCMP_SYS(personality),
1879 1,
1880 SCMP_A0(SCMP_CMP_NE, personality));
1881 if (r < 0) {
1882 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1883 continue;
1884 }
1885
1886 r = seccomp_load(seccomp);
1887 if (ERRNO_IS_SECCOMP_FATAL(r))
1888 return r;
1889 if (r < 0)
1890 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1891 }
1892
1893 return 0;
1894 }
1895
1896 int seccomp_protect_hostname(void) {
1897 uint32_t arch;
1898 int r;
1899
1900 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1901 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1902
1903 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1904 if (r < 0)
1905 return r;
1906
1907 r = seccomp_rule_add_exact(
1908 seccomp,
1909 SCMP_ACT_ERRNO(EPERM),
1910 SCMP_SYS(sethostname),
1911 0);
1912 if (r < 0) {
1913 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1914 continue;
1915 }
1916
1917 r = seccomp_rule_add_exact(
1918 seccomp,
1919 SCMP_ACT_ERRNO(EPERM),
1920 SCMP_SYS(setdomainname),
1921 0);
1922 if (r < 0) {
1923 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1924 continue;
1925 }
1926
1927 r = seccomp_load(seccomp);
1928 if (ERRNO_IS_SECCOMP_FATAL(r))
1929 return r;
1930 if (r < 0)
1931 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1932 }
1933
1934 return 0;
1935 }
1936
1937 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1938 /* Checks the mode_t parameter of the following system calls:
1939 *
1940 * → chmod() + fchmod() + fchmodat()
1941 * → open() + creat() + openat()
1942 * → mkdir() + mkdirat()
1943 * → mknod() + mknodat()
1944 *
1945 * Returns error if *everything* failed, and 0 otherwise.
1946 */
1947 int r;
1948 bool any = false;
1949
1950 r = seccomp_rule_add_exact(
1951 seccomp,
1952 SCMP_ACT_ERRNO(EPERM),
1953 SCMP_SYS(chmod),
1954 1,
1955 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1956 if (r < 0)
1957 log_debug_errno(r, "Failed to add filter for chmod: %m");
1958 else
1959 any = true;
1960
1961 r = seccomp_rule_add_exact(
1962 seccomp,
1963 SCMP_ACT_ERRNO(EPERM),
1964 SCMP_SYS(fchmod),
1965 1,
1966 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1967 if (r < 0)
1968 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1969 else
1970 any = true;
1971
1972 r = seccomp_rule_add_exact(
1973 seccomp,
1974 SCMP_ACT_ERRNO(EPERM),
1975 SCMP_SYS(fchmodat),
1976 1,
1977 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1978 if (r < 0)
1979 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1980 else
1981 any = true;
1982
1983 r = seccomp_rule_add_exact(
1984 seccomp,
1985 SCMP_ACT_ERRNO(EPERM),
1986 SCMP_SYS(mkdir),
1987 1,
1988 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1989 if (r < 0)
1990 log_debug_errno(r, "Failed to add filter for mkdir: %m");
1991 else
1992 any = true;
1993
1994 r = seccomp_rule_add_exact(
1995 seccomp,
1996 SCMP_ACT_ERRNO(EPERM),
1997 SCMP_SYS(mkdirat),
1998 1,
1999 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2000 if (r < 0)
2001 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2002 else
2003 any = true;
2004
2005 r = seccomp_rule_add_exact(
2006 seccomp,
2007 SCMP_ACT_ERRNO(EPERM),
2008 SCMP_SYS(mknod),
2009 1,
2010 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2011 if (r < 0)
2012 log_debug_errno(r, "Failed to add filter for mknod: %m");
2013 else
2014 any = true;
2015
2016 r = seccomp_rule_add_exact(
2017 seccomp,
2018 SCMP_ACT_ERRNO(EPERM),
2019 SCMP_SYS(mknodat),
2020 1,
2021 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2022 if (r < 0)
2023 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2024 else
2025 any = true;
2026
2027 #if SCMP_SYS(open) > 0
2028 r = seccomp_rule_add_exact(
2029 seccomp,
2030 SCMP_ACT_ERRNO(EPERM),
2031 SCMP_SYS(open),
2032 2,
2033 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2034 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2035 if (r < 0)
2036 log_debug_errno(r, "Failed to add filter for open: %m");
2037 else
2038 any = true;
2039 #endif
2040
2041 r = seccomp_rule_add_exact(
2042 seccomp,
2043 SCMP_ACT_ERRNO(EPERM),
2044 SCMP_SYS(openat),
2045 2,
2046 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2047 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2048 if (r < 0)
2049 log_debug_errno(r, "Failed to add filter for openat: %m");
2050 else
2051 any = true;
2052
2053 #if defined(__SNR_openat2)
2054 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2055 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2056 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2057 * compatible with kernels that are not absolutely recent. */
2058 r = seccomp_rule_add_exact(
2059 seccomp,
2060 SCMP_ACT_ERRNO(EPERM),
2061 SCMP_SYS(openat2),
2062 0);
2063 if (r < 0)
2064 log_debug_errno(r, "Failed to add filter for openat2: %m");
2065 else
2066 any = true;
2067 #endif
2068
2069 r = seccomp_rule_add_exact(
2070 seccomp,
2071 SCMP_ACT_ERRNO(EPERM),
2072 SCMP_SYS(creat),
2073 1,
2074 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2075 if (r < 0)
2076 log_debug_errno(r, "Failed to add filter for creat: %m");
2077 else
2078 any = true;
2079
2080 return any ? 0 : r;
2081 }
2082
2083 int seccomp_restrict_suid_sgid(void) {
2084 uint32_t arch;
2085 int r, k;
2086
2087 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2088 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2089
2090 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2091 if (r < 0)
2092 return r;
2093
2094 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2095 if (r < 0)
2096 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2097
2098 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2099 if (k < 0)
2100 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2101
2102 if (r < 0 && k < 0)
2103 continue;
2104
2105 r = seccomp_load(seccomp);
2106 if (ERRNO_IS_SECCOMP_FATAL(r))
2107 return r;
2108 if (r < 0)
2109 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2110 }
2111
2112 return 0;
2113 }
2114
2115 uint32_t scmp_act_kill_process(void) {
2116
2117 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2118 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2119 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2120 * for single-threaded apps does the right thing. */
2121
2122 #ifdef SCMP_ACT_KILL_PROCESS
2123 if (seccomp_api_get() >= 3)
2124 return SCMP_ACT_KILL_PROCESS;
2125 #endif
2126
2127 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2128 }