]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
license: LGPL-2.1+ -> LGPL-2.1-or-later
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "env-util.h"
16 #include "errno-list.h"
17 #include "macro.h"
18 #include "nsflags.h"
19 #include "nulstr-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
22 #include "set.h"
23 #include "string-util.h"
24 #include "strv.h"
25
26 const uint32_t seccomp_local_archs[] = {
27
28 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
29
30 #if defined(__x86_64__) && defined(__ILP32__)
31 SCMP_ARCH_X86,
32 SCMP_ARCH_X86_64,
33 SCMP_ARCH_X32, /* native */
34 #elif defined(__x86_64__) && !defined(__ILP32__)
35 SCMP_ARCH_X86,
36 SCMP_ARCH_X32,
37 SCMP_ARCH_X86_64, /* native */
38 #elif defined(__i386__)
39 SCMP_ARCH_X86,
40 #elif defined(__aarch64__)
41 SCMP_ARCH_ARM,
42 SCMP_ARCH_AARCH64, /* native */
43 #elif defined(__arm__)
44 SCMP_ARCH_ARM,
45 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPSEL,
47 SCMP_ARCH_MIPS, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
49 SCMP_ARCH_MIPS,
50 SCMP_ARCH_MIPSEL, /* native */
51 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL64N32,
55 SCMP_ARCH_MIPS64N32,
56 SCMP_ARCH_MIPSEL64,
57 SCMP_ARCH_MIPS64, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPS,
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS64N32,
62 SCMP_ARCH_MIPSEL64N32,
63 SCMP_ARCH_MIPS64,
64 SCMP_ARCH_MIPSEL64, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64,
69 SCMP_ARCH_MIPS64,
70 SCMP_ARCH_MIPSEL64N32,
71 SCMP_ARCH_MIPS64N32, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64,
76 SCMP_ARCH_MIPSEL64,
77 SCMP_ARCH_MIPS64N32,
78 SCMP_ARCH_MIPSEL64N32, /* native */
79 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
80 SCMP_ARCH_PPC,
81 SCMP_ARCH_PPC64LE,
82 SCMP_ARCH_PPC64, /* native */
83 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64,
86 SCMP_ARCH_PPC64LE, /* native */
87 #elif defined(__powerpc__)
88 SCMP_ARCH_PPC,
89 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
90 SCMP_ARCH_RISCV64,
91 #elif defined(__s390x__)
92 SCMP_ARCH_S390,
93 SCMP_ARCH_S390X, /* native */
94 #elif defined(__s390__)
95 SCMP_ARCH_S390,
96 #endif
97 (uint32_t) -1
98 };
99
100 const char* seccomp_arch_to_string(uint32_t c) {
101 /* Maintain order used in <seccomp.h>.
102 *
103 * Names used here should be the same as those used for ConditionArchitecture=,
104 * except for "subarchitectures" like x32. */
105
106 switch(c) {
107 case SCMP_ARCH_NATIVE:
108 return "native";
109 case SCMP_ARCH_X86:
110 return "x86";
111 case SCMP_ARCH_X86_64:
112 return "x86-64";
113 case SCMP_ARCH_X32:
114 return "x32";
115 case SCMP_ARCH_ARM:
116 return "arm";
117 case SCMP_ARCH_AARCH64:
118 return "arm64";
119 case SCMP_ARCH_MIPS:
120 return "mips";
121 case SCMP_ARCH_MIPS64:
122 return "mips64";
123 case SCMP_ARCH_MIPS64N32:
124 return "mips64-n32";
125 case SCMP_ARCH_MIPSEL:
126 return "mips-le";
127 case SCMP_ARCH_MIPSEL64:
128 return "mips64-le";
129 case SCMP_ARCH_MIPSEL64N32:
130 return "mips64-le-n32";
131 case SCMP_ARCH_PPC:
132 return "ppc";
133 case SCMP_ARCH_PPC64:
134 return "ppc64";
135 case SCMP_ARCH_PPC64LE:
136 return "ppc64-le";
137 #ifdef SCMP_ARCH_RISCV64
138 case SCMP_ARCH_RISCV64:
139 return "riscv64";
140 #endif
141 case SCMP_ARCH_S390:
142 return "s390";
143 case SCMP_ARCH_S390X:
144 return "s390x";
145 default:
146 return NULL;
147 }
148 }
149
150 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
151 if (!n)
152 return -EINVAL;
153
154 assert(ret);
155
156 if (streq(n, "native"))
157 *ret = SCMP_ARCH_NATIVE;
158 else if (streq(n, "x86"))
159 *ret = SCMP_ARCH_X86;
160 else if (streq(n, "x86-64"))
161 *ret = SCMP_ARCH_X86_64;
162 else if (streq(n, "x32"))
163 *ret = SCMP_ARCH_X32;
164 else if (streq(n, "arm"))
165 *ret = SCMP_ARCH_ARM;
166 else if (streq(n, "arm64"))
167 *ret = SCMP_ARCH_AARCH64;
168 else if (streq(n, "mips"))
169 *ret = SCMP_ARCH_MIPS;
170 else if (streq(n, "mips64"))
171 *ret = SCMP_ARCH_MIPS64;
172 else if (streq(n, "mips64-n32"))
173 *ret = SCMP_ARCH_MIPS64N32;
174 else if (streq(n, "mips-le"))
175 *ret = SCMP_ARCH_MIPSEL;
176 else if (streq(n, "mips64-le"))
177 *ret = SCMP_ARCH_MIPSEL64;
178 else if (streq(n, "mips64-le-n32"))
179 *ret = SCMP_ARCH_MIPSEL64N32;
180 else if (streq(n, "ppc"))
181 *ret = SCMP_ARCH_PPC;
182 else if (streq(n, "ppc64"))
183 *ret = SCMP_ARCH_PPC64;
184 else if (streq(n, "ppc64-le"))
185 *ret = SCMP_ARCH_PPC64LE;
186 #ifdef SCMP_ARCH_RISCV64
187 else if (streq(n, "riscv64"))
188 *ret = SCMP_ARCH_RISCV64;
189 #endif
190 else if (streq(n, "s390"))
191 *ret = SCMP_ARCH_S390;
192 else if (streq(n, "s390x"))
193 *ret = SCMP_ARCH_S390X;
194 else
195 return -EINVAL;
196
197 return 0;
198 }
199
200 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
201 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
202 int r;
203
204 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
205 * any others. Also, turns off the NNP fiddling. */
206
207 seccomp = seccomp_init(default_action);
208 if (!seccomp)
209 return -ENOMEM;
210
211 if (arch != SCMP_ARCH_NATIVE &&
212 arch != seccomp_arch_native()) {
213
214 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
215 if (r < 0)
216 return r;
217
218 r = seccomp_arch_add(seccomp, arch);
219 if (r < 0)
220 return r;
221
222 assert(seccomp_arch_exist(seccomp, arch) >= 0);
223 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
224 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
225 } else {
226 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
227 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
228 }
229
230 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
231 if (r < 0)
232 return r;
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
235 if (r < 0)
236 return r;
237
238 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
239 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
240 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
241 if (r < 0)
242 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
243 }
244 #endif
245
246 *ret = TAKE_PTR(seccomp);
247 return 0;
248 }
249
/* Checks whether the kernel answers the PR_GET_SECCOMP query at all, i.e. prctl() does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
/* Probes for seccomp filter mode by requesting it with a NULL filter program. The request is expected
 * to fail; filter mode is considered available only if the failure is EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns whether seccomp filtering can be used. The answer is computed on the first call and cached in
 * a static variable afterwards. Setting $SYSTEMD_SECCOMP to a false value turns seccomp off; if the
 * variable is unset, true or unparsable, the kernel is probed. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;
        int b;

        /* Already determined by an earlier call? */
        if (cached_enabled >= 0)
                return cached_enabled;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly disabled through the environment */
                cached_enabled = false;
                return cached_enabled;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
279
280 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
281 [SYSCALL_FILTER_SET_DEFAULT] = {
282 .name = "@default",
283 .help = "System calls that are always permitted",
284 .value =
285 "cacheflush\0"
286 "clock_getres\0"
287 "clock_getres_time64\0"
288 "clock_gettime\0"
289 "clock_gettime64\0"
290 "clock_nanosleep\0"
291 "clock_nanosleep_time64\0"
292 "execve\0"
293 "exit\0"
294 "exit_group\0"
295 "futex\0"
296 "futex_time64\0"
297 "get_robust_list\0"
298 "get_thread_area\0"
299 "getegid\0"
300 "getegid32\0"
301 "geteuid\0"
302 "geteuid32\0"
303 "getgid\0"
304 "getgid32\0"
305 "getgroups\0"
306 "getgroups32\0"
307 "getpgid\0"
308 "getpgrp\0"
309 "getpid\0"
310 "getppid\0"
311 "getresgid\0"
312 "getresgid32\0"
313 "getresuid\0"
314 "getresuid32\0"
315 "getrlimit\0" /* make sure processes can query stack size and such */
316 "getsid\0"
317 "gettid\0"
318 "gettimeofday\0"
319 "getuid\0"
320 "getuid32\0"
321 "membarrier\0"
322 "nanosleep\0"
323 "pause\0"
324 "prlimit64\0"
325 "restart_syscall\0"
326 "rseq\0"
327 "rt_sigreturn\0"
328 "sched_yield\0"
329 "set_robust_list\0"
330 "set_thread_area\0"
331 "set_tid_address\0"
332 "set_tls\0"
333 "sigreturn\0"
334 "time\0"
335 "ugetrlimit\0"
336 },
337 [SYSCALL_FILTER_SET_AIO] = {
338 .name = "@aio",
339 .help = "Asynchronous IO",
340 .value =
341 "io_cancel\0"
342 "io_destroy\0"
343 "io_getevents\0"
344 "io_pgetevents\0"
345 "io_pgetevents_time64\0"
346 "io_setup\0"
347 "io_submit\0"
348 "io_uring_enter\0"
349 "io_uring_register\0"
350 "io_uring_setup\0"
351 },
352 [SYSCALL_FILTER_SET_BASIC_IO] = {
353 .name = "@basic-io",
354 .help = "Basic IO",
355 .value =
356 "_llseek\0"
357 "close\0"
358 "close_range\0"
359 "dup\0"
360 "dup2\0"
361 "dup3\0"
362 "lseek\0"
363 "pread64\0"
364 "preadv\0"
365 "preadv2\0"
366 "pwrite64\0"
367 "pwritev\0"
368 "pwritev2\0"
369 "read\0"
370 "readv\0"
371 "write\0"
372 "writev\0"
373 },
374 [SYSCALL_FILTER_SET_CHOWN] = {
375 .name = "@chown",
376 .help = "Change ownership of files and directories",
377 .value =
378 "chown\0"
379 "chown32\0"
380 "fchown\0"
381 "fchown32\0"
382 "fchownat\0"
383 "lchown\0"
384 "lchown32\0"
385 },
386 [SYSCALL_FILTER_SET_CLOCK] = {
387 .name = "@clock",
388 .help = "Change the system time",
389 .value =
390 "adjtimex\0"
391 "clock_adjtime\0"
392 "clock_adjtime64\0"
393 "clock_settime\0"
394 "clock_settime64\0"
395 "settimeofday\0"
396 },
397 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
398 .name = "@cpu-emulation",
399 .help = "System calls for CPU emulation functionality",
400 .value =
401 "modify_ldt\0"
402 "subpage_prot\0"
403 "switch_endian\0"
404 "vm86\0"
405 "vm86old\0"
406 },
407 [SYSCALL_FILTER_SET_DEBUG] = {
408 .name = "@debug",
409 .help = "Debugging, performance monitoring and tracing functionality",
410 .value =
411 "lookup_dcookie\0"
412 "perf_event_open\0"
413 "pidfd_getfd\0"
414 "ptrace\0"
415 "rtas\0"
416 #if defined __s390__ || defined __s390x__
417 "s390_runtime_instr\0"
418 #endif
419 "sys_debug_setcontext\0"
420 },
421 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
422 .name = "@file-system",
423 .help = "File system operations",
424 .value =
425 "access\0"
426 "chdir\0"
427 "chmod\0"
428 "close\0"
429 "creat\0"
430 "faccessat\0"
431 "faccessat2\0"
432 "fallocate\0"
433 "fchdir\0"
434 "fchmod\0"
435 "fchmodat\0"
436 "fcntl\0"
437 "fcntl64\0"
438 "fgetxattr\0"
439 "flistxattr\0"
440 "fremovexattr\0"
441 "fsetxattr\0"
442 "fstat\0"
443 "fstat64\0"
444 "fstatat64\0"
445 "fstatfs\0"
446 "fstatfs64\0"
447 "ftruncate\0"
448 "ftruncate64\0"
449 "futimesat\0"
450 "getcwd\0"
451 "getdents\0"
452 "getdents64\0"
453 "getxattr\0"
454 "inotify_add_watch\0"
455 "inotify_init\0"
456 "inotify_init1\0"
457 "inotify_rm_watch\0"
458 "lgetxattr\0"
459 "link\0"
460 "linkat\0"
461 "listxattr\0"
462 "llistxattr\0"
463 "lremovexattr\0"
464 "lsetxattr\0"
465 "lstat\0"
466 "lstat64\0"
467 "mkdir\0"
468 "mkdirat\0"
469 "mknod\0"
470 "mknodat\0"
471 "mmap\0"
472 "mmap2\0"
473 "munmap\0"
474 "newfstatat\0"
475 "oldfstat\0"
476 "oldlstat\0"
477 "oldstat\0"
478 "open\0"
479 "openat\0"
480 "openat2\0"
481 "readlink\0"
482 "readlinkat\0"
483 "removexattr\0"
484 "rename\0"
485 "renameat\0"
486 "renameat2\0"
487 "rmdir\0"
488 "setxattr\0"
489 "stat\0"
490 "stat64\0"
491 "statfs\0"
492 "statfs64\0"
493 "statx\0"
494 "symlink\0"
495 "symlinkat\0"
496 "truncate\0"
497 "truncate64\0"
498 "unlink\0"
499 "unlinkat\0"
500 "utime\0"
501 "utimensat\0"
502 "utimensat_time64\0"
503 "utimes\0"
504 },
505 [SYSCALL_FILTER_SET_IO_EVENT] = {
506 .name = "@io-event",
507 .help = "Event loop system calls",
508 .value =
509 "_newselect\0"
510 "epoll_create\0"
511 "epoll_create1\0"
512 "epoll_ctl\0"
513 "epoll_ctl_old\0"
514 "epoll_pwait\0"
515 "epoll_wait\0"
516 "epoll_wait_old\0"
517 "eventfd\0"
518 "eventfd2\0"
519 "poll\0"
520 "ppoll\0"
521 "ppoll_time64\0"
522 "pselect6\0"
523 "pselect6_time64\0"
524 "select\0"
525 },
526 [SYSCALL_FILTER_SET_IPC] = {
527 .name = "@ipc",
528 .help = "SysV IPC, POSIX Message Queues or other IPC",
529 .value =
530 "ipc\0"
531 "memfd_create\0"
532 "mq_getsetattr\0"
533 "mq_notify\0"
534 "mq_open\0"
535 "mq_timedreceive\0"
536 "mq_timedreceive_time64\0"
537 "mq_timedsend\0"
538 "mq_timedsend_time64\0"
539 "mq_unlink\0"
540 "msgctl\0"
541 "msgget\0"
542 "msgrcv\0"
543 "msgsnd\0"
544 "pipe\0"
545 "pipe2\0"
546 "process_vm_readv\0"
547 "process_vm_writev\0"
548 "semctl\0"
549 "semget\0"
550 "semop\0"
551 "semtimedop\0"
552 "semtimedop_time64\0"
553 "shmat\0"
554 "shmctl\0"
555 "shmdt\0"
556 "shmget\0"
557 },
558 [SYSCALL_FILTER_SET_KEYRING] = {
559 .name = "@keyring",
560 .help = "Kernel keyring access",
561 .value =
562 "add_key\0"
563 "keyctl\0"
564 "request_key\0"
565 },
566 [SYSCALL_FILTER_SET_MEMLOCK] = {
567 .name = "@memlock",
568 .help = "Memory locking control",
569 .value =
570 "mlock\0"
571 "mlock2\0"
572 "mlockall\0"
573 "munlock\0"
574 "munlockall\0"
575 },
576 [SYSCALL_FILTER_SET_MODULE] = {
577 .name = "@module",
578 .help = "Loading and unloading of kernel modules",
579 .value =
580 "delete_module\0"
581 "finit_module\0"
582 "init_module\0"
583 },
584 [SYSCALL_FILTER_SET_MOUNT] = {
585 .name = "@mount",
586 .help = "Mounting and unmounting of file systems",
587 .value =
588 "chroot\0"
589 "fsconfig\0"
590 "fsmount\0"
591 "fsopen\0"
592 "fspick\0"
593 "mount\0"
594 "move_mount\0"
595 "open_tree\0"
596 "pivot_root\0"
597 "umount\0"
598 "umount2\0"
599 },
600 [SYSCALL_FILTER_SET_NETWORK_IO] = {
601 .name = "@network-io",
602 .help = "Network or Unix socket IO, should not be needed if not network facing",
603 .value =
604 "accept\0"
605 "accept4\0"
606 "bind\0"
607 "connect\0"
608 "getpeername\0"
609 "getsockname\0"
610 "getsockopt\0"
611 "listen\0"
612 "recv\0"
613 "recvfrom\0"
614 "recvmmsg\0"
615 "recvmmsg_time64\0"
616 "recvmsg\0"
617 "send\0"
618 "sendmmsg\0"
619 "sendmsg\0"
620 "sendto\0"
621 "setsockopt\0"
622 "shutdown\0"
623 "socket\0"
624 "socketcall\0"
625 "socketpair\0"
626 },
627 [SYSCALL_FILTER_SET_OBSOLETE] = {
628 /* some unknown even to libseccomp */
629 .name = "@obsolete",
630 .help = "Unusual, obsolete or unimplemented system calls",
631 .value =
632 "_sysctl\0"
633 "afs_syscall\0"
634 "bdflush\0"
635 "break\0"
636 "create_module\0"
637 "ftime\0"
638 "get_kernel_syms\0"
639 "getpmsg\0"
640 "gtty\0"
641 "idle\0"
642 "lock\0"
643 "mpx\0"
644 "prof\0"
645 "profil\0"
646 "putpmsg\0"
647 "query_module\0"
648 "security\0"
649 "sgetmask\0"
650 "ssetmask\0"
651 "stime\0"
652 "stty\0"
653 "sysfs\0"
654 "tuxcall\0"
655 "ulimit\0"
656 "uselib\0"
657 "ustat\0"
658 "vserver\0"
659 },
660 [SYSCALL_FILTER_SET_PKEY] = {
661 .name = "@pkey",
662 .help = "System calls used for memory protection keys",
663 .value =
664 "pkey_alloc\0"
665 "pkey_free\0"
666 "pkey_mprotect\0"
667 },
668 [SYSCALL_FILTER_SET_PRIVILEGED] = {
669 .name = "@privileged",
670 .help = "All system calls which need super-user capabilities",
671 .value =
672 "@chown\0"
673 "@clock\0"
674 "@module\0"
675 "@raw-io\0"
676 "@reboot\0"
677 "@swap\0"
678 "_sysctl\0"
679 "acct\0"
680 "bpf\0"
681 "capset\0"
682 "chroot\0"
683 "fanotify_init\0"
684 "fanotify_mark\0"
685 "nfsservctl\0"
686 "open_by_handle_at\0"
687 "pivot_root\0"
688 "quotactl\0"
689 "setdomainname\0"
690 "setfsuid\0"
691 "setfsuid32\0"
692 "setgroups\0"
693 "setgroups32\0"
694 "sethostname\0"
695 "setresuid\0"
696 "setresuid32\0"
697 "setreuid\0"
698 "setreuid32\0"
699 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
700 "setuid32\0"
701 "vhangup\0"
702 },
703 [SYSCALL_FILTER_SET_PROCESS] = {
704 .name = "@process",
705 .help = "Process control, execution, namespacing operations",
706 .value =
707 "arch_prctl\0"
708 "capget\0" /* Able to query arbitrary processes */
709 "clone\0"
710 "clone3\0"
711 "execveat\0"
712 "fork\0"
713 "getrusage\0"
714 "kill\0"
715 "pidfd_open\0"
716 "pidfd_send_signal\0"
717 "prctl\0"
718 "rt_sigqueueinfo\0"
719 "rt_tgsigqueueinfo\0"
720 "setns\0"
721 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
722 "tgkill\0"
723 "times\0"
724 "tkill\0"
725 "unshare\0"
726 "vfork\0"
727 "wait4\0"
728 "waitid\0"
729 "waitpid\0"
730 },
731 [SYSCALL_FILTER_SET_RAW_IO] = {
732 .name = "@raw-io",
733 .help = "Raw I/O port access",
734 .value =
735 "ioperm\0"
736 "iopl\0"
737 "pciconfig_iobase\0"
738 "pciconfig_read\0"
739 "pciconfig_write\0"
740 #if defined __s390__ || defined __s390x__
741 "s390_pci_mmio_read\0"
742 "s390_pci_mmio_write\0"
743 #endif
744 },
745 [SYSCALL_FILTER_SET_REBOOT] = {
746 .name = "@reboot",
747 .help = "Reboot and reboot preparation/kexec",
748 .value =
749 "kexec_file_load\0"
750 "kexec_load\0"
751 "reboot\0"
752 },
753 [SYSCALL_FILTER_SET_RESOURCES] = {
754 .name = "@resources",
755 .help = "Alter resource settings",
756 .value =
757 "ioprio_set\0"
758 "mbind\0"
759 "migrate_pages\0"
760 "move_pages\0"
761 "nice\0"
762 "sched_setaffinity\0"
763 "sched_setattr\0"
764 "sched_setparam\0"
765 "sched_setscheduler\0"
766 "set_mempolicy\0"
767 "setpriority\0"
768 "setrlimit\0"
769 },
770 [SYSCALL_FILTER_SET_SETUID] = {
771 .name = "@setuid",
772 .help = "Operations for changing user/group credentials",
773 .value =
774 "setgid\0"
775 "setgid32\0"
776 "setgroups\0"
777 "setgroups32\0"
778 "setregid\0"
779 "setregid32\0"
780 "setresgid\0"
781 "setresgid32\0"
782 "setresuid\0"
783 "setresuid32\0"
784 "setreuid\0"
785 "setreuid32\0"
786 "setuid\0"
787 "setuid32\0"
788 },
789 [SYSCALL_FILTER_SET_SIGNAL] = {
790 .name = "@signal",
791 .help = "Process signal handling",
792 .value =
793 "rt_sigaction\0"
794 "rt_sigpending\0"
795 "rt_sigprocmask\0"
796 "rt_sigsuspend\0"
797 "rt_sigtimedwait\0"
798 "rt_sigtimedwait_time64\0"
799 "sigaction\0"
800 "sigaltstack\0"
801 "signal\0"
802 "signalfd\0"
803 "signalfd4\0"
804 "sigpending\0"
805 "sigprocmask\0"
806 "sigsuspend\0"
807 },
808 [SYSCALL_FILTER_SET_SWAP] = {
809 .name = "@swap",
810 .help = "Enable/disable swap devices",
811 .value =
812 "swapoff\0"
813 "swapon\0"
814 },
815 [SYSCALL_FILTER_SET_SYNC] = {
816 .name = "@sync",
817 .help = "Synchronize files and memory to storage",
818 .value =
819 "fdatasync\0"
820 "fsync\0"
821 "msync\0"
822 "sync\0"
823 "sync_file_range\0"
824 "sync_file_range2\0"
825 "syncfs\0"
826 },
827 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
828 .name = "@system-service",
829 .help = "General system service operations",
830 .value =
831 "@aio\0"
832 "@basic-io\0"
833 "@chown\0"
834 "@default\0"
835 "@file-system\0"
836 "@io-event\0"
837 "@ipc\0"
838 "@keyring\0"
839 "@memlock\0"
840 "@network-io\0"
841 "@process\0"
842 "@resources\0"
843 "@setuid\0"
844 "@signal\0"
845 "@sync\0"
846 "@timer\0"
847 "brk\0"
848 "capget\0"
849 "capset\0"
850 "copy_file_range\0"
851 "fadvise64\0"
852 "fadvise64_64\0"
853 "flock\0"
854 "get_mempolicy\0"
855 "getcpu\0"
856 "getpriority\0"
857 "getrandom\0"
858 "ioctl\0"
859 "ioprio_get\0"
860 "kcmp\0"
861 "madvise\0"
862 "mprotect\0"
863 "mremap\0"
864 "name_to_handle_at\0"
865 "oldolduname\0"
866 "olduname\0"
867 "personality\0"
868 "readahead\0"
869 "readdir\0"
870 "remap_file_pages\0"
871 "sched_get_priority_max\0"
872 "sched_get_priority_min\0"
873 "sched_getaffinity\0"
874 "sched_getattr\0"
875 "sched_getparam\0"
876 "sched_getscheduler\0"
877 "sched_rr_get_interval\0"
878 "sched_rr_get_interval_time64\0"
879 "sched_yield\0"
880 "sendfile\0"
881 "sendfile64\0"
882 "setfsgid\0"
883 "setfsgid32\0"
884 "setfsuid\0"
885 "setfsuid32\0"
886 "setpgid\0"
887 "setsid\0"
888 "splice\0"
889 "sysinfo\0"
890 "tee\0"
891 "umask\0"
892 "uname\0"
893 "userfaultfd\0"
894 "vmsplice\0"
895 },
896 [SYSCALL_FILTER_SET_TIMER] = {
897 .name = "@timer",
898 .help = "Schedule operations by time",
899 .value =
900 "alarm\0"
901 "getitimer\0"
902 "setitimer\0"
903 "timer_create\0"
904 "timer_delete\0"
905 "timer_getoverrun\0"
906 "timer_gettime\0"
907 "timer_gettime64\0"
908 "timer_settime\0"
909 "timer_settime64\0"
910 "timerfd_create\0"
911 "timerfd_gettime\0"
912 "timerfd_gettime64\0"
913 "timerfd_settime\0"
914 "timerfd_settime64\0"
915 "times\0"
916 },
917 [SYSCALL_FILTER_SET_KNOWN] = {
918 .name = "@known",
919 .help = "All known syscalls declared in the kernel",
920 .value =
921 #include "syscall-list.h"
922 },
923 };
924
925 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
926 if (isempty(name) || name[0] != '@')
927 return NULL;
928
929 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
930 if (streq(syscall_filter_sets[i].name, name))
931 return syscall_filter_sets + i;
932
933 return NULL;
934 }
935
936 static int add_syscall_filter_set(
937 scmp_filter_ctx seccomp,
938 const SyscallFilterSet *set,
939 uint32_t action,
940 char **exclude,
941 bool log_missing,
942 char ***added);
943
944 int seccomp_add_syscall_filter_item(
945 scmp_filter_ctx *seccomp,
946 const char *name,
947 uint32_t action,
948 char **exclude,
949 bool log_missing,
950 char ***added) {
951
952 assert(seccomp);
953 assert(name);
954
955 if (strv_contains(exclude, name))
956 return 0;
957
958 /* Any syscalls that are handled are added to the *added strv. The pointer
959 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
960
961 if (name[0] == '@') {
962 const SyscallFilterSet *other;
963
964 other = syscall_filter_set_find(name);
965 if (!other)
966 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
967 "Filter set %s is not known!",
968 name);
969
970 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
971
972 } else {
973 int id, r;
974
975 id = seccomp_syscall_resolve_name(name);
976 if (id == __NR_SCMP_ERROR) {
977 if (log_missing)
978 log_debug("System call %s is not known, ignoring.", name);
979 return 0;
980 }
981
982 r = seccomp_rule_add_exact(seccomp, action, id, 0);
983 if (r < 0) {
984 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
985 bool ignore = r == -EDOM;
986
987 if (!ignore || log_missing)
988 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
989 name, id, ignore ? ", ignoring" : "");
990 if (!ignore)
991 return r;
992 }
993
994 if (added) {
995 r = strv_extend(added, name);
996 if (r < 0)
997 return r;
998 }
999
1000 return 0;
1001 }
1002 }
1003
1004 static int add_syscall_filter_set(
1005 scmp_filter_ctx seccomp,
1006 const SyscallFilterSet *set,
1007 uint32_t action,
1008 char **exclude,
1009 bool log_missing,
1010 char ***added) {
1011
1012 const char *sys;
1013 int r;
1014
1015 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1016
1017 assert(seccomp);
1018 assert(set);
1019
1020 NULSTR_FOREACH(sys, set->value) {
1021 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1022 if (r < 0)
1023 return r;
1024 }
1025
1026 return 0;
1027 }
1028
1029 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1030 uint32_t arch;
1031 int r;
1032
1033 assert(set);
1034
1035 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1036 * each local arch. */
1037
1038 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1039 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1040
1041 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1042
1043 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1044 if (r < 0)
1045 return r;
1046
1047 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1048 if (r < 0)
1049 return log_debug_errno(r, "Failed to add filter set: %m");
1050
1051 r = seccomp_load(seccomp);
1052 if (ERRNO_IS_SECCOMP_FATAL(r))
1053 return r;
1054 if (r < 0)
1055 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1056 }
1057
1058 return 0;
1059 }
1060
1061 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
1062 uint32_t arch;
1063 int r;
1064
1065 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
1066 * SyscallFilterSet* table. */
1067
1068 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
1069 return 0;
1070
1071 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1072 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1073 void *syscall_id, *val;
1074
1075 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1076
1077 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1078 if (r < 0)
1079 return r;
1080
1081 HASHMAP_FOREACH_KEY(val, syscall_id, set) {
1082 uint32_t a = action;
1083 int id = PTR_TO_INT(syscall_id) - 1;
1084 int error = PTR_TO_INT(val);
1085
1086 if (error == SECCOMP_ERROR_NUMBER_KILL)
1087 a = scmp_act_kill_process();
1088 #ifdef SCMP_ACT_LOG
1089 else if (action == SCMP_ACT_LOG)
1090 a = SCMP_ACT_LOG;
1091 #endif
1092 else if (action != SCMP_ACT_ALLOW && error >= 0)
1093 a = SCMP_ACT_ERRNO(error);
1094
1095 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1096 if (r < 0) {
1097 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1098 _cleanup_free_ char *n = NULL;
1099 bool ignore;
1100
1101 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1102 ignore = r == -EDOM;
1103 if (!ignore || log_missing)
1104 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1105 strna(n), id, ignore ? ", ignoring" : "");
1106 if (!ignore)
1107 return r;
1108 }
1109 }
1110
1111 r = seccomp_load(seccomp);
1112 if (ERRNO_IS_SECCOMP_FATAL(r))
1113 return r;
1114 if (r < 0)
1115 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1116 }
1117
1118 return 0;
1119 }
1120
1121 int seccomp_parse_syscall_filter(
1122 const char *name,
1123 int errno_num,
1124 Hashmap *filter,
1125 SeccompParseFlags flags,
1126 const char *unit,
1127 const char *filename,
1128 unsigned line) {
1129
1130 int r;
1131
1132 assert(name);
1133 assert(filter);
1134
1135 if (name[0] == '@') {
1136 const SyscallFilterSet *set;
1137 const char *i;
1138
1139 set = syscall_filter_set_find(name);
1140 if (!set) {
1141 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1142 return -EINVAL;
1143
1144 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1145 "Unknown system call group, ignoring: %s", name);
1146 return 0;
1147 }
1148
1149 NULSTR_FOREACH(i, set->value) {
1150 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1151 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1152 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1153 * about them. */
1154 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1155 if (r < 0)
1156 return r;
1157 }
1158 } else {
1159 int id;
1160
1161 id = seccomp_syscall_resolve_name(name);
1162 if (id == __NR_SCMP_ERROR) {
1163 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1164 return -EINVAL;
1165
1166 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1167 "Failed to parse system call, ignoring: %s", name);
1168 return 0;
1169 }
1170
1171 /* If we previously wanted to forbid a syscall and now
1172 * we want to allow it, then remove it from the list. */
1173 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_ALLOW_LIST)) {
1174 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1175 if (r < 0)
1176 switch (r) {
1177 case -ENOMEM:
1178 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1179 case -EEXIST:
1180 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1181 break;
1182 default:
1183 return r;
1184 }
1185 } else
1186 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1187 }
1188
1189 return 0;
1190 }
1191
1192 int seccomp_restrict_namespaces(unsigned long retain) {
1193 uint32_t arch;
1194 int r;
1195
1196 if (DEBUG_LOGGING) {
1197 _cleanup_free_ char *s = NULL;
1198
1199 (void) namespace_flags_to_string(retain, &s);
1200 log_debug("Restricting namespace to: %s.", strna(s));
1201 }
1202
1203 /* NOOP? */
1204 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1205 return 0;
1206
1207 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1208 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1209
1210 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1211
1212 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1213 if (r < 0)
1214 return r;
1215
1216 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1217 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1218 * altogether. */
1219 r = seccomp_rule_add_exact(
1220 seccomp,
1221 SCMP_ACT_ERRNO(EPERM),
1222 SCMP_SYS(setns),
1223 0);
1224 else
1225 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1226 * special invocation with a zero flags argument, right here. */
1227 r = seccomp_rule_add_exact(
1228 seccomp,
1229 SCMP_ACT_ERRNO(EPERM),
1230 SCMP_SYS(setns),
1231 1,
1232 SCMP_A1(SCMP_CMP_EQ, 0));
1233 if (r < 0) {
1234 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1235 continue;
1236 }
1237
1238 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1239 unsigned long f;
1240
1241 f = namespace_flag_map[i].flag;
1242 if (FLAGS_SET(retain, f)) {
1243 log_debug("Permitting %s.", namespace_flag_map[i].name);
1244 continue;
1245 }
1246
1247 log_debug("Blocking %s.", namespace_flag_map[i].name);
1248
1249 r = seccomp_rule_add_exact(
1250 seccomp,
1251 SCMP_ACT_ERRNO(EPERM),
1252 SCMP_SYS(unshare),
1253 1,
1254 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1255 if (r < 0) {
1256 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1257 break;
1258 }
1259
1260 /* On s390/s390x the first two parameters to clone are switched */
1261 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1262 r = seccomp_rule_add_exact(
1263 seccomp,
1264 SCMP_ACT_ERRNO(EPERM),
1265 SCMP_SYS(clone),
1266 1,
1267 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1268 else
1269 r = seccomp_rule_add_exact(
1270 seccomp,
1271 SCMP_ACT_ERRNO(EPERM),
1272 SCMP_SYS(clone),
1273 1,
1274 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1275 if (r < 0) {
1276 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1277 break;
1278 }
1279
1280 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1281 r = seccomp_rule_add_exact(
1282 seccomp,
1283 SCMP_ACT_ERRNO(EPERM),
1284 SCMP_SYS(setns),
1285 1,
1286 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1287 if (r < 0) {
1288 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1289 break;
1290 }
1291 }
1292 }
1293 if (r < 0)
1294 continue;
1295
1296 r = seccomp_load(seccomp);
1297 if (ERRNO_IS_SECCOMP_FATAL(r))
1298 return r;
1299 if (r < 0)
1300 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1301 }
1302
1303 return 0;
1304 }
1305
1306 int seccomp_protect_sysctl(void) {
1307 uint32_t arch;
1308 int r;
1309
1310 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1311 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1312
1313 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1314
1315 if (IN_SET(arch,
1316 SCMP_ARCH_AARCH64,
1317 #ifdef SCMP_ARCH_RISCV64
1318 SCMP_ARCH_RISCV64,
1319 #endif
1320 SCMP_ARCH_X32
1321 ))
1322 /* No _sysctl syscall */
1323 continue;
1324
1325 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1326 if (r < 0)
1327 return r;
1328
1329 r = seccomp_rule_add_exact(
1330 seccomp,
1331 SCMP_ACT_ERRNO(EPERM),
1332 SCMP_SYS(_sysctl),
1333 0);
1334 if (r < 0) {
1335 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1336 continue;
1337 }
1338
1339 r = seccomp_load(seccomp);
1340 if (ERRNO_IS_SECCOMP_FATAL(r))
1341 return r;
1342 if (r < 0)
1343 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1344 }
1345
1346 return 0;
1347 }
1348
1349 int seccomp_protect_syslog(void) {
1350 uint32_t arch;
1351 int r;
1352
1353 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1354 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1355
1356 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1357 if (r < 0)
1358 return r;
1359
1360 r = seccomp_rule_add_exact(
1361 seccomp,
1362 SCMP_ACT_ERRNO(EPERM),
1363 SCMP_SYS(syslog),
1364 0);
1365
1366 if (r < 0) {
1367 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1368 continue;
1369 }
1370
1371 r = seccomp_load(seccomp);
1372 if (ERRNO_IS_SECCOMP_FATAL(r))
1373 return r;
1374 if (r < 0)
1375 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1376 }
1377
1378 return 0;
1379 }
1380
1381 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1382 uint32_t arch;
1383 int r;
1384
1385 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1386 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1387 bool supported;
1388
1389 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1390
1391 switch (arch) {
1392
1393 case SCMP_ARCH_X86_64:
1394 case SCMP_ARCH_X32:
1395 case SCMP_ARCH_ARM:
1396 case SCMP_ARCH_AARCH64:
1397 case SCMP_ARCH_PPC:
1398 case SCMP_ARCH_PPC64:
1399 case SCMP_ARCH_PPC64LE:
1400 case SCMP_ARCH_MIPSEL64N32:
1401 case SCMP_ARCH_MIPS64N32:
1402 case SCMP_ARCH_MIPSEL64:
1403 case SCMP_ARCH_MIPS64:
1404 #ifdef SCMP_ARCH_RISCV64
1405 case SCMP_ARCH_RISCV64:
1406 #endif
1407 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1408 supported = true;
1409 break;
1410
1411 case SCMP_ARCH_S390:
1412 case SCMP_ARCH_S390X:
1413 case SCMP_ARCH_X86:
1414 case SCMP_ARCH_MIPSEL:
1415 case SCMP_ARCH_MIPS:
1416 default:
1417 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1418 * don't know */
1419 supported = false;
1420 break;
1421 }
1422
1423 if (!supported)
1424 continue;
1425
1426 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1427 if (r < 0)
1428 return r;
1429
1430 if (allow_list) {
1431 int first = 0, last = 0;
1432 void *afp;
1433
1434 /* If this is an allow list, we first block the address families that are out of
1435 * range and then everything that is not in the set. First, we find the lowest and
1436 * highest address family in the set. */
1437
1438 SET_FOREACH(afp, address_families) {
1439 int af = PTR_TO_INT(afp);
1440
1441 if (af <= 0 || af >= af_max())
1442 continue;
1443
1444 if (first == 0 || af < first)
1445 first = af;
1446
1447 if (last == 0 || af > last)
1448 last = af;
1449 }
1450
1451 assert((first == 0) == (last == 0));
1452
1453 if (first == 0) {
1454
1455 /* No entries in the valid range, block everything */
1456 r = seccomp_rule_add_exact(
1457 seccomp,
1458 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1459 SCMP_SYS(socket),
1460 0);
1461 if (r < 0) {
1462 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1463 continue;
1464 }
1465
1466 } else {
1467
1468 /* Block everything below the first entry */
1469 r = seccomp_rule_add_exact(
1470 seccomp,
1471 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1472 SCMP_SYS(socket),
1473 1,
1474 SCMP_A0(SCMP_CMP_LT, first));
1475 if (r < 0) {
1476 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 continue;
1478 }
1479
1480 /* Block everything above the last entry */
1481 r = seccomp_rule_add_exact(
1482 seccomp,
1483 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1484 SCMP_SYS(socket),
1485 1,
1486 SCMP_A0(SCMP_CMP_GT, last));
1487 if (r < 0) {
1488 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1489 continue;
1490 }
1491
1492 /* Block everything between the first and last entry */
1493 for (int af = 1; af < af_max(); af++) {
1494
1495 if (set_contains(address_families, INT_TO_PTR(af)))
1496 continue;
1497
1498 r = seccomp_rule_add_exact(
1499 seccomp,
1500 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1501 SCMP_SYS(socket),
1502 1,
1503 SCMP_A0(SCMP_CMP_EQ, af));
1504 if (r < 0)
1505 break;
1506 }
1507 if (r < 0) {
1508 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1509 continue;
1510 }
1511 }
1512
1513 } else {
1514 void *af;
1515
1516 /* If this is a deny list, then generate one rule for each address family that are
1517 * then combined in OR checks. */
1518
1519 SET_FOREACH(af, address_families) {
1520 r = seccomp_rule_add_exact(
1521 seccomp,
1522 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1523 SCMP_SYS(socket),
1524 1,
1525 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1526 if (r < 0)
1527 break;
1528 }
1529 if (r < 0) {
1530 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1531 continue;
1532 }
1533 }
1534
1535 r = seccomp_load(seccomp);
1536 if (ERRNO_IS_SECCOMP_FATAL(r))
1537 return r;
1538 if (r < 0)
1539 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1540 }
1541
1542 return 0;
1543 }
1544
1545 int seccomp_restrict_realtime(void) {
1546 static const int permitted_policies[] = {
1547 SCHED_OTHER,
1548 SCHED_BATCH,
1549 SCHED_IDLE,
1550 };
1551
1552 int r, max_policy = 0;
1553 uint32_t arch;
1554 unsigned i;
1555
1556 /* Determine the highest policy constant we want to allow */
1557 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1558 if (permitted_policies[i] > max_policy)
1559 max_policy = permitted_policies[i];
1560
1561 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1562 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1563 int p;
1564
1565 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1566
1567 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1568 if (r < 0)
1569 return r;
1570
1571 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1572 * allow list. */
1573 for (p = 0; p < max_policy; p++) {
1574 bool good = false;
1575
1576 /* Check if this is in the allow list. */
1577 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1578 if (permitted_policies[i] == p) {
1579 good = true;
1580 break;
1581 }
1582
1583 if (good)
1584 continue;
1585
1586 /* Deny this policy */
1587 r = seccomp_rule_add_exact(
1588 seccomp,
1589 SCMP_ACT_ERRNO(EPERM),
1590 SCMP_SYS(sched_setscheduler),
1591 1,
1592 SCMP_A1(SCMP_CMP_EQ, p));
1593 if (r < 0) {
1594 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1595 continue;
1596 }
1597 }
1598
1599 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1600 * are unsigned here, hence no need no check for < 0 values. */
1601 r = seccomp_rule_add_exact(
1602 seccomp,
1603 SCMP_ACT_ERRNO(EPERM),
1604 SCMP_SYS(sched_setscheduler),
1605 1,
1606 SCMP_A1(SCMP_CMP_GT, max_policy));
1607 if (r < 0) {
1608 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 continue;
1610 }
1611
1612 r = seccomp_load(seccomp);
1613 if (ERRNO_IS_SECCOMP_FATAL(r))
1614 return r;
1615 if (r < 0)
1616 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1617 }
1618
1619 return 0;
1620 }
1621
1622 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1623 uint32_t arch,
1624 int nr,
1625 unsigned arg_cnt,
1626 const struct scmp_arg_cmp arg) {
1627 int r;
1628
1629 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1630 if (r < 0) {
1631 _cleanup_free_ char *n = NULL;
1632
1633 n = seccomp_syscall_resolve_num_arch(arch, nr);
1634 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1635 strna(n),
1636 seccomp_arch_to_string(arch));
1637 }
1638
1639 return r;
1640 }
1641
1642 /* For known architectures, verify at compile time that the shmget(), shmat() and shmdt() syscalls are actually defined (i.e. have positive syscall numbers). */
1643 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1644 assert_cc(SCMP_SYS(shmget) > 0);
1645 assert_cc(SCMP_SYS(shmat) > 0);
1646 assert_cc(SCMP_SYS(shmdt) > 0);
1647 #endif
1648
1649 int seccomp_memory_deny_write_execute(void) {
1650 uint32_t arch;
1651 unsigned loaded = 0;
1652
1653 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1654 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1655 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1656
1657 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1658
1659 switch (arch) {
1660
1661 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1662 * We ignore that here, which means there's still a way to get writable/executable
1663 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1664
1665 case SCMP_ARCH_X86:
1666 case SCMP_ARCH_S390:
1667 filter_syscall = SCMP_SYS(mmap2);
1668 block_syscall = SCMP_SYS(mmap);
1669 /* shmat multiplexed, see above */
1670 break;
1671
1672 case SCMP_ARCH_PPC:
1673 case SCMP_ARCH_PPC64:
1674 case SCMP_ARCH_PPC64LE:
1675 case SCMP_ARCH_S390X:
1676 filter_syscall = SCMP_SYS(mmap);
1677 /* shmat multiplexed, see above */
1678 break;
1679
1680 case SCMP_ARCH_ARM:
1681 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1682 shmat_syscall = SCMP_SYS(shmat);
1683 break;
1684
1685 case SCMP_ARCH_X86_64:
1686 case SCMP_ARCH_X32:
1687 case SCMP_ARCH_AARCH64:
1688 #ifdef SCMP_ARCH_RISCV64
1689 case SCMP_ARCH_RISCV64:
1690 #endif
1691 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1692 shmat_syscall = SCMP_SYS(shmat);
1693 break;
1694
1695 /* Please add more definitions here, if you port systemd to other architectures! */
1696
1697 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1698 #warning "Consider adding the right mmap() syscall definitions here!"
1699 #endif
1700 }
1701
1702 /* Can't filter mmap() on this arch, then skip it */
1703 if (filter_syscall == 0)
1704 continue;
1705
1706 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1707 if (r < 0)
1708 return r;
1709
1710 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1711 1,
1712 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1713 if (r < 0)
1714 continue;
1715
1716 if (block_syscall != 0) {
1717 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1718 if (r < 0)
1719 continue;
1720 }
1721
1722 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1723 1,
1724 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1725 if (r < 0)
1726 continue;
1727
1728 #ifdef __NR_pkey_mprotect
1729 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1730 1,
1731 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1732 if (r < 0)
1733 continue;
1734 #endif
1735
1736 if (shmat_syscall > 0) {
1737 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1738 1,
1739 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1740 if (r < 0)
1741 continue;
1742 }
1743
1744 r = seccomp_load(seccomp);
1745 if (ERRNO_IS_SECCOMP_FATAL(r))
1746 return r;
1747 if (r < 0)
1748 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1749 seccomp_arch_to_string(arch));
1750 loaded++;
1751 }
1752
1753 if (loaded == 0)
1754 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1755
1756 return loaded;
1757 }
1758
1759 int seccomp_restrict_archs(Set *archs) {
1760 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1761 void *id;
1762 int r;
1763
1764 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1765 * list.
1766 *
1767 * There are some qualifications. However the most important use is to stop processes from bypassing
1768 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1769 * in a non-native architecture. There are no holes in this use case, at least so far. */
1770
1771 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1772 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1773 * to run a program with the restrictions applied. */
1774 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1775 if (!seccomp)
1776 return -ENOMEM;
1777
1778 SET_FOREACH(id, archs) {
1779 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1780 if (r < 0 && r != -EEXIST)
1781 return r;
1782 }
1783
1784 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1785 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1786 * The important thing is that you can block the old 32-bit x86 syscalls.
1787 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1788
1789 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1790 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1791
1792 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1793 if (r < 0 && r != -EEXIST)
1794 return r;
1795 }
1796
1797 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1798 if (r < 0)
1799 return r;
1800
1801 r = seccomp_load(seccomp);
1802 if (ERRNO_IS_SECCOMP_FATAL(r))
1803 return r;
1804 if (r < 0)
1805 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1806
1807 return 0;
1808 }
1809
1810 int parse_syscall_archs(char **l, Set **ret_archs) {
1811 _cleanup_set_free_ Set *archs = NULL;
1812 char **s;
1813 int r;
1814
1815 assert(l);
1816 assert(ret_archs);
1817
1818 STRV_FOREACH(s, l) {
1819 uint32_t a;
1820
1821 r = seccomp_arch_from_string(*s, &a);
1822 if (r < 0)
1823 return -EINVAL;
1824
1825 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1826 if (r < 0)
1827 return -ENOMEM;
1828 }
1829
1830 *ret_archs = TAKE_PTR(archs);
1831 return 0;
1832 }
1833
1834 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1835 const char *i;
1836 int r;
1837
1838 assert(set);
1839
1840 NULSTR_FOREACH(i, set->value) {
1841
1842 if (i[0] == '@') {
1843 const SyscallFilterSet *more;
1844
1845 more = syscall_filter_set_find(i);
1846 if (!more)
1847 return -ENXIO;
1848
1849 r = seccomp_filter_set_add(filter, add, more);
1850 if (r < 0)
1851 return r;
1852 } else {
1853 int id;
1854
1855 id = seccomp_syscall_resolve_name(i);
1856 if (id == __NR_SCMP_ERROR) {
1857 log_debug("Couldn't resolve system call, ignoring: %s", i);
1858 continue;
1859 }
1860
1861 if (add) {
1862 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1863 if (r < 0)
1864 return r;
1865 } else
1866 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1867 }
1868 }
1869
1870 return 0;
1871 }
1872
1873 int seccomp_lock_personality(unsigned long personality) {
1874 uint32_t arch;
1875 int r;
1876
1877 if (personality >= PERSONALITY_INVALID)
1878 return -EINVAL;
1879
1880 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1881 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1882
1883 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1884 if (r < 0)
1885 return r;
1886
1887 r = seccomp_rule_add_exact(
1888 seccomp,
1889 SCMP_ACT_ERRNO(EPERM),
1890 SCMP_SYS(personality),
1891 1,
1892 SCMP_A0(SCMP_CMP_NE, personality));
1893 if (r < 0) {
1894 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1895 continue;
1896 }
1897
1898 r = seccomp_load(seccomp);
1899 if (ERRNO_IS_SECCOMP_FATAL(r))
1900 return r;
1901 if (r < 0)
1902 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1903 }
1904
1905 return 0;
1906 }
1907
1908 int seccomp_protect_hostname(void) {
1909 uint32_t arch;
1910 int r;
1911
1912 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1913 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1914
1915 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1916 if (r < 0)
1917 return r;
1918
1919 r = seccomp_rule_add_exact(
1920 seccomp,
1921 SCMP_ACT_ERRNO(EPERM),
1922 SCMP_SYS(sethostname),
1923 0);
1924 if (r < 0) {
1925 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1926 continue;
1927 }
1928
1929 r = seccomp_rule_add_exact(
1930 seccomp,
1931 SCMP_ACT_ERRNO(EPERM),
1932 SCMP_SYS(setdomainname),
1933 0);
1934 if (r < 0) {
1935 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1936 continue;
1937 }
1938
1939 r = seccomp_load(seccomp);
1940 if (ERRNO_IS_SECCOMP_FATAL(r))
1941 return r;
1942 if (r < 0)
1943 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1944 }
1945
1946 return 0;
1947 }
1948
1949 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1950 /* Checks the mode_t parameter of the following system calls:
1951 *
1952 * → chmod() + fchmod() + fchmodat()
1953 * → open() + creat() + openat()
1954 * → mkdir() + mkdirat()
1955 * → mknod() + mknodat()
1956 *
1957 * Returns error if *everything* failed, and 0 otherwise.
1958 */
1959 int r;
1960 bool any = false;
1961
1962 r = seccomp_rule_add_exact(
1963 seccomp,
1964 SCMP_ACT_ERRNO(EPERM),
1965 SCMP_SYS(chmod),
1966 1,
1967 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1968 if (r < 0)
1969 log_debug_errno(r, "Failed to add filter for chmod: %m");
1970 else
1971 any = true;
1972
1973 r = seccomp_rule_add_exact(
1974 seccomp,
1975 SCMP_ACT_ERRNO(EPERM),
1976 SCMP_SYS(fchmod),
1977 1,
1978 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1979 if (r < 0)
1980 log_debug_errno(r, "Failed to add filter for fchmod: %m");
1981 else
1982 any = true;
1983
1984 r = seccomp_rule_add_exact(
1985 seccomp,
1986 SCMP_ACT_ERRNO(EPERM),
1987 SCMP_SYS(fchmodat),
1988 1,
1989 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
1990 if (r < 0)
1991 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
1992 else
1993 any = true;
1994
1995 r = seccomp_rule_add_exact(
1996 seccomp,
1997 SCMP_ACT_ERRNO(EPERM),
1998 SCMP_SYS(mkdir),
1999 1,
2000 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2001 if (r < 0)
2002 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2003 else
2004 any = true;
2005
2006 r = seccomp_rule_add_exact(
2007 seccomp,
2008 SCMP_ACT_ERRNO(EPERM),
2009 SCMP_SYS(mkdirat),
2010 1,
2011 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2012 if (r < 0)
2013 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2014 else
2015 any = true;
2016
2017 r = seccomp_rule_add_exact(
2018 seccomp,
2019 SCMP_ACT_ERRNO(EPERM),
2020 SCMP_SYS(mknod),
2021 1,
2022 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2023 if (r < 0)
2024 log_debug_errno(r, "Failed to add filter for mknod: %m");
2025 else
2026 any = true;
2027
2028 r = seccomp_rule_add_exact(
2029 seccomp,
2030 SCMP_ACT_ERRNO(EPERM),
2031 SCMP_SYS(mknodat),
2032 1,
2033 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2034 if (r < 0)
2035 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2036 else
2037 any = true;
2038
2039 #if SCMP_SYS(open) > 0
2040 r = seccomp_rule_add_exact(
2041 seccomp,
2042 SCMP_ACT_ERRNO(EPERM),
2043 SCMP_SYS(open),
2044 2,
2045 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2046 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2047 if (r < 0)
2048 log_debug_errno(r, "Failed to add filter for open: %m");
2049 else
2050 any = true;
2051 #endif
2052
2053 r = seccomp_rule_add_exact(
2054 seccomp,
2055 SCMP_ACT_ERRNO(EPERM),
2056 SCMP_SYS(openat),
2057 2,
2058 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2059 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2060 if (r < 0)
2061 log_debug_errno(r, "Failed to add filter for openat: %m");
2062 else
2063 any = true;
2064
2065 #if defined(__SNR_openat2)
2066 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2067 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2068 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2069 * compatible with kernels that are not absolutely recent. */
2070 r = seccomp_rule_add_exact(
2071 seccomp,
2072 SCMP_ACT_ERRNO(EPERM),
2073 SCMP_SYS(openat2),
2074 0);
2075 if (r < 0)
2076 log_debug_errno(r, "Failed to add filter for openat2: %m");
2077 else
2078 any = true;
2079 #endif
2080
2081 r = seccomp_rule_add_exact(
2082 seccomp,
2083 SCMP_ACT_ERRNO(EPERM),
2084 SCMP_SYS(creat),
2085 1,
2086 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2087 if (r < 0)
2088 log_debug_errno(r, "Failed to add filter for creat: %m");
2089 else
2090 any = true;
2091
2092 return any ? 0 : r;
2093 }
2094
2095 int seccomp_restrict_suid_sgid(void) {
2096 uint32_t arch;
2097 int r, k;
2098
2099 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2100 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2101
2102 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2103 if (r < 0)
2104 return r;
2105
2106 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2107 if (r < 0)
2108 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2109
2110 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2111 if (k < 0)
2112 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2113
2114 if (r < 0 && k < 0)
2115 continue;
2116
2117 r = seccomp_load(seccomp);
2118 if (ERRNO_IS_SECCOMP_FATAL(r))
2119 return r;
2120 if (r < 0)
2121 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2122 }
2123
2124 return 0;
2125 }
2126
2127 uint32_t scmp_act_kill_process(void) {
2128
2129 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2130 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2131 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2132 * for single-threaded apps does the right thing. */
2133
2134 #ifdef SCMP_ACT_KILL_PROCESS
2135 if (seccomp_api_get() >= 3)
2136 return SCMP_ACT_KILL_PROCESS;
2137 #endif
2138
2139 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2140 }