]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/shared/seccomp-util.c
ci: enable arm64 runner for build/unit jobs
[thirdparty/systemd.git] / src / shared / seccomp-util.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <fcntl.h>
4#include <linux/seccomp.h>
5#include <sched.h>
6#include <sys/mman.h>
7#include <sys/prctl.h>
8#include <sys/shm.h>
9#include <sys/stat.h>
10
11#include "af-list.h"
12#include "alloc-util.h"
13#include "env-util.h"
14#include "errno-list.h"
15#include "log.h"
16#include "namespace-util.h"
17#include "nsflags.h"
18#include "nulstr-util.h"
19#include "parse-util.h"
20#include "process-util.h"
21#include "seccomp-util.h"
22#include "set.h"
23#include "string-util.h"
24#include "strv.h"
25
26#if HAVE_SECCOMP
27
28/* This array will be modified at runtime as seccomp_restrict_archs is called. */
29uint32_t seccomp_local_archs[] = {
30
31 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
32
33#if defined(__x86_64__) && defined(__ILP32__)
34 SCMP_ARCH_X86,
35 SCMP_ARCH_X86_64,
36 SCMP_ARCH_X32, /* native */
37#elif defined(__x86_64__) && !defined(__ILP32__)
38 SCMP_ARCH_X86,
39 SCMP_ARCH_X32,
40 SCMP_ARCH_X86_64, /* native */
41#elif defined(__i386__)
42 SCMP_ARCH_X86,
43#elif defined(__aarch64__)
44 SCMP_ARCH_ARM,
45 SCMP_ARCH_AARCH64, /* native */
46#elif defined(__arm__)
47 SCMP_ARCH_ARM,
48#elif defined(__loongarch_lp64)
49 SCMP_ARCH_LOONGARCH64,
50#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS, /* native */
53#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
54 SCMP_ARCH_MIPS,
55 SCMP_ARCH_MIPSEL, /* native */
56#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64,
62 SCMP_ARCH_MIPS64, /* native */
63#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS64N32,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64, /* native */
70#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64N32,
76 SCMP_ARCH_MIPS64N32, /* native */
77#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
78 SCMP_ARCH_MIPS,
79 SCMP_ARCH_MIPSEL,
80 SCMP_ARCH_MIPS64,
81 SCMP_ARCH_MIPSEL64,
82 SCMP_ARCH_MIPS64N32,
83 SCMP_ARCH_MIPSEL64N32, /* native */
84#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
85 SCMP_ARCH_PARISC,
86 SCMP_ARCH_PARISC64, /* native */
87#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
88 SCMP_ARCH_PARISC,
89#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
90 SCMP_ARCH_PPC,
91 SCMP_ARCH_PPC64LE,
92 SCMP_ARCH_PPC64, /* native */
93#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
94 SCMP_ARCH_PPC,
95 SCMP_ARCH_PPC64,
96 SCMP_ARCH_PPC64LE, /* native */
97#elif defined(__powerpc__)
98 SCMP_ARCH_PPC,
99#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
100 SCMP_ARCH_RISCV64,
101#elif defined(__s390x__)
102 SCMP_ARCH_S390,
103 SCMP_ARCH_S390X, /* native */
104#elif defined(__s390__)
105 SCMP_ARCH_S390,
106#endif
107 SECCOMP_LOCAL_ARCH_END
108 };
109
110const char* seccomp_arch_to_string(uint32_t c) {
111 /* Maintain order used in <seccomp.h>.
112 *
113 * Names used here should be the same as those used for ConditionArchitecture=,
114 * except for "subarchitectures" like x32. */
115
116 switch (c) {
117 case SCMP_ARCH_NATIVE:
118 return "native";
119 case SCMP_ARCH_X86:
120 return "x86";
121 case SCMP_ARCH_X86_64:
122 return "x86-64";
123 case SCMP_ARCH_X32:
124 return "x32";
125 case SCMP_ARCH_ARM:
126 return "arm";
127 case SCMP_ARCH_AARCH64:
128 return "arm64";
129#ifdef SCMP_ARCH_LOONGARCH64
130 case SCMP_ARCH_LOONGARCH64:
131 return "loongarch64";
132#endif
133 case SCMP_ARCH_MIPS:
134 return "mips";
135 case SCMP_ARCH_MIPS64:
136 return "mips64";
137 case SCMP_ARCH_MIPS64N32:
138 return "mips64-n32";
139 case SCMP_ARCH_MIPSEL:
140 return "mips-le";
141 case SCMP_ARCH_MIPSEL64:
142 return "mips64-le";
143 case SCMP_ARCH_MIPSEL64N32:
144 return "mips64-le-n32";
145#ifdef SCMP_ARCH_PARISC
146 case SCMP_ARCH_PARISC:
147 return "parisc";
148#endif
149#ifdef SCMP_ARCH_PARISC64
150 case SCMP_ARCH_PARISC64:
151 return "parisc64";
152#endif
153 case SCMP_ARCH_PPC:
154 return "ppc";
155 case SCMP_ARCH_PPC64:
156 return "ppc64";
157 case SCMP_ARCH_PPC64LE:
158 return "ppc64-le";
159#ifdef SCMP_ARCH_RISCV64
160 case SCMP_ARCH_RISCV64:
161 return "riscv64";
162#endif
163 case SCMP_ARCH_S390:
164 return "s390";
165 case SCMP_ARCH_S390X:
166 return "s390x";
167 default:
168 return NULL;
169 }
170}
171
172int seccomp_arch_from_string(const char *n, uint32_t *ret) {
173 if (!n)
174 return -EINVAL;
175
176 assert(ret);
177
178 if (streq(n, "native"))
179 *ret = SCMP_ARCH_NATIVE;
180 else if (streq(n, "x86"))
181 *ret = SCMP_ARCH_X86;
182 else if (streq(n, "x86-64"))
183 *ret = SCMP_ARCH_X86_64;
184 else if (streq(n, "x32"))
185 *ret = SCMP_ARCH_X32;
186 else if (streq(n, "arm"))
187 *ret = SCMP_ARCH_ARM;
188 else if (streq(n, "arm64"))
189 *ret = SCMP_ARCH_AARCH64;
190#ifdef SCMP_ARCH_LOONGARCH64
191 else if (streq(n, "loongarch64"))
192 *ret = SCMP_ARCH_LOONGARCH64;
193#endif
194 else if (streq(n, "mips"))
195 *ret = SCMP_ARCH_MIPS;
196 else if (streq(n, "mips64"))
197 *ret = SCMP_ARCH_MIPS64;
198 else if (streq(n, "mips64-n32"))
199 *ret = SCMP_ARCH_MIPS64N32;
200 else if (streq(n, "mips-le"))
201 *ret = SCMP_ARCH_MIPSEL;
202 else if (streq(n, "mips64-le"))
203 *ret = SCMP_ARCH_MIPSEL64;
204 else if (streq(n, "mips64-le-n32"))
205 *ret = SCMP_ARCH_MIPSEL64N32;
206#ifdef SCMP_ARCH_PARISC
207 else if (streq(n, "parisc"))
208 *ret = SCMP_ARCH_PARISC;
209#endif
210#ifdef SCMP_ARCH_PARISC64
211 else if (streq(n, "parisc64"))
212 *ret = SCMP_ARCH_PARISC64;
213#endif
214 else if (streq(n, "ppc"))
215 *ret = SCMP_ARCH_PPC;
216 else if (streq(n, "ppc64"))
217 *ret = SCMP_ARCH_PPC64;
218 else if (streq(n, "ppc64-le"))
219 *ret = SCMP_ARCH_PPC64LE;
220#ifdef SCMP_ARCH_RISCV64
221 else if (streq(n, "riscv64"))
222 *ret = SCMP_ARCH_RISCV64;
223#endif
224 else if (streq(n, "s390"))
225 *ret = SCMP_ARCH_S390;
226 else if (streq(n, "s390x"))
227 *ret = SCMP_ARCH_S390X;
228 else
229 return -EINVAL;
230
231 return 0;
232}
233
234int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
235 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
236 int r;
237
238 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
239 * any others. Also, turns off the NNP fiddling. */
240
241 seccomp = seccomp_init(default_action);
242 if (!seccomp)
243 return -ENOMEM;
244
245 if (arch != SCMP_ARCH_NATIVE &&
246 arch != seccomp_arch_native()) {
247
248 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
249 if (r < 0)
250 return r;
251
252 r = seccomp_arch_add(seccomp, arch);
253 if (r < 0)
254 return r;
255
256 assert(seccomp_arch_exist(seccomp, arch) >= 0);
257 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
258 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
259 } else {
260 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
261 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
262 }
263
264 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
265 if (r < 0)
266 return r;
267
268 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
269 if (r < 0)
270 return r;
271
272#if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
273 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
274 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
275 if (r < 0)
276 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
277 }
278#endif
279
280 *ret = TAKE_PTR(seccomp);
281 return 0;
282}
283
static bool is_basic_seccomp_available(void) {
        /* Queries the calling thread's seccomp mode. Any successful answer counts as "available";
         * a failing call counts as "not available". */
        int mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
287
static bool is_seccomp_filter_available(void) {
        /* Probes for filter mode by asking the kernel to install a NULL filter program. Only the
         * combination "call failed" + "errno is EFAULT" is taken as proof that filter mode is
         * available (presumably because the kernel got as far as dereferencing the pointer;
         * see prctl(2)). A successful call or any other errno yields false. */
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
292
bool is_seccomp_available(void) {
        /* Reports whether seccomp filtering can be used. The answer is computed on the first call
         * and cached in a function-local static for all later calls.
         *
         * $SYSTEMD_SECCOMP, read via secure_getenv_bool(), may turn seccomp off: a value parsing
         * as false disables it without probing the kernel. If the variable is unset, true, or
         * unparsable, the kernel is probed with both checks above. */

        static int cached_enabled = -1;
        int b;

        if (cached_enabled >= 0)
                return cached_enabled;

        b = secure_getenv_bool("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly disabled by the environment. */
                cached_enabled = false;
                return cached_enabled;
        }

        /* An invalid value is reported, including the reason via %m, and then treated as if the
         * variable was not set. */
        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring: %m");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
313
314const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
315 [SYSCALL_FILTER_SET_DEFAULT] = {
316 .name = "@default",
317 .help = "System calls that are always permitted",
318 .value =
319 "@sandbox\0"
320 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
321 "brk\0"
322 "cacheflush\0"
323 "clock_getres\0"
324 "clock_getres_time64\0"
325 "clock_gettime\0"
326 "clock_gettime64\0"
327 "clock_nanosleep\0"
328 "clock_nanosleep_time64\0"
329 "execve\0"
330 "exit\0"
331 "exit_group\0"
332 "futex\0"
333 "futex_time64\0"
334 "futex_waitv\0"
335 "get_robust_list\0"
336 "get_thread_area\0"
337 "getegid\0"
338 "getegid32\0"
339 "geteuid\0"
340 "geteuid32\0"
341 "getgid\0"
342 "getgid32\0"
343 "getgroups\0"
344 "getgroups32\0"
345 "getpgid\0"
346 "getpgrp\0"
347 "getpid\0"
348 "getppid\0"
349 "getrandom\0"
350 "getresgid\0"
351 "getresgid32\0"
352 "getresuid\0"
353 "getresuid32\0"
354 "getrlimit\0" /* make sure processes can query stack size and such */
355 "getsid\0"
356 "gettid\0"
357 "gettimeofday\0"
358 "getuid\0"
359 "getuid32\0"
360 "membarrier\0"
361 "mmap\0"
362 "mmap2\0"
363 "mprotect\0"
364 "mseal\0"
365 "munmap\0"
366 "nanosleep\0"
367 "pause\0"
368 "prlimit64\0"
369 "restart_syscall\0"
370 "riscv_flush_icache\0"
371 "riscv_hwprobe\0"
372 "rseq\0"
373 "rt_sigreturn\0"
374 "sched_getaffinity\0"
375 "sched_yield\0"
376 "set_robust_list\0"
377 "set_thread_area\0"
378 "set_tid_address\0"
379 "set_tls\0"
380 "sigreturn\0"
381 "time\0"
382 "ugetrlimit\0"
383 "uretprobe\0"
384 },
385 [SYSCALL_FILTER_SET_AIO] = {
386 .name = "@aio",
387 .help = "Asynchronous IO",
388 .value =
389 "io_cancel\0"
390 "io_destroy\0"
391 "io_getevents\0"
392 "io_pgetevents\0"
393 "io_pgetevents_time64\0"
394 "io_setup\0"
395 "io_submit\0"
396 "io_uring_enter\0"
397 "io_uring_register\0"
398 "io_uring_setup\0"
399 },
400 [SYSCALL_FILTER_SET_BASIC_IO] = {
401 .name = "@basic-io",
402 .help = "Basic IO",
403 .value =
404 "_llseek\0"
405 "close\0"
406 "close_range\0"
407 "dup\0"
408 "dup2\0"
409 "dup3\0"
410 "llseek\0"
411 "lseek\0"
412 "pread64\0"
413 "preadv\0"
414 "preadv2\0"
415 "pwrite64\0"
416 "pwritev\0"
417 "pwritev2\0"
418 "read\0"
419 "readv\0"
420 "write\0"
421 "writev\0"
422 },
423 [SYSCALL_FILTER_SET_CHOWN] = {
424 .name = "@chown",
425 .help = "Change ownership of files and directories",
426 .value =
427 "chown\0"
428 "chown32\0"
429 "fchown\0"
430 "fchown32\0"
431 "fchownat\0"
432 "lchown\0"
433 "lchown32\0"
434 },
435 [SYSCALL_FILTER_SET_CLOCK] = {
436 .name = "@clock",
437 .help = "Change the system time",
438 .value =
439 "adjtimex\0"
440 "clock_adjtime\0"
441 "clock_adjtime64\0"
442 "clock_settime\0"
443 "clock_settime64\0"
444 "settimeofday\0"
445 },
446 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
447 .name = "@cpu-emulation",
448 .help = "System calls for CPU emulation functionality",
449 .value =
450 "modify_ldt\0"
451 "subpage_prot\0"
452 "switch_endian\0"
453 "vm86\0"
454 "vm86old\0"
455 },
456 [SYSCALL_FILTER_SET_DEBUG] = {
457 .name = "@debug",
458 .help = "Debugging, performance monitoring and tracing functionality",
459 .value =
460 "lookup_dcookie\0"
461 "perf_event_open\0"
462 "pidfd_getfd\0"
463 "ptrace\0"
464 "rtas\0"
465 "s390_runtime_instr\0"
466 "sys_debug_setcontext\0"
467 },
468 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
469 .name = "@file-system",
470 .help = "File system operations",
471 .value =
472 "access\0"
473 "chdir\0"
474 "chmod\0"
475 "close\0"
476 "creat\0"
477 "faccessat\0"
478 "faccessat2\0"
479 "fallocate\0"
480 "fchdir\0"
481 "fchmod\0"
482 "fchmodat\0"
483 "fchmodat2\0"
484 "fcntl\0"
485 "fcntl64\0"
486 "fgetxattr\0"
487 "flistxattr\0"
488 "fremovexattr\0"
489 "fsetxattr\0"
490 "fstat\0"
491 "fstat64\0"
492 "fstatat\0"
493 "fstatat64\0"
494 "fstatfs\0"
495 "fstatfs64\0"
496 "ftruncate\0"
497 "ftruncate64\0"
498 "futimesat\0"
499 "getcwd\0"
500 "getdents\0"
501 "getdents64\0"
502 "getxattr\0"
503 "getxattrat\0"
504 "inotify_add_watch\0"
505 "inotify_init\0"
506 "inotify_init1\0"
507 "inotify_rm_watch\0"
508 "lgetxattr\0"
509 "link\0"
510 "linkat\0"
511 "listmount\0"
512 "listxattr\0"
513 "listxattrat\0"
514 "llistxattr\0"
515 "lremovexattr\0"
516 "lsetxattr\0"
517 "lstat\0"
518 "lstat64\0"
519 "mkdir\0"
520 "mkdirat\0"
521 "mknod\0"
522 "mknodat\0"
523 "newfstat\0"
524 "newfstatat\0"
525 "oldfstat\0"
526 "oldlstat\0"
527 "oldstat\0"
528 "open\0"
529 "openat\0"
530 "openat2\0"
531 "readlink\0"
532 "readlinkat\0"
533 "removexattr\0"
534 "removexattrat\0"
535 "rename\0"
536 "renameat\0"
537 "renameat2\0"
538 "rmdir\0"
539 "setxattr\0"
540 "setxattrat\0"
541 "stat\0"
542 "stat64\0"
543 "statfs\0"
544 "statfs64\0"
545 "statmount\0"
546 "statx\0"
547 "symlink\0"
548 "symlinkat\0"
549 "truncate\0"
550 "truncate64\0"
551 "unlink\0"
552 "unlinkat\0"
553 "utime\0"
554 "utimensat\0"
555 "utimensat_time64\0"
556 "utimes\0"
557 },
558 [SYSCALL_FILTER_SET_IO_EVENT] = {
559 .name = "@io-event",
560 .help = "Event loop system calls",
561 .value =
562 "_newselect\0"
563 "epoll_create\0"
564 "epoll_create1\0"
565 "epoll_ctl\0"
566 "epoll_ctl_old\0"
567 "epoll_pwait\0"
568 "epoll_pwait2\0"
569 "epoll_wait\0"
570 "epoll_wait_old\0"
571 "eventfd\0"
572 "eventfd2\0"
573 "poll\0"
574 "ppoll\0"
575 "ppoll_time64\0"
576 "pselect6\0"
577 "pselect6_time64\0"
578 "select\0"
579 },
580 [SYSCALL_FILTER_SET_IPC] = {
581 .name = "@ipc",
582 .help = "SysV IPC, POSIX Message Queues or other IPC",
583 .value =
584 "ipc\0"
585 "memfd_create\0"
586 "mq_getsetattr\0"
587 "mq_notify\0"
588 "mq_open\0"
589 "mq_timedreceive\0"
590 "mq_timedreceive_time64\0"
591 "mq_timedsend\0"
592 "mq_timedsend_time64\0"
593 "mq_unlink\0"
594 "msgctl\0"
595 "msgget\0"
596 "msgrcv\0"
597 "msgsnd\0"
598 "pipe\0"
599 "pipe2\0"
600 "process_madvise\0"
601 "process_vm_readv\0"
602 "process_vm_writev\0"
603 "semctl\0"
604 "semget\0"
605 "semop\0"
606 "semtimedop\0"
607 "semtimedop_time64\0"
608 "shmat\0"
609 "shmctl\0"
610 "shmdt\0"
611 "shmget\0"
612 },
613 [SYSCALL_FILTER_SET_KEYRING] = {
614 .name = "@keyring",
615 .help = "Kernel keyring access",
616 .value =
617 "add_key\0"
618 "keyctl\0"
619 "request_key\0"
620 },
621 [SYSCALL_FILTER_SET_MEMLOCK] = {
622 .name = "@memlock",
623 .help = "Memory locking control",
624 .value =
625 "mlock\0"
626 "mlock2\0"
627 "mlockall\0"
628 "munlock\0"
629 "munlockall\0"
630 },
631 [SYSCALL_FILTER_SET_MODULE] = {
632 .name = "@module",
633 .help = "Loading and unloading of kernel modules",
634 .value =
635 "delete_module\0"
636 "finit_module\0"
637 "init_module\0"
638 },
639 [SYSCALL_FILTER_SET_MOUNT] = {
640 .name = "@mount",
641 .help = "Mounting and unmounting of file systems",
642 .value =
643 "chroot\0"
644 "fsconfig\0"
645 "fsmount\0"
646 "fsopen\0"
647 "fspick\0"
648 "mount\0"
649 "mount_setattr\0"
650 "move_mount\0"
651 "open_tree\0"
652 "open_tree_attr\0"
653 "pivot_root\0"
654 "umount\0"
655 "umount2\0"
656 },
657 [SYSCALL_FILTER_SET_NETWORK_IO] = {
658 .name = "@network-io",
659 .help = "Network or Unix socket IO, should not be needed if not network facing",
660 .value =
661 "accept\0"
662 "accept4\0"
663 "bind\0"
664 "connect\0"
665 "getpeername\0"
666 "getsockname\0"
667 "getsockopt\0"
668 "listen\0"
669 "recv\0"
670 "recvfrom\0"
671 "recvmmsg\0"
672 "recvmmsg_time64\0"
673 "recvmsg\0"
674 "send\0"
675 "sendmmsg\0"
676 "sendmsg\0"
677 "sendto\0"
678 "setsockopt\0"
679 "shutdown\0"
680 "socket\0"
681 "socketcall\0"
682 "socketpair\0"
683 },
684 [SYSCALL_FILTER_SET_OBSOLETE] = {
685 /* some unknown even to libseccomp */
686 .name = "@obsolete",
687 .help = "Unusual, obsolete or unimplemented system calls",
688 .value =
689 "_sysctl\0"
690 "afs_syscall\0"
691 "bdflush\0"
692 "break\0"
693 "create_module\0"
694 "ftime\0"
695 "get_kernel_syms\0"
696 "getpmsg\0"
697 "gtty\0"
698 "idle\0"
699 "lock\0"
700 "mpx\0"
701 "prof\0"
702 "profil\0"
703 "putpmsg\0"
704 "query_module\0"
705 "security\0"
706 "sgetmask\0"
707 "ssetmask\0"
708 "stime\0"
709 "stty\0"
710 "sysfs\0"
711 "tuxcall\0"
712 "ulimit\0"
713 "uselib\0"
714 "ustat\0"
715 "vserver\0"
716 },
717 [SYSCALL_FILTER_SET_PKEY] = {
718 .name = "@pkey",
719 .help = "System calls used for memory protection keys",
720 .value =
721 "pkey_alloc\0"
722 "pkey_free\0"
723 "pkey_mprotect\0"
724 },
725 [SYSCALL_FILTER_SET_PRIVILEGED] = {
726 .name = "@privileged",
727 .help = "All system calls which need super-user capabilities",
728 .value =
729 "@chown\0"
730 "@clock\0"
731 "@module\0"
732 "@raw-io\0"
733 "@reboot\0"
734 "@swap\0"
735 "_sysctl\0"
736 "acct\0"
737 "bpf\0"
738 "capset\0"
739 "chroot\0"
740 "fanotify_init\0"
741 "fanotify_mark\0"
742 "nfsservctl\0"
743 "open_by_handle_at\0"
744 "pivot_root\0"
745 "quotactl\0"
746 "quotactl_fd\0"
747 "setdomainname\0"
748 "setfsuid\0"
749 "setfsuid32\0"
750 "setgroups\0"
751 "setgroups32\0"
752 "sethostname\0"
753 "setresuid\0"
754 "setresuid32\0"
755 "setreuid\0"
756 "setreuid32\0"
757 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
758 "setuid32\0"
759 "vhangup\0"
760 },
761 [SYSCALL_FILTER_SET_PROCESS] = {
762 .name = "@process",
763 .help = "Process control, execution, namespacing operations",
764 .value =
765 "capget\0" /* Able to query arbitrary processes */
766 "clone\0"
767 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
768 * implement seccomp, so we don't need to list it at all. C.f.
769 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
770 "clone3\0"
771 "execveat\0"
772 "fork\0"
773 "getrusage\0"
774 "kill\0"
775 "pidfd_open\0"
776 "pidfd_send_signal\0"
777 "prctl\0"
778 "rt_sigqueueinfo\0"
779 "rt_tgsigqueueinfo\0"
780 "setns\0"
781 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
782 "tgkill\0"
783 "times\0"
784 "tkill\0"
785 "unshare\0"
786 "vfork\0"
787 "wait4\0"
788 "waitid\0"
789 "waitpid\0"
790 },
791 [SYSCALL_FILTER_SET_RAW_IO] = {
792 .name = "@raw-io",
793 .help = "Raw I/O port access",
794 .value =
795 "ioperm\0"
796 "iopl\0"
797 "pciconfig_iobase\0"
798 "pciconfig_read\0"
799 "pciconfig_write\0"
800 "s390_pci_mmio_read\0"
801 "s390_pci_mmio_write\0"
802 },
803 [SYSCALL_FILTER_SET_REBOOT] = {
804 .name = "@reboot",
805 .help = "Reboot and reboot preparation/kexec",
806 .value =
807 "kexec_file_load\0"
808 "kexec_load\0"
809 "reboot\0"
810 },
811 [SYSCALL_FILTER_SET_RESOURCES] = {
812 .name = "@resources",
813 .help = "Alter resource settings",
814 .value =
815 "ioprio_set\0"
816 "mbind\0"
817 "migrate_pages\0"
818 "move_pages\0"
819 "nice\0"
820 "sched_setaffinity\0"
821 "sched_setattr\0"
822 "sched_setparam\0"
823 "sched_setscheduler\0"
824 "set_mempolicy\0"
825 "set_mempolicy_home_node\0"
826 "setpriority\0"
827 "setrlimit\0"
828 },
829 [SYSCALL_FILTER_SET_SANDBOX] = {
830 .name = "@sandbox",
831 .help = "Sandbox functionality",
832 .value =
833 "landlock_add_rule\0"
834 "landlock_create_ruleset\0"
835 "landlock_restrict_self\0"
836 "seccomp\0"
837 },
838 [SYSCALL_FILTER_SET_SETUID] = {
839 .name = "@setuid",
840 .help = "Operations for changing user/group credentials",
841 .value =
842 "setgid\0"
843 "setgid32\0"
844 "setgroups\0"
845 "setgroups32\0"
846 "setregid\0"
847 "setregid32\0"
848 "setresgid\0"
849 "setresgid32\0"
850 "setresuid\0"
851 "setresuid32\0"
852 "setreuid\0"
853 "setreuid32\0"
854 "setuid\0"
855 "setuid32\0"
856 },
857 [SYSCALL_FILTER_SET_SIGNAL] = {
858 .name = "@signal",
859 .help = "Process signal handling",
860 .value =
861 "rt_sigaction\0"
862 "rt_sigpending\0"
863 "rt_sigprocmask\0"
864 "rt_sigsuspend\0"
865 "rt_sigtimedwait\0"
866 "rt_sigtimedwait_time64\0"
867 "sigaction\0"
868 "sigaltstack\0"
869 "signal\0"
870 "signalfd\0"
871 "signalfd4\0"
872 "sigpending\0"
873 "sigprocmask\0"
874 "sigsuspend\0"
875 },
876 [SYSCALL_FILTER_SET_SWAP] = {
877 .name = "@swap",
878 .help = "Enable/disable swap devices",
879 .value =
880 "swapoff\0"
881 "swapon\0"
882 },
883 [SYSCALL_FILTER_SET_SYNC] = {
884 .name = "@sync",
885 .help = "Synchronize files and memory to storage",
886 .value =
887 /* Please also update the list in seccomp_suppress_sync(). */
888 "fdatasync\0"
889 "fsync\0"
890 "msync\0"
891 "sync\0"
892 "sync_file_range\0"
893 "sync_file_range2\0"
894 "syncfs\0"
895 },
896 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
897 .name = "@system-service",
898 .help = "General system service operations",
899 .value =
900 "@aio\0"
901 "@basic-io\0"
902 "@chown\0"
903 "@default\0"
904 "@file-system\0"
905 "@io-event\0"
906 "@ipc\0"
907 "@keyring\0"
908 "@memlock\0"
909 "@network-io\0"
910 "@process\0"
911 "@resources\0"
912 "@setuid\0"
913 "@signal\0"
914 "@sync\0"
915 "@timer\0"
916 "arm_fadvise64_64\0"
917 "capget\0"
918 "capset\0"
919 "copy_file_range\0"
920 "fadvise64\0"
921 "fadvise64_64\0"
922 "flock\0"
923 "get_mempolicy\0"
924 "getcpu\0"
925 "getpriority\0"
926 "ioctl\0"
927 "ioprio_get\0"
928 "kcmp\0"
929 "madvise\0"
930 "mremap\0"
931 "name_to_handle_at\0"
932 "oldolduname\0"
933 "olduname\0"
934 "personality\0"
935 "readahead\0"
936 "readdir\0"
937 "remap_file_pages\0"
938 "sched_get_priority_max\0"
939 "sched_get_priority_min\0"
940 "sched_getattr\0"
941 "sched_getparam\0"
942 "sched_getscheduler\0"
943 "sched_rr_get_interval\0"
944 "sched_rr_get_interval_time64\0"
945 "sched_yield\0"
946 "sendfile\0"
947 "sendfile64\0"
948 "setfsgid\0"
949 "setfsgid32\0"
950 "setfsuid\0"
951 "setfsuid32\0"
952 "setpgid\0"
953 "setsid\0"
954 "splice\0"
955 "sysinfo\0"
956 "tee\0"
957 "umask\0"
958 "uname\0"
959 "userfaultfd\0"
960 "vmsplice\0"
961 },
962 [SYSCALL_FILTER_SET_TIMER] = {
963 .name = "@timer",
964 .help = "Schedule operations by time",
965 .value =
966 "alarm\0"
967 "getitimer\0"
968 "setitimer\0"
969 "timer_create\0"
970 "timer_delete\0"
971 "timer_getoverrun\0"
972 "timer_gettime\0"
973 "timer_gettime64\0"
974 "timer_settime\0"
975 "timer_settime64\0"
976 "timerfd_create\0"
977 "timerfd_gettime\0"
978 "timerfd_gettime64\0"
979 "timerfd_settime\0"
980 "timerfd_settime64\0"
981 "times\0"
982 },
983 [SYSCALL_FILTER_SET_KNOWN] = {
984 .name = "@known",
985 .help = "All known syscalls declared in the kernel",
986 .value =
987 "@obsolete\0"
988#include "syscall-list.inc"
989 },
990};
991
992const SyscallFilterSet *syscall_filter_set_find(const char *name) {
993 if (isempty(name) || name[0] != '@')
994 return NULL;
995
996 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
997 if (streq(syscall_filter_sets[i].name, name))
998 return syscall_filter_sets + i;
999
1000 return NULL;
1001}
1002
1003static int add_syscall_filter_set(
1004 scmp_filter_ctx seccomp,
1005 const SyscallFilterSet *set,
1006 uint32_t action,
1007 char **exclude,
1008 bool log_missing,
1009 char ***added);
1010
1011int seccomp_add_syscall_filter_item(
1012 scmp_filter_ctx *seccomp,
1013 const char *name,
1014 uint32_t action,
1015 char **exclude,
1016 bool log_missing,
1017 char ***added) {
1018
1019 assert(seccomp);
1020 assert(name);
1021
1022 if (strv_contains(exclude, name))
1023 return 0;
1024
1025 /* Any syscalls that are handled are added to the *added strv. The pointer
1026 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1027
1028 if (name[0] == '@') {
1029 const SyscallFilterSet *other;
1030
1031 other = syscall_filter_set_find(name);
1032 if (!other)
1033 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1034 "Filter set %s is not known!",
1035 name);
1036
1037 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
1038
1039 } else {
1040 int id, r;
1041
1042 id = seccomp_syscall_resolve_name(name);
1043 if (id == __NR_SCMP_ERROR) {
1044 if (log_missing)
1045 log_debug("System call %s is not known, ignoring.", name);
1046 return 0;
1047 }
1048
1049 r = seccomp_rule_add_exact(seccomp, action, id, 0);
1050 if (r < 0) {
1051 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1052 bool ignore = r == -EDOM;
1053
1054 if (!ignore || log_missing)
1055 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1056 name, id, ignore ? ", ignoring" : "");
1057 if (!ignore)
1058 return r;
1059 }
1060
1061 if (added) {
1062 r = strv_extend(added, name);
1063 if (r < 0)
1064 return r;
1065 }
1066
1067 return 0;
1068 }
1069}
1070
1071static int add_syscall_filter_set(
1072 scmp_filter_ctx seccomp,
1073 const SyscallFilterSet *set,
1074 uint32_t action,
1075 char **exclude,
1076 bool log_missing,
1077 char ***added) {
1078
1079 int r;
1080
1081 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1082
1083 assert(seccomp);
1084 assert(set);
1085
1086 NULSTR_FOREACH(sys, set->value) {
1087 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1088 if (r < 0)
1089 return r;
1090 }
1091
1092 return 0;
1093}
1094
1095static uint32_t override_default_action(uint32_t default_action) {
1096 /* When the requested filter is an allow-list, and the default action is something critical, we
1097 * install ENOSYS as the default action, but it will only apply to syscalls which are not in the
1098 * @known set. */
1099
1100 if (default_action == SCMP_ACT_ALLOW)
1101 return default_action;
1102
1103#ifdef SCMP_ACT_LOG
1104 if (default_action == SCMP_ACT_LOG)
1105 return default_action;
1106#endif
1107
1108 return SCMP_ACT_ERRNO(ENOSYS);
1109}
1110
1111int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1112 uint32_t arch, default_action_override;
1113 int r;
1114
1115 assert(set);
1116
1117 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1118 * each local arch. */
1119
1120 default_action_override = override_default_action(default_action);
1121
1122 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1123 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1124 _cleanup_strv_free_ char **added = NULL;
1125
1126 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1127
1128 r = seccomp_init_for_arch(&seccomp, arch, default_action_override);
1129 if (r < 0)
1130 return r;
1131
1132 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, &added);
1133 if (r < 0)
1134 return log_debug_errno(r, "Failed to add filter set: %m");
1135
1136 if (default_action != default_action_override)
1137 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1138 int id;
1139
1140 id = seccomp_syscall_resolve_name(name);
1141 if (id < 0)
1142 continue;
1143
1144 /* Ignore the syscall if it was already handled above */
1145 if (strv_contains(added, name))
1146 continue;
1147
1148 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1149 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1150 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1151 name, id);
1152 }
1153
1154#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1155 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1156 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2);
1157 if (r < 0)
1158 log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1159#endif
1160
1161 r = seccomp_load(seccomp);
1162 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1163 return r;
1164 if (r < 0)
1165 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m",
1166 seccomp_arch_to_string(arch));
1167 }
1168
1169 return 0;
1170}
1171
1172int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1173 uint32_t arch, default_action_override;
1174 int r;
1175
1176 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1177 * of a SyscallFilterSet* table. */
1178
1179 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1180 return 0;
1181
1182 default_action_override = override_default_action(default_action);
1183
1184 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1185 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1186 void *syscall_id, *val;
1187
1188 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1189
1190 r = seccomp_init_for_arch(&seccomp, arch, default_action_override);
1191 if (r < 0)
1192 return r;
1193
1194 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1195 uint32_t a = action;
1196 int id = PTR_TO_INT(syscall_id) - 1;
1197 int error = PTR_TO_INT(val);
1198
1199 if (error == SECCOMP_ERROR_NUMBER_KILL)
1200 a = scmp_act_kill_process();
1201#ifdef SCMP_ACT_LOG
1202 else if (action == SCMP_ACT_LOG)
1203 a = SCMP_ACT_LOG;
1204#endif
1205 else if (error >= 0)
1206 a = SCMP_ACT_ERRNO(error);
1207
1208 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1209 if (r < 0) {
1210 /* If the system call is not known on this architecture, then that's
1211 * fine, let's ignore it */
1212 _cleanup_free_ char *n = NULL;
1213 bool ignore;
1214
1215 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1216 ignore = r == -EDOM;
1217 if (!ignore || log_missing)
1218 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1219 strna(n), id, ignore ? ", ignoring" : "");
1220 if (!ignore)
1221 return r;
1222 }
1223 }
1224
1225 if (default_action != default_action_override)
1226 NULSTR_FOREACH(name, syscall_filter_sets[SYSCALL_FILTER_SET_KNOWN].value) {
1227 int id;
1228
1229 id = seccomp_syscall_resolve_name(name);
1230 if (id < 0)
1231 continue;
1232
1233 /* Ignore the syscall if it was already handled above */
1234 if (hashmap_contains(filter, INT_TO_PTR(id + 1)))
1235 continue;
1236
1237 r = seccomp_rule_add_exact(seccomp, default_action, id, 0);
1238 if (r < 0 && r != -EDOM) /* EDOM means that the syscall is not available for arch */
1239 return log_debug_errno(r, "Failed to add rule for system call %s() / %d: %m",
1240 name, id);
1241 }
1242
1243#if (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 5) || SCMP_VER_MAJOR > 2
1244 /* We have a large filter here, so let's turn on the binary tree mode if possible. */
1245 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_OPTIMIZE, 2);
1246 if (r < 0)
1247 log_warning_errno(r, "Failed to set SCMP_FLTATR_CTL_OPTIMIZE, ignoring: %m");
1248#endif
1249
1250 r = seccomp_load(seccomp);
1251 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1252 return r;
1253 if (r < 0)
1254 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1255 seccomp_arch_to_string(arch));
1256 }
1257
1258 return 0;
1259}
1260
1261int seccomp_parse_syscall_filter(
1262 const char *name,
1263 int errno_num,
1264 Hashmap *filter,
1265 SeccompParseFlags flags,
1266 const char *unit,
1267 const char *filename,
1268 unsigned line) {
1269
1270 int r;
1271
1272 assert(name);
1273 assert(filter);
1274
1275 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1276 return -EINVAL;
1277
1278 if (name[0] == '@') {
1279 const SyscallFilterSet *set;
1280
1281 set = syscall_filter_set_find(name);
1282 if (!set) {
1283 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1284 return -EINVAL;
1285
1286 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1287 "Unknown system call group, ignoring: %s", name);
1288 return 0;
1289 }
1290
1291 NULSTR_FOREACH(i, set->value) {
1292 /* Call ourselves again, for the group to parse. Note that we downgrade logging here
1293 * (i.e. take away the SECCOMP_PARSE_LOG flag) since any issues in the group table
1294 * are our own problem, not a problem in user configuration data and we shouldn't
1295 * pretend otherwise by complaining about them. */
1296 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1297 if (r < 0)
1298 return r;
1299 }
1300 } else {
1301 int id;
1302
1303 id = seccomp_syscall_resolve_name(name);
1304 if (id == __NR_SCMP_ERROR) {
1305 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1306 return -EINVAL;
1307
1308 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1309 "System call %s is not known, ignoring.", name);
1310 return 0;
1311 }
1312
1313 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove it
1314 * from the list. The entries in allow-list with non-negative error value will be handled
1315 * with SCMP_ACT_ERRNO() instead of the default action. */
1316 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1317 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1318 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1319 if (r < 0)
1320 switch (r) {
1321 case -ENOMEM:
1322 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1323 case -EEXIST:
1324 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1325 break;
1326 default:
1327 return r;
1328 }
1329 } else
1330 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1331 }
1332
1333 return 0;
1334}
1335
1336int seccomp_restrict_namespaces(unsigned long retain) {
1337 uint32_t arch;
1338 int r;
1339
1340 if (DEBUG_LOGGING) {
1341 _cleanup_free_ char *s = NULL;
1342
1343 (void) namespace_flags_to_string(retain, &s);
1344 log_debug("Restricting namespace to: %s.", strna(s));
1345 }
1346
1347 /* NOOP? */
1348 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1349 return 0;
1350
1351 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1352 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1353
1354 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1355
1356 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1357 if (r < 0)
1358 return r;
1359
1360 /* We cannot filter on individual flags to clone3(), and we need to disable the
1361 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1362 * users shall fall back to clone(), as if on an older kernel.
1363 *
1364 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1365 * https://github.com/moby/moby/issues/42680. */
1366
1367 r = seccomp_rule_add_exact(
1368 seccomp,
1369 SCMP_ACT_ERRNO(ENOSYS),
1370 SCMP_SYS(clone3),
1371 0);
1372 if (r < 0)
1373 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m",
1374 seccomp_arch_to_string(arch));
1375
1376 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1377 /* If every single kind of namespace shall be prohibited, then let's block the whole
1378 * setns() syscall altogether. */
1379 r = seccomp_rule_add_exact(
1380 seccomp,
1381 SCMP_ACT_ERRNO(EPERM),
1382 SCMP_SYS(setns),
1383 0);
1384 else
1385 /* Otherwise, block only the invocations with the appropriate flags in the loop
1386 * below, but also the special invocation with a zero flags argument, right here. */
1387 r = seccomp_rule_add_exact(
1388 seccomp,
1389 SCMP_ACT_ERRNO(EPERM),
1390 SCMP_SYS(setns),
1391 1,
1392 SCMP_A1(SCMP_CMP_EQ, 0));
1393 if (r < 0) {
1394 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1395 seccomp_arch_to_string(arch));
1396 continue;
1397 }
1398
1399 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
1400 unsigned long f;
1401
1402 f = namespace_info[i].clone_flag;
1403 if (FLAGS_SET(retain, f)) {
1404 log_debug("Permitting %s.", namespace_info[i].proc_name);
1405 continue;
1406 }
1407
1408 log_trace("Blocking %s.", namespace_info[i].proc_name);
1409
1410 r = seccomp_rule_add_exact(
1411 seccomp,
1412 SCMP_ACT_ERRNO(EPERM),
1413 SCMP_SYS(unshare),
1414 1,
1415 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1416 if (r < 0) {
1417 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m",
1418 seccomp_arch_to_string(arch));
1419 break;
1420 }
1421
1422 /* On s390/s390x the first two parameters to clone are switched */
1423 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1424 r = seccomp_rule_add_exact(
1425 seccomp,
1426 SCMP_ACT_ERRNO(EPERM),
1427 SCMP_SYS(clone),
1428 1,
1429 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1430 else
1431 r = seccomp_rule_add_exact(
1432 seccomp,
1433 SCMP_ACT_ERRNO(EPERM),
1434 SCMP_SYS(clone),
1435 1,
1436 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1437 if (r < 0) {
1438 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m",
1439 seccomp_arch_to_string(arch));
1440 break;
1441 }
1442
1443 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1444 r = seccomp_rule_add_exact(
1445 seccomp,
1446 SCMP_ACT_ERRNO(EPERM),
1447 SCMP_SYS(setns),
1448 1,
1449 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1450 if (r < 0) {
1451 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m",
1452 seccomp_arch_to_string(arch));
1453 break;
1454 }
1455 }
1456 }
1457 if (r < 0)
1458 continue;
1459
1460 r = seccomp_load(seccomp);
1461 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1462 return r;
1463 if (r < 0)
1464 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m",
1465 seccomp_arch_to_string(arch));
1466 }
1467
1468 return 0;
1469}
1470
1471int seccomp_protect_sysctl(void) {
1472 uint32_t arch;
1473 int r;
1474
1475 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1476 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1477
1478 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1479
1480 if (IN_SET(arch,
1481 SCMP_ARCH_AARCH64,
1482#ifdef SCMP_ARCH_LOONGARCH64
1483 SCMP_ARCH_LOONGARCH64,
1484#endif
1485#ifdef SCMP_ARCH_RISCV64
1486 SCMP_ARCH_RISCV64,
1487#endif
1488 SCMP_ARCH_X32
1489 ))
1490 /* No _sysctl syscall */
1491 continue;
1492
1493 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1494 if (r < 0)
1495 return r;
1496
1497 r = seccomp_rule_add_exact(
1498 seccomp,
1499 SCMP_ACT_ERRNO(EPERM),
1500 SCMP_SYS(_sysctl),
1501 0);
1502 if (r < 0) {
1503 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m",
1504 seccomp_arch_to_string(arch));
1505 continue;
1506 }
1507
1508 r = seccomp_load(seccomp);
1509 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1510 return r;
1511 if (r < 0)
1512 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m",
1513 seccomp_arch_to_string(arch));
1514 }
1515
1516 return 0;
1517}
1518
1519int seccomp_protect_syslog(void) {
1520 uint32_t arch;
1521 int r;
1522
1523 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1524 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1525
1526 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1527 if (r < 0)
1528 return r;
1529
1530 r = seccomp_rule_add_exact(
1531 seccomp,
1532 SCMP_ACT_ERRNO(EPERM),
1533 SCMP_SYS(syslog),
1534 0);
1535
1536 if (r < 0) {
1537 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1538 continue;
1539 }
1540
1541 r = seccomp_load(seccomp);
1542 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1543 return r;
1544 if (r < 0)
1545 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m",
1546 seccomp_arch_to_string(arch));
1547 }
1548
1549 return 0;
1550}
1551
1552int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1553 uint32_t arch;
1554 int r;
1555
1556 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1557 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1558 bool supported;
1559
1560 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1561
1562 switch (arch) {
1563
1564 case SCMP_ARCH_X86_64:
1565 case SCMP_ARCH_X32:
1566 case SCMP_ARCH_ARM:
1567 case SCMP_ARCH_AARCH64:
1568#ifdef SCMP_ARCH_LOONGARCH64
1569 case SCMP_ARCH_LOONGARCH64:
1570#endif
1571 case SCMP_ARCH_MIPSEL64N32:
1572 case SCMP_ARCH_MIPS64N32:
1573 case SCMP_ARCH_MIPSEL64:
1574 case SCMP_ARCH_MIPS64:
1575#ifdef SCMP_ARCH_RISCV64
1576 case SCMP_ARCH_RISCV64:
1577#endif
1578 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1579 supported = true;
1580 break;
1581
1582 case SCMP_ARCH_S390:
1583 case SCMP_ARCH_S390X:
1584 case SCMP_ARCH_X86:
1585 case SCMP_ARCH_MIPSEL:
1586 case SCMP_ARCH_MIPS:
1587#ifdef SCMP_ARCH_PARISC
1588 case SCMP_ARCH_PARISC:
1589#endif
1590#ifdef SCMP_ARCH_PARISC64
1591 case SCMP_ARCH_PARISC64:
1592#endif
1593 case SCMP_ARCH_PPC:
1594 case SCMP_ARCH_PPC64:
1595 case SCMP_ARCH_PPC64LE:
1596 default:
1597 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1598 * don't know */
1599 supported = false;
1600 }
1601
1602 if (!supported)
1603 continue;
1604
1605 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1606 if (r < 0)
1607 return r;
1608
1609 if (allow_list) {
1610 int first = 0, last = 0;
1611 void *afp;
1612
1613 /* If this is an allow list, we first block the address families that are out of
1614 * range and then everything that is not in the set. First, we find the lowest and
1615 * highest address family in the set. */
1616
1617 SET_FOREACH(afp, address_families) {
1618 int af = PTR_TO_INT(afp);
1619
1620 if (af <= 0 || af >= af_max())
1621 continue;
1622
1623 if (first == 0 || af < first)
1624 first = af;
1625
1626 if (last == 0 || af > last)
1627 last = af;
1628 }
1629
1630 assert((first == 0) == (last == 0));
1631
1632 if (first == 0) {
1633
1634 /* No entries in the valid range, block everything */
1635 r = seccomp_rule_add_exact(
1636 seccomp,
1637 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1638 SCMP_SYS(socket),
1639 0);
1640 if (r < 0) {
1641 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1642 seccomp_arch_to_string(arch));
1643 continue;
1644 }
1645
1646 } else {
1647
1648 /* Block everything below the first entry */
1649 r = seccomp_rule_add_exact(
1650 seccomp,
1651 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1652 SCMP_SYS(socket),
1653 1,
1654 SCMP_A0(SCMP_CMP_LT, first));
1655 if (r < 0) {
1656 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1657 seccomp_arch_to_string(arch));
1658 continue;
1659 }
1660
1661 /* Block everything above the last entry */
1662 r = seccomp_rule_add_exact(
1663 seccomp,
1664 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1665 SCMP_SYS(socket),
1666 1,
1667 SCMP_A0(SCMP_CMP_GT, last));
1668 if (r < 0) {
1669 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1670 seccomp_arch_to_string(arch));
1671 continue;
1672 }
1673
1674 /* Block everything between the first and last entry */
1675 for (int af = 1; af < af_max(); af++) {
1676
1677 if (set_contains(address_families, INT_TO_PTR(af)))
1678 continue;
1679
1680 r = seccomp_rule_add_exact(
1681 seccomp,
1682 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1683 SCMP_SYS(socket),
1684 1,
1685 SCMP_A0(SCMP_CMP_EQ, af));
1686 if (r < 0)
1687 break;
1688 }
1689 if (r < 0) {
1690 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1691 seccomp_arch_to_string(arch));
1692 continue;
1693 }
1694 }
1695
1696 } else {
1697 void *af;
1698
1699 /* If this is a deny list, then generate one rule for each address family that are
1700 * then combined in OR checks. */
1701
1702 SET_FOREACH(af, address_families) {
1703 r = seccomp_rule_add_exact(
1704 seccomp,
1705 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1706 SCMP_SYS(socket),
1707 1,
1708 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1709 if (r < 0)
1710 break;
1711 }
1712 if (r < 0) {
1713 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m",
1714 seccomp_arch_to_string(arch));
1715 continue;
1716 }
1717 }
1718
1719 r = seccomp_load(seccomp);
1720 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1721 return r;
1722 if (r < 0)
1723 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m",
1724 seccomp_arch_to_string(arch));
1725 }
1726
1727 return 0;
1728}
1729
1730int seccomp_restrict_realtime_full(int error_code) {
1731 static const int permitted_policies[] = {
1732 SCHED_OTHER,
1733 SCHED_BATCH,
1734 SCHED_IDLE,
1735 };
1736
1737 int r, max_policy = 0;
1738 uint32_t arch;
1739
1740 assert(error_code > 0);
1741
1742 /* Determine the highest policy constant we want to allow */
1743 FOREACH_ELEMENT(policy, permitted_policies)
1744 if (*policy > max_policy)
1745 max_policy = *policy;
1746
1747 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1748 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1749 int p;
1750
1751 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1752
1753 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1754 if (r < 0)
1755 return r;
1756
1757 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1758 * allow list. */
1759 for (p = 0; p < max_policy; p++) {
1760 bool good = false;
1761
1762 /* Check if this is in the allow list. */
1763 FOREACH_ELEMENT(policy, permitted_policies)
1764 if (*policy == p) {
1765 good = true;
1766 break;
1767 }
1768
1769 if (good)
1770 continue;
1771
1772 /* Deny this policy */
1773 r = seccomp_rule_add_exact(
1774 seccomp,
1775 SCMP_ACT_ERRNO(error_code),
1776 SCMP_SYS(sched_setscheduler),
1777 1,
1778 SCMP_A1(SCMP_CMP_EQ, p));
1779 if (r < 0) {
1780 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1781 seccomp_arch_to_string(arch));
1782 continue;
1783 }
1784 }
1785
1786 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1787 * are unsigned here, hence no need no check for < 0 values. */
1788 r = seccomp_rule_add_exact(
1789 seccomp,
1790 SCMP_ACT_ERRNO(error_code),
1791 SCMP_SYS(sched_setscheduler),
1792 1,
1793 SCMP_A1(SCMP_CMP_GT, max_policy));
1794 if (r < 0) {
1795 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
1796 seccomp_arch_to_string(arch));
1797 continue;
1798 }
1799
1800 r = seccomp_load(seccomp);
1801 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1802 return r;
1803 if (r < 0)
1804 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m",
1805 seccomp_arch_to_string(arch));
1806 }
1807
1808 return 0;
1809}
1810
1811static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1812 uint32_t arch,
1813 int nr,
1814 unsigned arg_cnt,
1815 const struct scmp_arg_cmp arg) {
1816 int r;
1817
1818 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1819 if (r < 0) {
1820 _cleanup_free_ char *n = NULL;
1821
1822 n = seccomp_syscall_resolve_num_arch(arch, nr);
1823 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1824 strna(n),
1825 seccomp_arch_to_string(arch));
1826 }
1827
1828 return r;
1829}
1830
1831/* For known architectures, check that syscalls are indeed defined or not. */
1832#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || defined(__loongarch_lp64) || (defined(__riscv) && __riscv_xlen == 64)
1833assert_cc(SCMP_SYS(shmget) > 0);
1834assert_cc(SCMP_SYS(shmat) > 0);
1835assert_cc(SCMP_SYS(shmdt) > 0);
1836#endif
1837
1838int seccomp_memory_deny_write_execute(void) {
1839 uint32_t arch;
1840 unsigned loaded = 0;
1841
1842 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1843 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1844 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1845
1846 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1847
1848 switch (arch) {
1849
1850 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1851 * We ignore that here, which means there's still a way to get writable/executable
1852 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1853 *
1854 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1855 * on that front (kernel work done in 5.18).
1856 */
1857
1858 case SCMP_ARCH_X86:
1859 case SCMP_ARCH_S390:
1860 filter_syscall = SCMP_SYS(mmap2);
1861 block_syscall = SCMP_SYS(mmap);
1862 /* shmat multiplexed, see above */
1863 break;
1864
1865 case SCMP_ARCH_PPC:
1866 case SCMP_ARCH_PPC64:
1867 case SCMP_ARCH_PPC64LE:
1868 case SCMP_ARCH_S390X:
1869 filter_syscall = SCMP_SYS(mmap);
1870 /* shmat multiplexed, see above */
1871 break;
1872
1873 case SCMP_ARCH_ARM:
1874 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1875 shmat_syscall = SCMP_SYS(shmat);
1876 break;
1877
1878 case SCMP_ARCH_X86_64:
1879 case SCMP_ARCH_X32:
1880 case SCMP_ARCH_AARCH64:
1881#ifdef SCMP_ARCH_LOONGARCH64
1882 case SCMP_ARCH_LOONGARCH64:
1883#endif
1884#ifdef SCMP_ARCH_RISCV64
1885 case SCMP_ARCH_RISCV64:
1886#endif
1887 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64, loongarch64 and riscv64 have only mmap */
1888 shmat_syscall = SCMP_SYS(shmat);
1889 break;
1890
1891 /* Please add more definitions here, if you port systemd to other architectures! */
1892
1893#if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64) && !defined(__loongarch_lp64)
1894#warning "Consider adding the right mmap() syscall definitions here!"
1895#endif
1896 }
1897
1898 /* Can't filter mmap() on this arch, then skip it */
1899 if (filter_syscall == 0)
1900 continue;
1901
1902 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1903 if (r < 0)
1904 return r;
1905
1906 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1907 1,
1908 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1909 if (r < 0)
1910 continue;
1911
1912 if (block_syscall != 0) {
1913 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1914 if (r < 0)
1915 continue;
1916 }
1917
1918 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1919 1,
1920 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1921 if (r < 0)
1922 continue;
1923
1924 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1925 1,
1926 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1927 if (r < 0)
1928 continue;
1929
1930 if (shmat_syscall > 0) {
1931 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1932 1,
1933 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1934 if (r < 0)
1935 continue;
1936 }
1937
1938 r = seccomp_load(seccomp);
1939 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
1940 return r;
1941 if (r < 0)
1942 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1943 seccomp_arch_to_string(arch));
1944 loaded++;
1945 }
1946
1947 if (loaded == 0)
1948 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1949
1950 return loaded;
1951}
1952
1953int seccomp_restrict_archs(Set *archs) {
1954 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1955 int r;
1956 bool blocked_new = false;
1957
1958 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1959 * list.
1960 *
1961 * There are some qualifications. However the most important use is to stop processes from bypassing
1962 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1963 * in a non-native architecture. There are no holes in this use case, at least so far. */
1964
1965 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1966 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1967 * to run a program with the restrictions applied. */
1968 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1969 if (!seccomp)
1970 return -ENOMEM;
1971
1972 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1973 uint32_t arch = seccomp_local_archs[i];
1974
1975 /* See above comment, our "native" architecture is never blocked. */
1976 if (arch == seccomp_arch_native())
1977 continue;
1978
1979 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1980 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1981 continue;
1982
1983 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1984
1985 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1986 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1987 * The important thing is that you can block the old 32-bit x86 syscalls.
1988 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1989 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1990 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1991
1992 if (block) {
1993 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1994 blocked_new = true;
1995 } else {
1996 r = seccomp_arch_add(seccomp, arch);
1997 if (r < 0 && r != -EEXIST)
1998 return r;
1999 }
2000 }
2001
2002 /* All architectures that will be blocked by the seccomp program were
2003 * already blocked. */
2004 if (!blocked_new)
2005 return 0;
2006
2007 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
2008 if (r < 0)
2009 return r;
2010
2011 r = seccomp_load(seccomp);
2012 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2013 return r;
2014 if (r < 0)
2015 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
2016
2017 return 0;
2018}
2019
2020int parse_syscall_archs(char **l, Set **archs) {
2021 int r;
2022
2023 assert(l);
2024 assert(archs);
2025
2026 STRV_FOREACH(s, l) {
2027 uint32_t a;
2028
2029 r = seccomp_arch_from_string(*s, &a);
2030 if (r < 0)
2031 return -EINVAL;
2032
2033 r = set_ensure_put(archs, NULL, UINT32_TO_PTR(a + 1));
2034 if (r < 0)
2035 return -ENOMEM;
2036 }
2037
2038 return 0;
2039}
2040
2041int seccomp_filter_set_add_by_name(Hashmap *filter, bool add, const char *name) {
2042 assert(filter);
2043 assert(name);
2044
2045 if (name[0] == '@') {
2046 const SyscallFilterSet *more;
2047
2048 more = syscall_filter_set_find(name);
2049 if (!more)
2050 return -ENXIO;
2051
2052 return seccomp_filter_set_add(filter, add, more);
2053 }
2054
2055 int id = seccomp_syscall_resolve_name(name);
2056 if (id == __NR_SCMP_ERROR) {
2057 log_debug("System call %s is not known, ignoring.", name);
2058 return 0;
2059 }
2060
2061 if (add)
2062 return hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
2063
2064 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
2065 return 0;
2066}
2067
2068int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
2069 int r;
2070
2071 assert(filter);
2072 assert(set);
2073
2074 NULSTR_FOREACH(i, set->value) {
2075 r = seccomp_filter_set_add_by_name(filter, add, i);
2076 if (r < 0)
2077 return r;
2078 }
2079
2080 return 0;
2081}
2082
2083int seccomp_lock_personality(unsigned long personality) {
2084 uint32_t arch;
2085 int r;
2086
2087 if (personality >= PERSONALITY_INVALID)
2088 return -EINVAL;
2089
2090 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2091 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2092
2093 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2094 if (r < 0)
2095 return r;
2096
2097 r = seccomp_rule_add_exact(
2098 seccomp,
2099 SCMP_ACT_ERRNO(EPERM),
2100 SCMP_SYS(personality),
2101 1,
2102 SCMP_A0(SCMP_CMP_NE, personality));
2103 if (r < 0) {
2104 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m",
2105 seccomp_arch_to_string(arch));
2106 continue;
2107 }
2108
2109 r = seccomp_load(seccomp);
2110 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2111 return r;
2112 if (r < 0)
2113 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m",
2114 seccomp_arch_to_string(arch));
2115 }
2116
2117 return 0;
2118}
2119
2120int seccomp_protect_hostname(void) {
2121 uint32_t arch;
2122 int r;
2123
2124 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2125 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2126
2127 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2128 if (r < 0)
2129 return r;
2130
2131 r = seccomp_rule_add_exact(
2132 seccomp,
2133 SCMP_ACT_ERRNO(EPERM),
2134 SCMP_SYS(sethostname),
2135 0);
2136 if (r < 0) {
2137 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m",
2138 seccomp_arch_to_string(arch));
2139 continue;
2140 }
2141
2142 r = seccomp_rule_add_exact(
2143 seccomp,
2144 SCMP_ACT_ERRNO(EPERM),
2145 SCMP_SYS(setdomainname),
2146 0);
2147 if (r < 0) {
2148 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m",
2149 seccomp_arch_to_string(arch));
2150 continue;
2151 }
2152
2153 r = seccomp_load(seccomp);
2154 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2155 return r;
2156 if (r < 0)
2157 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m",
2158 seccomp_arch_to_string(arch));
2159 }
2160
2161 return 0;
2162}
2163
2164static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2165 /* Checks the mode_t parameter of the following system calls:
2166 *
2167 * → chmod() + fchmod() + fchmodat() + fchmodat2()
2168 * → open() + creat() + openat()
2169 * → mkdir() + mkdirat()
2170 * → mknod() + mknodat()
2171 *
2172 * Returns error if *everything* failed, and 0 otherwise.
2173 */
2174 int r;
2175 bool any = false;
2176
2177 r = seccomp_rule_add_exact(
2178 seccomp,
2179 SCMP_ACT_ERRNO(EPERM),
2180 SCMP_SYS(chmod),
2181 1,
2182 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2183 if (r < 0)
2184 log_debug_errno(r, "Failed to add filter for chmod: %m");
2185 else
2186 any = true;
2187
2188 r = seccomp_rule_add_exact(
2189 seccomp,
2190 SCMP_ACT_ERRNO(EPERM),
2191 SCMP_SYS(fchmod),
2192 1,
2193 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2194 if (r < 0)
2195 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2196 else
2197 any = true;
2198
2199 r = seccomp_rule_add_exact(
2200 seccomp,
2201 SCMP_ACT_ERRNO(EPERM),
2202 SCMP_SYS(fchmodat),
2203 1,
2204 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2205 if (r < 0)
2206 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2207 else
2208 any = true;
2209
2210#if defined(__SNR_fchmodat2)
2211 r = seccomp_rule_add_exact(
2212 seccomp,
2213 SCMP_ACT_ERRNO(EPERM),
2214 SCMP_SYS(fchmodat2),
2215 1,
2216 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2217#else
2218 /* It looks like this libseccomp does not know about fchmodat2().
2219 * Pretend the fchmodat2() system call is not supported at all,
2220 * regardless of the kernel version. */
2221 r = seccomp_rule_add_exact(
2222 seccomp,
2223 SCMP_ACT_ERRNO(ENOSYS),
2224 __NR_fchmodat2,
2225 0);
2226#endif
2227 if (r < 0)
2228 log_debug_errno(r, "Failed to add filter for fchmodat2: %m");
2229 else
2230 any = true;
2231
2232 r = seccomp_rule_add_exact(
2233 seccomp,
2234 SCMP_ACT_ERRNO(EPERM),
2235 SCMP_SYS(mkdir),
2236 1,
2237 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2238 if (r < 0)
2239 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2240 else
2241 any = true;
2242
2243 r = seccomp_rule_add_exact(
2244 seccomp,
2245 SCMP_ACT_ERRNO(EPERM),
2246 SCMP_SYS(mkdirat),
2247 1,
2248 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2249 if (r < 0)
2250 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2251 else
2252 any = true;
2253
2254 r = seccomp_rule_add_exact(
2255 seccomp,
2256 SCMP_ACT_ERRNO(EPERM),
2257 SCMP_SYS(mknod),
2258 1,
2259 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2260 if (r < 0)
2261 log_debug_errno(r, "Failed to add filter for mknod: %m");
2262 else
2263 any = true;
2264
2265 r = seccomp_rule_add_exact(
2266 seccomp,
2267 SCMP_ACT_ERRNO(EPERM),
2268 SCMP_SYS(mknodat),
2269 1,
2270 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2271 if (r < 0)
2272 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2273 else
2274 any = true;
2275
2276 r = seccomp_rule_add_exact(
2277 seccomp,
2278 SCMP_ACT_ERRNO(EPERM),
2279 SCMP_SYS(open),
2280 2,
2281 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2282 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2283 if (r < 0)
2284 log_debug_errno(r, "Failed to add filter for open: %m");
2285 else
2286 any = true;
2287
2288 r = seccomp_rule_add_exact(
2289 seccomp,
2290 SCMP_ACT_ERRNO(EPERM),
2291 SCMP_SYS(openat),
2292 2,
2293 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2294 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2295 if (r < 0)
2296 log_debug_errno(r, "Failed to add filter for openat: %m");
2297 else
2298 any = true;
2299
2300#if defined(__SNR_openat2)
2301 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2302 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2303 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2304 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2305 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2306 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2307 r = seccomp_rule_add_exact(
2308 seccomp,
2309 SCMP_ACT_ERRNO(ENOSYS),
2310 SCMP_SYS(openat2),
2311 0);
2312 if (r < 0)
2313 log_debug_errno(r, "Failed to add filter for openat2: %m");
2314 else
2315 any = true;
2316#endif
2317
2318 r = seccomp_rule_add_exact(
2319 seccomp,
2320 SCMP_ACT_ERRNO(EPERM),
2321 SCMP_SYS(creat),
2322 1,
2323 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2324 if (r < 0)
2325 log_debug_errno(r, "Failed to add filter for creat: %m");
2326 else
2327 any = true;
2328
2329 return any ? 0 : r;
2330}
2331
2332int seccomp_restrict_suid_sgid(void) {
2333 uint32_t arch;
2334 int r, k;
2335
2336 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2337 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2338
2339 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2340 if (r < 0)
2341 return r;
2342
2343 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2344 if (r < 0)
2345 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m",
2346 seccomp_arch_to_string(arch));
2347
2348 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2349 if (k < 0)
2350 log_debug_errno(k, "Failed to add sgid rule for architecture %s, ignoring: %m",
2351 seccomp_arch_to_string(arch));
2352
2353 if (r < 0 && k < 0)
2354 continue;
2355
2356 r = seccomp_load(seccomp);
2357 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2358 return r;
2359 if (r < 0)
2360 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m",
2361 seccomp_arch_to_string(arch));
2362 }
2363
2364 return 0;
2365}
2366
2367uint32_t scmp_act_kill_process(void) {
2368
2369 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2370 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2371 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2372 * for single-threaded apps does the right thing. */
2373
2374#ifdef SCMP_ACT_KILL_PROCESS
2375 if (seccomp_api_get() >= 3)
2376 return SCMP_ACT_KILL_PROCESS;
2377#endif
2378
2379 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2380}
2381
2382int parse_syscall_and_errno(const char *in, char **name, int *error) {
2383 _cleanup_free_ char *n = NULL;
2384 char *p;
2385 int e = -1;
2386
2387 assert(in);
2388 assert(name);
2389 assert(error);
2390
2391 /*
2392 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2393 * If errno is omitted, then error is set to -1.
2394 * Empty syscall name is not allowed.
2395 * Here, we do not check that the syscall name is valid or not.
2396 */
2397
2398 p = strchr(in, ':');
2399 if (p) {
2400 e = seccomp_parse_errno_or_action(p + 1);
2401 if (e < 0)
2402 return e;
2403
2404 n = strndup(in, p - in);
2405 } else
2406 n = strdup(in);
2407
2408 if (!n)
2409 return -ENOMEM;
2410
2411 if (isempty(n))
2412 return -EINVAL;
2413
2414 *error = e;
2415 *name = TAKE_PTR(n);
2416
2417 return 0;
2418}
2419
2420static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2421 bool any = false;
2422 int r;
2423
2424 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2425 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2426
2427 r = seccomp_rule_add_exact(
2428 seccomp,
2429 SCMP_ACT_ERRNO(EINVAL),
2430 SCMP_SYS(open),
2431 1,
2432 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2433 if (r < 0)
2434 log_debug_errno(r, "Failed to add filter for open: %m");
2435 else
2436 any = true;
2437
2438 r = seccomp_rule_add_exact(
2439 seccomp,
2440 SCMP_ACT_ERRNO(EINVAL),
2441 SCMP_SYS(openat),
2442 1,
2443 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2444 if (r < 0)
2445 log_debug_errno(r, "Failed to add filter for openat: %m");
2446 else
2447 any = true;
2448
2449#if defined(__SNR_openat2)
2450 /* The new openat2() system call can't be filtered sensibly, see above. */
2451 r = seccomp_rule_add_exact(
2452 seccomp,
2453 SCMP_ACT_ERRNO(ENOSYS),
2454 SCMP_SYS(openat2),
2455 0);
2456 if (r < 0)
2457 log_debug_errno(r, "Failed to add filter for openat2: %m");
2458 else
2459 any = true;
2460#endif
2461
2462 return any ? 0 : r;
2463}
2464
2465int seccomp_suppress_sync(void) {
2466 uint32_t arch;
2467 int r;
2468
2469 /* This behaves slightly differently from SystemCallFilter=~@sync:0, in that negative fds (which
2470 * we can determine to be invalid) are still refused with EBADF. See #34478.
2471 *
2472 * Additionally, O_SYNC/O_DSYNC are masked. */
2473
2474 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2475 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2476
2477 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2478 if (r < 0)
2479 return r;
2480
2481 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2482 int id;
2483
2484 id = seccomp_syscall_resolve_name(c);
2485 if (id == __NR_SCMP_ERROR) {
2486 log_debug("System call %s is not known, ignoring.", c);
2487 continue;
2488 }
2489
2490 if (STR_IN_SET(c, "fdatasync", "fsync", "sync_file_range", "sync_file_range2", "syncfs"))
2491 r = seccomp_rule_add_exact(
2492 seccomp,
2493 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2494 id,
2495 1,
2496 SCMP_A0(SCMP_CMP_LE, INT_MAX)); /* The rule handles arguments in unsigned. Hence, this
2497 * means non-negative fd matches the rule, and the negative
2498 * fd passed to the syscall (then it fails with EBADF). */
2499 else
2500 r = seccomp_rule_add_exact(
2501 seccomp,
2502 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2503 id,
2504 0);
2505 if (r < 0)
2506 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2507 }
2508
2509 (void) block_open_flag(seccomp, O_SYNC);
2510#if O_DSYNC != O_SYNC
2511 (void) block_open_flag(seccomp, O_DSYNC);
2512#endif
2513
2514 r = seccomp_load(seccomp);
2515 if (ERRNO_IS_NEG_SECCOMP_FATAL(r))
2516 return r;
2517 if (r < 0)
2518 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m",
2519 seccomp_arch_to_string(arch));
2520 }
2521
2522 return 0;
2523}
2524
2525#endif
2526
2527bool seccomp_errno_or_action_is_valid(int n) {
2528 return n == SECCOMP_ERROR_NUMBER_KILL || errno_is_valid(n);
2529}
2530
2531int seccomp_parse_errno_or_action(const char *p) {
2532 if (streq_ptr(p, "kill"))
2533 return SECCOMP_ERROR_NUMBER_KILL;
2534 return parse_errno(p);
2535}
2536
2537const char* seccomp_errno_or_action_to_string(int num) {
2538 if (num == SECCOMP_ERROR_NUMBER_KILL)
2539 return "kill";
2540 return errno_to_name(num);
2541}