]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #20522 from yuwata/cgroup-fix
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <seccomp.h>
7 #include <stddef.h>
8 #include <sys/mman.h>
9 #include <sys/prctl.h>
10 #include <sys/shm.h>
11 #include <sys/stat.h>
12
13 #include "af-list.h"
14 #include "alloc-util.h"
15 #include "env-util.h"
16 #include "errno-list.h"
17 #include "macro.h"
18 #include "nsflags.h"
19 #include "nulstr-util.h"
20 #include "process-util.h"
21 #include "seccomp-util.h"
22 #include "set.h"
23 #include "string-util.h"
24 #include "strv.h"
25
26 /* This array will be modified at runtime as seccomp_restrict_archs is called. */
/* List of seccomp architecture tokens for the target this file is compiled for, terminated by
 * SECCOMP_LOCAL_ARCH_END. At most one branch of the #if/#elif chain below contributes entries; if no
 * branch matches, the array holds only the terminator. */
27 uint32_t seccomp_local_archs[] = {
28
29 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
30
31 #if defined(__x86_64__) && defined(__ILP32__)
32 SCMP_ARCH_X86,
33 SCMP_ARCH_X86_64,
34 SCMP_ARCH_X32, /* native */
35 #elif defined(__x86_64__) && !defined(__ILP32__)
36 SCMP_ARCH_X86,
37 SCMP_ARCH_X32,
38 SCMP_ARCH_X86_64, /* native */
39 #elif defined(__i386__)
40 SCMP_ARCH_X86,
41 #elif defined(__aarch64__)
42 SCMP_ARCH_ARM,
43 SCMP_ARCH_AARCH64, /* native */
44 #elif defined(__arm__)
45 SCMP_ARCH_ARM,
/* The MIPS branches are keyed on both byte order and ABI (_MIPS_SIM); each one lists the
 * opposite-endian variant before the same-endian one. */
46 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
47 SCMP_ARCH_MIPSEL,
48 SCMP_ARCH_MIPS, /* native */
49 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL, /* native */
52 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
53 SCMP_ARCH_MIPSEL,
54 SCMP_ARCH_MIPS,
55 SCMP_ARCH_MIPSEL64N32,
56 SCMP_ARCH_MIPS64N32,
57 SCMP_ARCH_MIPSEL64,
58 SCMP_ARCH_MIPS64, /* native */
59 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
60 SCMP_ARCH_MIPS,
61 SCMP_ARCH_MIPSEL,
62 SCMP_ARCH_MIPS64N32,
63 SCMP_ARCH_MIPSEL64N32,
64 SCMP_ARCH_MIPS64,
65 SCMP_ARCH_MIPSEL64, /* native */
66 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
67 SCMP_ARCH_MIPSEL,
68 SCMP_ARCH_MIPS,
69 SCMP_ARCH_MIPSEL64,
70 SCMP_ARCH_MIPS64,
71 SCMP_ARCH_MIPSEL64N32,
72 SCMP_ARCH_MIPS64N32, /* native */
73 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
74 SCMP_ARCH_MIPS,
75 SCMP_ARCH_MIPSEL,
76 SCMP_ARCH_MIPS64,
77 SCMP_ARCH_MIPSEL64,
78 SCMP_ARCH_MIPS64N32,
79 SCMP_ARCH_MIPSEL64N32, /* native */
80 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64LE,
83 SCMP_ARCH_PPC64, /* native */
84 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
85 SCMP_ARCH_PPC,
86 SCMP_ARCH_PPC64,
87 SCMP_ARCH_PPC64LE, /* native */
88 #elif defined(__powerpc__)
89 SCMP_ARCH_PPC,
/* The riscv64 entry is only compiled in when the libseccomp headers define SCMP_ARCH_RISCV64. */
90 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
91 SCMP_ARCH_RISCV64,
92 #elif defined(__s390x__)
93 SCMP_ARCH_S390,
94 SCMP_ARCH_S390X, /* native */
95 #elif defined(__s390__)
96 SCMP_ARCH_S390,
97 #endif
/* Terminator; always present, regardless of which branch above was taken. */
98 SECCOMP_LOCAL_ARCH_END
99 };
100
101 const char* seccomp_arch_to_string(uint32_t c) {
102 /* Maintain order used in <seccomp.h>.
103 *
104 * Names used here should be the same as those used for ConditionArchitecture=,
105 * except for "subarchitectures" like x32. */
106
107 switch(c) {
108 case SCMP_ARCH_NATIVE:
109 return "native";
110 case SCMP_ARCH_X86:
111 return "x86";
112 case SCMP_ARCH_X86_64:
113 return "x86-64";
114 case SCMP_ARCH_X32:
115 return "x32";
116 case SCMP_ARCH_ARM:
117 return "arm";
118 case SCMP_ARCH_AARCH64:
119 return "arm64";
120 case SCMP_ARCH_MIPS:
121 return "mips";
122 case SCMP_ARCH_MIPS64:
123 return "mips64";
124 case SCMP_ARCH_MIPS64N32:
125 return "mips64-n32";
126 case SCMP_ARCH_MIPSEL:
127 return "mips-le";
128 case SCMP_ARCH_MIPSEL64:
129 return "mips64-le";
130 case SCMP_ARCH_MIPSEL64N32:
131 return "mips64-le-n32";
132 case SCMP_ARCH_PPC:
133 return "ppc";
134 case SCMP_ARCH_PPC64:
135 return "ppc64";
136 case SCMP_ARCH_PPC64LE:
137 return "ppc64-le";
138 #ifdef SCMP_ARCH_RISCV64
139 case SCMP_ARCH_RISCV64:
140 return "riscv64";
141 #endif
142 case SCMP_ARCH_S390:
143 return "s390";
144 case SCMP_ARCH_S390X:
145 return "s390x";
146 default:
147 return NULL;
148 }
149 }
150
151 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
152 if (!n)
153 return -EINVAL;
154
155 assert(ret);
156
157 if (streq(n, "native"))
158 *ret = SCMP_ARCH_NATIVE;
159 else if (streq(n, "x86"))
160 *ret = SCMP_ARCH_X86;
161 else if (streq(n, "x86-64"))
162 *ret = SCMP_ARCH_X86_64;
163 else if (streq(n, "x32"))
164 *ret = SCMP_ARCH_X32;
165 else if (streq(n, "arm"))
166 *ret = SCMP_ARCH_ARM;
167 else if (streq(n, "arm64"))
168 *ret = SCMP_ARCH_AARCH64;
169 else if (streq(n, "mips"))
170 *ret = SCMP_ARCH_MIPS;
171 else if (streq(n, "mips64"))
172 *ret = SCMP_ARCH_MIPS64;
173 else if (streq(n, "mips64-n32"))
174 *ret = SCMP_ARCH_MIPS64N32;
175 else if (streq(n, "mips-le"))
176 *ret = SCMP_ARCH_MIPSEL;
177 else if (streq(n, "mips64-le"))
178 *ret = SCMP_ARCH_MIPSEL64;
179 else if (streq(n, "mips64-le-n32"))
180 *ret = SCMP_ARCH_MIPSEL64N32;
181 else if (streq(n, "ppc"))
182 *ret = SCMP_ARCH_PPC;
183 else if (streq(n, "ppc64"))
184 *ret = SCMP_ARCH_PPC64;
185 else if (streq(n, "ppc64-le"))
186 *ret = SCMP_ARCH_PPC64LE;
187 #ifdef SCMP_ARCH_RISCV64
188 else if (streq(n, "riscv64"))
189 *ret = SCMP_ARCH_RISCV64;
190 #endif
191 else if (streq(n, "s390"))
192 *ret = SCMP_ARCH_S390;
193 else if (streq(n, "s390x"))
194 *ret = SCMP_ARCH_S390X;
195 else
196 return -EINVAL;
197
198 return 0;
199 }
200
201 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
202 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
203 int r;
204
205 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
206 * any others. Also, turns off the NNP fiddling. */
207
208 seccomp = seccomp_init(default_action);
209 if (!seccomp)
210 return -ENOMEM;
211
212 if (arch != SCMP_ARCH_NATIVE &&
213 arch != seccomp_arch_native()) {
214
215 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
216 if (r < 0)
217 return r;
218
219 r = seccomp_arch_add(seccomp, arch);
220 if (r < 0)
221 return r;
222
223 assert(seccomp_arch_exist(seccomp, arch) >= 0);
224 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
225 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
226 } else {
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
229 }
230
231 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
232 if (r < 0)
233 return r;
234
235 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
236 if (r < 0)
237 return r;
238
239 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
240 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
241 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
242 if (r < 0)
243 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
244 }
245 #endif
246
247 *ret = TAKE_PTR(seccomp);
248 return 0;
249 }
250
static bool is_basic_seccomp_available(void) {
        /* Basic seccomp support is considered present when the PR_GET_SECCOMP query does not fail */
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
254
static bool is_seccomp_filter_available(void) {
        /* Probe for filter mode by passing a NULL filter program: only the specific failure with
         * EFAULT counts as "available", any other outcome counts as "not available". */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
259
bool is_seccomp_available(void) {
        /* Tells whether seccomp filtering may be used. The answer is computed on the first call and
         * remembered in a static variable for all later calls.
         *
         * $SYSTEMD_SECCOMP set to a false value disables seccomp outright. If it is unset, true or
         * unparsable, the result depends on the two probes above. */

        static int cache = -1;
        int b;

        if (cache >= 0)
                return cache;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                cache = false;
                return cache;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cache = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cache;
}
280
/* Table of the predefined system call groups, indexed by the SYSCALL_FILTER_SET_* constants.
 *
 * For every entry:
 *   .name  is the group name; all names start with '@' (syscall_filter_set_find() rejects any
 *          lookup key that does not).
 *   .help  is a short human-readable description.
 *   .value is a sequence of NUL-terminated strings written as adjacent string literals; the code in
 *          this file walks it with NULSTR_FOREACH(). An element is either a system call name or,
 *          when it starts with '@', the name of another group in this table, which
 *          seccomp_add_syscall_filter_item() and seccomp_parse_syscall_filter() expand recursively. */
281 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
282 [SYSCALL_FILTER_SET_DEFAULT] = {
283 .name = "@default",
284 .help = "System calls that are always permitted",
285 .value =
286 "brk\0"
287 "cacheflush\0"
288 "clock_getres\0"
289 "clock_getres_time64\0"
290 "clock_gettime\0"
291 "clock_gettime64\0"
292 "clock_nanosleep\0"
293 "clock_nanosleep_time64\0"
294 "execve\0"
295 "exit\0"
296 "exit_group\0"
297 "futex\0"
298 "futex_time64\0"
299 "get_robust_list\0"
300 "get_thread_area\0"
301 "getegid\0"
302 "getegid32\0"
303 "geteuid\0"
304 "geteuid32\0"
305 "getgid\0"
306 "getgid32\0"
307 "getgroups\0"
308 "getgroups32\0"
309 "getpgid\0"
310 "getpgrp\0"
311 "getpid\0"
312 "getppid\0"
313 "getrandom\0"
314 "getresgid\0"
315 "getresgid32\0"
316 "getresuid\0"
317 "getresuid32\0"
318 "getrlimit\0" /* make sure processes can query stack size and such */
319 "getsid\0"
320 "gettid\0"
321 "gettimeofday\0"
322 "getuid\0"
323 "getuid32\0"
324 "membarrier\0"
325 "mmap\0"
326 "mmap2\0"
327 "mprotect\0"
328 "munmap\0"
329 "nanosleep\0"
330 "pause\0"
331 "prlimit64\0"
332 "restart_syscall\0"
333 "rseq\0"
334 "rt_sigreturn\0"
335 "sched_getaffinity\0"
336 "sched_yield\0"
337 "set_robust_list\0"
338 "set_thread_area\0"
339 "set_tid_address\0"
340 "set_tls\0"
341 "sigreturn\0"
342 "time\0"
343 "ugetrlimit\0"
344 },
345 [SYSCALL_FILTER_SET_AIO] = {
346 .name = "@aio",
347 .help = "Asynchronous IO",
348 .value =
349 "io_cancel\0"
350 "io_destroy\0"
351 "io_getevents\0"
352 "io_pgetevents\0"
353 "io_pgetevents_time64\0"
354 "io_setup\0"
355 "io_submit\0"
356 "io_uring_enter\0"
357 "io_uring_register\0"
358 "io_uring_setup\0"
359 },
360 [SYSCALL_FILTER_SET_BASIC_IO] = {
361 .name = "@basic-io",
362 .help = "Basic IO",
363 .value =
364 "_llseek\0"
365 "close\0"
366 "close_range\0"
367 "dup\0"
368 "dup2\0"
369 "dup3\0"
370 "lseek\0"
371 "pread64\0"
372 "preadv\0"
373 "preadv2\0"
374 "pwrite64\0"
375 "pwritev\0"
376 "pwritev2\0"
377 "read\0"
378 "readv\0"
379 "write\0"
380 "writev\0"
381 },
382 [SYSCALL_FILTER_SET_CHOWN] = {
383 .name = "@chown",
384 .help = "Change ownership of files and directories",
385 .value =
386 "chown\0"
387 "chown32\0"
388 "fchown\0"
389 "fchown32\0"
390 "fchownat\0"
391 "lchown\0"
392 "lchown32\0"
393 },
394 [SYSCALL_FILTER_SET_CLOCK] = {
395 .name = "@clock",
396 .help = "Change the system time",
397 .value =
398 "adjtimex\0"
399 "clock_adjtime\0"
400 "clock_adjtime64\0"
401 "clock_settime\0"
402 "clock_settime64\0"
403 "settimeofday\0"
404 },
405 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
406 .name = "@cpu-emulation",
407 .help = "System calls for CPU emulation functionality",
408 .value =
409 "modify_ldt\0"
410 "subpage_prot\0"
411 "switch_endian\0"
412 "vm86\0"
413 "vm86old\0"
414 },
415 [SYSCALL_FILTER_SET_DEBUG] = {
416 .name = "@debug",
417 .help = "Debugging, performance monitoring and tracing functionality",
418 .value =
419 "lookup_dcookie\0"
420 "perf_event_open\0"
421 "pidfd_getfd\0"
422 "ptrace\0"
423 "rtas\0"
/* The following entry is only part of the group when compiling for s390 or s390x. */
424 #if defined __s390__ || defined __s390x__
425 "s390_runtime_instr\0"
426 #endif
427 "sys_debug_setcontext\0"
428 },
429 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
430 .name = "@file-system",
431 .help = "File system operations",
432 .value =
433 "access\0"
434 "chdir\0"
435 "chmod\0"
436 "close\0"
437 "creat\0"
438 "faccessat\0"
439 "faccessat2\0"
440 "fallocate\0"
441 "fchdir\0"
442 "fchmod\0"
443 "fchmodat\0"
444 "fcntl\0"
445 "fcntl64\0"
446 "fgetxattr\0"
447 "flistxattr\0"
448 "fremovexattr\0"
449 "fsetxattr\0"
450 "fstat\0"
451 "fstat64\0"
452 "fstatat64\0"
453 "fstatfs\0"
454 "fstatfs64\0"
455 "ftruncate\0"
456 "ftruncate64\0"
457 "futimesat\0"
458 "getcwd\0"
459 "getdents\0"
460 "getdents64\0"
461 "getxattr\0"
462 "inotify_add_watch\0"
463 "inotify_init\0"
464 "inotify_init1\0"
465 "inotify_rm_watch\0"
466 "lgetxattr\0"
467 "link\0"
468 "linkat\0"
469 "listxattr\0"
470 "llistxattr\0"
471 "lremovexattr\0"
472 "lsetxattr\0"
473 "lstat\0"
474 "lstat64\0"
475 "mkdir\0"
476 "mkdirat\0"
477 "mknod\0"
478 "mknodat\0"
479 "newfstatat\0"
480 "oldfstat\0"
481 "oldlstat\0"
482 "oldstat\0"
483 "open\0"
484 "openat\0"
485 "openat2\0"
486 "readlink\0"
487 "readlinkat\0"
488 "removexattr\0"
489 "rename\0"
490 "renameat\0"
491 "renameat2\0"
492 "rmdir\0"
493 "setxattr\0"
494 "stat\0"
495 "stat64\0"
496 "statfs\0"
497 "statfs64\0"
498 "statx\0"
499 "symlink\0"
500 "symlinkat\0"
501 "truncate\0"
502 "truncate64\0"
503 "unlink\0"
504 "unlinkat\0"
505 "utime\0"
506 "utimensat\0"
507 "utimensat_time64\0"
508 "utimes\0"
509 },
510 [SYSCALL_FILTER_SET_IO_EVENT] = {
511 .name = "@io-event",
512 .help = "Event loop system calls",
513 .value =
514 "_newselect\0"
515 "epoll_create\0"
516 "epoll_create1\0"
517 "epoll_ctl\0"
518 "epoll_ctl_old\0"
519 "epoll_pwait\0"
520 "epoll_pwait2\0"
521 "epoll_wait\0"
522 "epoll_wait_old\0"
523 "eventfd\0"
524 "eventfd2\0"
525 "poll\0"
526 "ppoll\0"
527 "ppoll_time64\0"
528 "pselect6\0"
529 "pselect6_time64\0"
530 "select\0"
531 },
532 [SYSCALL_FILTER_SET_IPC] = {
533 .name = "@ipc",
534 .help = "SysV IPC, POSIX Message Queues or other IPC",
535 .value =
536 "ipc\0"
537 "memfd_create\0"
538 "mq_getsetattr\0"
539 "mq_notify\0"
540 "mq_open\0"
541 "mq_timedreceive\0"
542 "mq_timedreceive_time64\0"
543 "mq_timedsend\0"
544 "mq_timedsend_time64\0"
545 "mq_unlink\0"
546 "msgctl\0"
547 "msgget\0"
548 "msgrcv\0"
549 "msgsnd\0"
550 "pipe\0"
551 "pipe2\0"
552 "process_madvise\0"
553 "process_vm_readv\0"
554 "process_vm_writev\0"
555 "semctl\0"
556 "semget\0"
557 "semop\0"
558 "semtimedop\0"
559 "semtimedop_time64\0"
560 "shmat\0"
561 "shmctl\0"
562 "shmdt\0"
563 "shmget\0"
564 },
565 [SYSCALL_FILTER_SET_KEYRING] = {
566 .name = "@keyring",
567 .help = "Kernel keyring access",
568 .value =
569 "add_key\0"
570 "keyctl\0"
571 "request_key\0"
572 },
573 [SYSCALL_FILTER_SET_MEMLOCK] = {
574 .name = "@memlock",
575 .help = "Memory locking control",
576 .value =
577 "mlock\0"
578 "mlock2\0"
579 "mlockall\0"
580 "munlock\0"
581 "munlockall\0"
582 },
583 [SYSCALL_FILTER_SET_MODULE] = {
584 .name = "@module",
585 .help = "Loading and unloading of kernel modules",
586 .value =
587 "delete_module\0"
588 "finit_module\0"
589 "init_module\0"
590 },
591 [SYSCALL_FILTER_SET_MOUNT] = {
592 .name = "@mount",
593 .help = "Mounting and unmounting of file systems",
594 .value =
595 "chroot\0"
596 "fsconfig\0"
597 "fsmount\0"
598 "fsopen\0"
599 "fspick\0"
600 "mount\0"
601 "mount_setattr\0"
602 "move_mount\0"
603 "open_tree\0"
604 "pivot_root\0"
605 "umount\0"
606 "umount2\0"
607 },
608 [SYSCALL_FILTER_SET_NETWORK_IO] = {
609 .name = "@network-io",
610 .help = "Network or Unix socket IO, should not be needed if not network facing",
611 .value =
612 "accept\0"
613 "accept4\0"
614 "bind\0"
615 "connect\0"
616 "getpeername\0"
617 "getsockname\0"
618 "getsockopt\0"
619 "listen\0"
620 "recv\0"
621 "recvfrom\0"
622 "recvmmsg\0"
623 "recvmmsg_time64\0"
624 "recvmsg\0"
625 "send\0"
626 "sendmmsg\0"
627 "sendmsg\0"
628 "sendto\0"
629 "setsockopt\0"
630 "shutdown\0"
631 "socket\0"
632 "socketcall\0"
633 "socketpair\0"
634 },
635 [SYSCALL_FILTER_SET_OBSOLETE] = {
636 /* some unknown even to libseccomp */
637 .name = "@obsolete",
638 .help = "Unusual, obsolete or unimplemented system calls",
639 .value =
640 "_sysctl\0"
641 "afs_syscall\0"
642 "bdflush\0"
643 "break\0"
644 "create_module\0"
645 "ftime\0"
646 "get_kernel_syms\0"
647 "getpmsg\0"
648 "gtty\0"
649 "idle\0"
650 "lock\0"
651 "mpx\0"
652 "prof\0"
653 "profil\0"
654 "putpmsg\0"
655 "query_module\0"
656 "security\0"
657 "sgetmask\0"
658 "ssetmask\0"
659 "stime\0"
660 "stty\0"
661 "sysfs\0"
662 "tuxcall\0"
663 "ulimit\0"
664 "uselib\0"
665 "ustat\0"
666 "vserver\0"
667 },
668 [SYSCALL_FILTER_SET_PKEY] = {
669 .name = "@pkey",
670 .help = "System calls used for memory protection keys",
671 .value =
672 "pkey_alloc\0"
673 "pkey_free\0"
674 "pkey_mprotect\0"
675 },
676 [SYSCALL_FILTER_SET_PRIVILEGED] = {
677 .name = "@privileged",
678 .help = "All system calls which need super-user capabilities",
679 .value =
/* This group pulls in other groups of this table (the '@'-prefixed elements) in addition to the
 * individual system calls listed after them. */
680 "@chown\0"
681 "@clock\0"
682 "@module\0"
683 "@raw-io\0"
684 "@reboot\0"
685 "@swap\0"
686 "_sysctl\0"
687 "acct\0"
688 "bpf\0"
689 "capset\0"
690 "chroot\0"
691 "fanotify_init\0"
692 "fanotify_mark\0"
693 "nfsservctl\0"
694 "open_by_handle_at\0"
695 "pivot_root\0"
696 "quotactl\0"
697 "setdomainname\0"
698 "setfsuid\0"
699 "setfsuid32\0"
700 "setgroups\0"
701 "setgroups32\0"
702 "sethostname\0"
703 "setresuid\0"
704 "setresuid32\0"
705 "setreuid\0"
706 "setreuid32\0"
707 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
708 "setuid32\0"
709 "vhangup\0"
710 },
711 [SYSCALL_FILTER_SET_PROCESS] = {
712 .name = "@process",
713 .help = "Process control, execution, namespacing operations",
714 .value =
715 "arch_prctl\0"
716 "capget\0" /* Able to query arbitrary processes */
717 "clone\0"
718 "clone3\0"
719 "execveat\0"
720 "fork\0"
721 "getrusage\0"
722 "kill\0"
723 "pidfd_open\0"
724 "pidfd_send_signal\0"
725 "prctl\0"
726 "rt_sigqueueinfo\0"
727 "rt_tgsigqueueinfo\0"
728 "setns\0"
729 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
730 "tgkill\0"
731 "times\0"
732 "tkill\0"
733 "unshare\0"
734 "vfork\0"
735 "wait4\0"
736 "waitid\0"
737 "waitpid\0"
738 },
739 [SYSCALL_FILTER_SET_RAW_IO] = {
740 .name = "@raw-io",
741 .help = "Raw I/O port access",
742 .value =
743 "ioperm\0"
744 "iopl\0"
745 "pciconfig_iobase\0"
746 "pciconfig_read\0"
747 "pciconfig_write\0"
/* The following entries are only part of the group when compiling for s390 or s390x. */
748 #if defined __s390__ || defined __s390x__
749 "s390_pci_mmio_read\0"
750 "s390_pci_mmio_write\0"
751 #endif
752 },
753 [SYSCALL_FILTER_SET_REBOOT] = {
754 .name = "@reboot",
755 .help = "Reboot and reboot preparation/kexec",
756 .value =
757 "kexec_file_load\0"
758 "kexec_load\0"
759 "reboot\0"
760 },
761 [SYSCALL_FILTER_SET_RESOURCES] = {
762 .name = "@resources",
763 .help = "Alter resource settings",
764 .value =
765 "ioprio_set\0"
766 "mbind\0"
767 "migrate_pages\0"
768 "move_pages\0"
769 "nice\0"
770 "sched_setaffinity\0"
771 "sched_setattr\0"
772 "sched_setparam\0"
773 "sched_setscheduler\0"
774 "set_mempolicy\0"
775 "setpriority\0"
776 "setrlimit\0"
777 },
778 [SYSCALL_FILTER_SET_SETUID] = {
779 .name = "@setuid",
780 .help = "Operations for changing user/group credentials",
781 .value =
782 "setgid\0"
783 "setgid32\0"
784 "setgroups\0"
785 "setgroups32\0"
786 "setregid\0"
787 "setregid32\0"
788 "setresgid\0"
789 "setresgid32\0"
790 "setresuid\0"
791 "setresuid32\0"
792 "setreuid\0"
793 "setreuid32\0"
794 "setuid\0"
795 "setuid32\0"
796 },
797 [SYSCALL_FILTER_SET_SIGNAL] = {
798 .name = "@signal",
799 .help = "Process signal handling",
800 .value =
801 "rt_sigaction\0"
802 "rt_sigpending\0"
803 "rt_sigprocmask\0"
804 "rt_sigsuspend\0"
805 "rt_sigtimedwait\0"
806 "rt_sigtimedwait_time64\0"
807 "sigaction\0"
808 "sigaltstack\0"
809 "signal\0"
810 "signalfd\0"
811 "signalfd4\0"
812 "sigpending\0"
813 "sigprocmask\0"
814 "sigsuspend\0"
815 },
816 [SYSCALL_FILTER_SET_SWAP] = {
817 .name = "@swap",
818 .help = "Enable/disable swap devices",
819 .value =
820 "swapoff\0"
821 "swapon\0"
822 },
823 [SYSCALL_FILTER_SET_SYNC] = {
824 .name = "@sync",
825 .help = "Synchronize files and memory to storage",
826 .value =
827 "fdatasync\0"
828 "fsync\0"
829 "msync\0"
830 "sync\0"
831 "sync_file_range\0"
832 "sync_file_range2\0"
833 "syncfs\0"
834 },
835 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
836 .name = "@system-service",
837 .help = "General system service operations",
838 .value =
/* Composite group: the '@'-prefixed elements reference other groups of this table; the plain
 * system call names follow. */
839 "@aio\0"
840 "@basic-io\0"
841 "@chown\0"
842 "@default\0"
843 "@file-system\0"
844 "@io-event\0"
845 "@ipc\0"
846 "@keyring\0"
847 "@memlock\0"
848 "@network-io\0"
849 "@process\0"
850 "@resources\0"
851 "@setuid\0"
852 "@signal\0"
853 "@sync\0"
854 "@timer\0"
855 "capget\0"
856 "capset\0"
857 "copy_file_range\0"
858 "fadvise64\0"
859 "fadvise64_64\0"
860 "flock\0"
861 "get_mempolicy\0"
862 "getcpu\0"
863 "getpriority\0"
864 "ioctl\0"
865 "ioprio_get\0"
866 "kcmp\0"
867 "madvise\0"
868 "mremap\0"
869 "name_to_handle_at\0"
870 "oldolduname\0"
871 "olduname\0"
872 "personality\0"
873 "readahead\0"
874 "readdir\0"
875 "remap_file_pages\0"
876 "sched_get_priority_max\0"
877 "sched_get_priority_min\0"
878 "sched_getattr\0"
879 "sched_getparam\0"
880 "sched_getscheduler\0"
881 "sched_rr_get_interval\0"
882 "sched_rr_get_interval_time64\0"
883 "sched_yield\0"
884 "sendfile\0"
885 "sendfile64\0"
886 "setfsgid\0"
887 "setfsgid32\0"
888 "setfsuid\0"
889 "setfsuid32\0"
890 "setpgid\0"
891 "setsid\0"
892 "splice\0"
893 "sysinfo\0"
894 "tee\0"
895 "umask\0"
896 "uname\0"
897 "userfaultfd\0"
898 "vmsplice\0"
899 },
900 [SYSCALL_FILTER_SET_TIMER] = {
901 .name = "@timer",
902 .help = "Schedule operations by time",
903 .value =
904 "alarm\0"
905 "getitimer\0"
906 "setitimer\0"
907 "timer_create\0"
908 "timer_delete\0"
909 "timer_getoverrun\0"
910 "timer_gettime\0"
911 "timer_gettime64\0"
912 "timer_settime\0"
913 "timer_settime64\0"
914 "timerfd_create\0"
915 "timerfd_gettime\0"
916 "timerfd_gettime64\0"
917 "timerfd_settime\0"
918 "timerfd_settime64\0"
919 "times\0"
920 },
921 [SYSCALL_FILTER_SET_KNOWN] = {
922 .name = "@known",
923 .help = "All known syscalls declared in the kernel",
924 .value =
/* The element list of this group is not spelled out here; it is whatever the included header
 * syscall-list.h expands to. */
925 #include "syscall-list.h"
926 },
927 };
928
929 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
930 if (isempty(name) || name[0] != '@')
931 return NULL;
932
933 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
934 if (streq(syscall_filter_sets[i].name, name))
935 return syscall_filter_sets + i;
936
937 return NULL;
938 }
939
940 static int add_syscall_filter_set(
941 scmp_filter_ctx seccomp,
942 const SyscallFilterSet *set,
943 uint32_t action,
944 char **exclude,
945 bool log_missing,
946 char ***added);
947
948 int seccomp_add_syscall_filter_item(
949 scmp_filter_ctx *seccomp,
950 const char *name,
951 uint32_t action,
952 char **exclude,
953 bool log_missing,
954 char ***added) {
955
956 assert(seccomp);
957 assert(name);
958
959 if (strv_contains(exclude, name))
960 return 0;
961
962 /* Any syscalls that are handled are added to the *added strv. The pointer
963 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
964
965 if (name[0] == '@') {
966 const SyscallFilterSet *other;
967
968 other = syscall_filter_set_find(name);
969 if (!other)
970 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
971 "Filter set %s is not known!",
972 name);
973
974 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
975
976 } else {
977 int id, r;
978
979 id = seccomp_syscall_resolve_name(name);
980 if (id == __NR_SCMP_ERROR) {
981 if (log_missing)
982 log_debug("System call %s is not known, ignoring.", name);
983 return 0;
984 }
985
986 r = seccomp_rule_add_exact(seccomp, action, id, 0);
987 if (r < 0) {
988 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
989 bool ignore = r == -EDOM;
990
991 if (!ignore || log_missing)
992 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
993 name, id, ignore ? ", ignoring" : "");
994 if (!ignore)
995 return r;
996 }
997
998 if (added) {
999 r = strv_extend(added, name);
1000 if (r < 0)
1001 return r;
1002 }
1003
1004 return 0;
1005 }
1006 }
1007
1008 static int add_syscall_filter_set(
1009 scmp_filter_ctx seccomp,
1010 const SyscallFilterSet *set,
1011 uint32_t action,
1012 char **exclude,
1013 bool log_missing,
1014 char ***added) {
1015
1016 const char *sys;
1017 int r;
1018
1019 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1020
1021 assert(seccomp);
1022 assert(set);
1023
1024 NULSTR_FOREACH(sys, set->value) {
1025 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1026 if (r < 0)
1027 return r;
1028 }
1029
1030 return 0;
1031 }
1032
1033 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1034 uint32_t arch;
1035 int r;
1036
1037 assert(set);
1038
1039 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1040 * each local arch. */
1041
1042 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1043 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1044
1045 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1046
1047 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1048 if (r < 0)
1049 return r;
1050
1051 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1052 if (r < 0)
1053 return log_debug_errno(r, "Failed to add filter set: %m");
1054
1055 r = seccomp_load(seccomp);
1056 if (ERRNO_IS_SECCOMP_FATAL(r))
1057 return r;
1058 if (r < 0)
1059 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1060 }
1061
1062 return 0;
1063 }
1064
1065 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1066 uint32_t arch;
1067 int r;
1068
1069 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1070 * of a SyscallFilterSet* table. */
1071
1072 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1073 return 0;
1074
1075 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1076 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1077 void *syscall_id, *val;
1078
1079 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1080
1081 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1082 if (r < 0)
1083 return r;
1084
1085 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1086 uint32_t a = action;
1087 int id = PTR_TO_INT(syscall_id) - 1;
1088 int error = PTR_TO_INT(val);
1089
1090 if (error == SECCOMP_ERROR_NUMBER_KILL)
1091 a = scmp_act_kill_process();
1092 #ifdef SCMP_ACT_LOG
1093 else if (action == SCMP_ACT_LOG)
1094 a = SCMP_ACT_LOG;
1095 #endif
1096 else if (error >= 0)
1097 a = SCMP_ACT_ERRNO(error);
1098
1099 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1100 if (r < 0) {
1101 /* If the system call is not known on this architecture, then that's
1102 * fine, let's ignore it */
1103 _cleanup_free_ char *n = NULL;
1104 bool ignore;
1105
1106 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1107 ignore = r == -EDOM;
1108 if (!ignore || log_missing)
1109 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1110 strna(n), id, ignore ? ", ignoring" : "");
1111 if (!ignore)
1112 return r;
1113 }
1114 }
1115
1116 r = seccomp_load(seccomp);
1117 if (ERRNO_IS_SECCOMP_FATAL(r))
1118 return r;
1119 if (r < 0)
1120 log_debug_errno(r, "Failed to install systemc call filter for architecture %s, skipping: %m",
1121 seccomp_arch_to_string(arch));
1122 }
1123
1124 return 0;
1125 }
1126
1127 int seccomp_parse_syscall_filter(
1128 const char *name,
1129 int errno_num,
1130 Hashmap *filter,
1131 SeccompParseFlags flags,
1132 const char *unit,
1133 const char *filename,
1134 unsigned line) {
1135
1136 int r;
1137
1138 assert(name);
1139 assert(filter);
1140
1141 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1142 return -EINVAL;
1143
1144 if (name[0] == '@') {
1145 const SyscallFilterSet *set;
1146 const char *i;
1147
1148 set = syscall_filter_set_find(name);
1149 if (!set) {
1150 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1151 return -EINVAL;
1152
1153 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1154 "Unknown system call group, ignoring: %s", name);
1155 return 0;
1156 }
1157
1158 NULSTR_FOREACH(i, set->value) {
1159 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1160 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1161 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1162 * about them. */
1163 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1164 if (r < 0)
1165 return r;
1166 }
1167 } else {
1168 int id;
1169
1170 id = seccomp_syscall_resolve_name(name);
1171 if (id == __NR_SCMP_ERROR) {
1172 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1173 return -EINVAL;
1174
1175 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1176 "Failed to parse system call, ignoring: %s", name);
1177 return 0;
1178 }
1179
1180 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1181 * it from the list. The entries in allow-list with non-negative error value will be
1182 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1183 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1184 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1185 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1186 if (r < 0)
1187 switch (r) {
1188 case -ENOMEM:
1189 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1190 case -EEXIST:
1191 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1192 break;
1193 default:
1194 return r;
1195 }
1196 } else
1197 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1198 }
1199
1200 return 0;
1201 }
1202
1203 int seccomp_restrict_namespaces(unsigned long retain) {
1204 uint32_t arch;
1205 int r;
1206
1207 if (DEBUG_LOGGING) {
1208 _cleanup_free_ char *s = NULL;
1209
1210 (void) namespace_flags_to_string(retain, &s);
1211 log_debug("Restricting namespace to: %s.", strna(s));
1212 }
1213
1214 /* NOOP? */
1215 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1216 return 0;
1217
1218 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1219 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1220
1221 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1222
1223 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1224 if (r < 0)
1225 return r;
1226
1227 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1228 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1229 * altogether. */
1230 r = seccomp_rule_add_exact(
1231 seccomp,
1232 SCMP_ACT_ERRNO(EPERM),
1233 SCMP_SYS(setns),
1234 0);
1235 else
1236 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1237 * special invocation with a zero flags argument, right here. */
1238 r = seccomp_rule_add_exact(
1239 seccomp,
1240 SCMP_ACT_ERRNO(EPERM),
1241 SCMP_SYS(setns),
1242 1,
1243 SCMP_A1(SCMP_CMP_EQ, 0));
1244 if (r < 0) {
1245 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1246 continue;
1247 }
1248
1249 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1250 unsigned long f;
1251
1252 f = namespace_flag_map[i].flag;
1253 if (FLAGS_SET(retain, f)) {
1254 log_debug("Permitting %s.", namespace_flag_map[i].name);
1255 continue;
1256 }
1257
1258 log_debug("Blocking %s.", namespace_flag_map[i].name);
1259
1260 r = seccomp_rule_add_exact(
1261 seccomp,
1262 SCMP_ACT_ERRNO(EPERM),
1263 SCMP_SYS(unshare),
1264 1,
1265 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1266 if (r < 0) {
1267 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1268 break;
1269 }
1270
1271 /* On s390/s390x the first two parameters to clone are switched */
1272 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1273 r = seccomp_rule_add_exact(
1274 seccomp,
1275 SCMP_ACT_ERRNO(EPERM),
1276 SCMP_SYS(clone),
1277 1,
1278 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1279 else
1280 r = seccomp_rule_add_exact(
1281 seccomp,
1282 SCMP_ACT_ERRNO(EPERM),
1283 SCMP_SYS(clone),
1284 1,
1285 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1286 if (r < 0) {
1287 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1288 break;
1289 }
1290
1291 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1292 r = seccomp_rule_add_exact(
1293 seccomp,
1294 SCMP_ACT_ERRNO(EPERM),
1295 SCMP_SYS(setns),
1296 1,
1297 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1298 if (r < 0) {
1299 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1300 break;
1301 }
1302 }
1303 }
1304 if (r < 0)
1305 continue;
1306
1307 r = seccomp_load(seccomp);
1308 if (ERRNO_IS_SECCOMP_FATAL(r))
1309 return r;
1310 if (r < 0)
1311 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1312 }
1313
1314 return 0;
1315 }
1316
1317 int seccomp_protect_sysctl(void) {
1318 uint32_t arch;
1319 int r;
1320
1321 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1322 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1323
1324 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1325
1326 if (IN_SET(arch,
1327 SCMP_ARCH_AARCH64,
1328 #ifdef SCMP_ARCH_RISCV64
1329 SCMP_ARCH_RISCV64,
1330 #endif
1331 SCMP_ARCH_X32
1332 ))
1333 /* No _sysctl syscall */
1334 continue;
1335
1336 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1337 if (r < 0)
1338 return r;
1339
1340 r = seccomp_rule_add_exact(
1341 seccomp,
1342 SCMP_ACT_ERRNO(EPERM),
1343 SCMP_SYS(_sysctl),
1344 0);
1345 if (r < 0) {
1346 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1347 continue;
1348 }
1349
1350 r = seccomp_load(seccomp);
1351 if (ERRNO_IS_SECCOMP_FATAL(r))
1352 return r;
1353 if (r < 0)
1354 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1355 }
1356
1357 return 0;
1358 }
1359
1360 int seccomp_protect_syslog(void) {
1361 uint32_t arch;
1362 int r;
1363
1364 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1365 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1366
1367 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1368 if (r < 0)
1369 return r;
1370
1371 r = seccomp_rule_add_exact(
1372 seccomp,
1373 SCMP_ACT_ERRNO(EPERM),
1374 SCMP_SYS(syslog),
1375 0);
1376
1377 if (r < 0) {
1378 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1379 continue;
1380 }
1381
1382 r = seccomp_load(seccomp);
1383 if (ERRNO_IS_SECCOMP_FATAL(r))
1384 return r;
1385 if (r < 0)
1386 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1387 }
1388
1389 return 0;
1390 }
1391
1392 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1393 uint32_t arch;
1394 int r;
1395
1396 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1397 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1398 bool supported;
1399
1400 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1401
1402 switch (arch) {
1403
1404 case SCMP_ARCH_X86_64:
1405 case SCMP_ARCH_X32:
1406 case SCMP_ARCH_ARM:
1407 case SCMP_ARCH_AARCH64:
1408 case SCMP_ARCH_MIPSEL64N32:
1409 case SCMP_ARCH_MIPS64N32:
1410 case SCMP_ARCH_MIPSEL64:
1411 case SCMP_ARCH_MIPS64:
1412 #ifdef SCMP_ARCH_RISCV64
1413 case SCMP_ARCH_RISCV64:
1414 #endif
1415 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1416 supported = true;
1417 break;
1418
1419 case SCMP_ARCH_S390:
1420 case SCMP_ARCH_S390X:
1421 case SCMP_ARCH_X86:
1422 case SCMP_ARCH_MIPSEL:
1423 case SCMP_ARCH_MIPS:
1424 case SCMP_ARCH_PPC:
1425 case SCMP_ARCH_PPC64:
1426 case SCMP_ARCH_PPC64LE:
1427 default:
1428 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1429 * don't know */
1430 supported = false;
1431 break;
1432 }
1433
1434 if (!supported)
1435 continue;
1436
1437 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1438 if (r < 0)
1439 return r;
1440
1441 if (allow_list) {
1442 int first = 0, last = 0;
1443 void *afp;
1444
1445 /* If this is an allow list, we first block the address families that are out of
1446 * range and then everything that is not in the set. First, we find the lowest and
1447 * highest address family in the set. */
1448
1449 SET_FOREACH(afp, address_families) {
1450 int af = PTR_TO_INT(afp);
1451
1452 if (af <= 0 || af >= af_max())
1453 continue;
1454
1455 if (first == 0 || af < first)
1456 first = af;
1457
1458 if (last == 0 || af > last)
1459 last = af;
1460 }
1461
1462 assert((first == 0) == (last == 0));
1463
1464 if (first == 0) {
1465
1466 /* No entries in the valid range, block everything */
1467 r = seccomp_rule_add_exact(
1468 seccomp,
1469 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1470 SCMP_SYS(socket),
1471 0);
1472 if (r < 0) {
1473 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1474 continue;
1475 }
1476
1477 } else {
1478
1479 /* Block everything below the first entry */
1480 r = seccomp_rule_add_exact(
1481 seccomp,
1482 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1483 SCMP_SYS(socket),
1484 1,
1485 SCMP_A0(SCMP_CMP_LT, first));
1486 if (r < 0) {
1487 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1488 continue;
1489 }
1490
1491 /* Block everything above the last entry */
1492 r = seccomp_rule_add_exact(
1493 seccomp,
1494 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1495 SCMP_SYS(socket),
1496 1,
1497 SCMP_A0(SCMP_CMP_GT, last));
1498 if (r < 0) {
1499 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1500 continue;
1501 }
1502
1503 /* Block everything between the first and last entry */
1504 for (int af = 1; af < af_max(); af++) {
1505
1506 if (set_contains(address_families, INT_TO_PTR(af)))
1507 continue;
1508
1509 r = seccomp_rule_add_exact(
1510 seccomp,
1511 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1512 SCMP_SYS(socket),
1513 1,
1514 SCMP_A0(SCMP_CMP_EQ, af));
1515 if (r < 0)
1516 break;
1517 }
1518 if (r < 0) {
1519 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1520 continue;
1521 }
1522 }
1523
1524 } else {
1525 void *af;
1526
1527 /* If this is a deny list, then generate one rule for each address family that are
1528 * then combined in OR checks. */
1529
1530 SET_FOREACH(af, address_families) {
1531 r = seccomp_rule_add_exact(
1532 seccomp,
1533 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1534 SCMP_SYS(socket),
1535 1,
1536 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1537 if (r < 0)
1538 break;
1539 }
1540 if (r < 0) {
1541 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1542 continue;
1543 }
1544 }
1545
1546 r = seccomp_load(seccomp);
1547 if (ERRNO_IS_SECCOMP_FATAL(r))
1548 return r;
1549 if (r < 0)
1550 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1551 }
1552
1553 return 0;
1554 }
1555
1556 int seccomp_restrict_realtime(void) {
1557 static const int permitted_policies[] = {
1558 SCHED_OTHER,
1559 SCHED_BATCH,
1560 SCHED_IDLE,
1561 };
1562
1563 int r, max_policy = 0;
1564 uint32_t arch;
1565 unsigned i;
1566
1567 /* Determine the highest policy constant we want to allow */
1568 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1569 if (permitted_policies[i] > max_policy)
1570 max_policy = permitted_policies[i];
1571
1572 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1573 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1574 int p;
1575
1576 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1577
1578 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1579 if (r < 0)
1580 return r;
1581
1582 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1583 * allow list. */
1584 for (p = 0; p < max_policy; p++) {
1585 bool good = false;
1586
1587 /* Check if this is in the allow list. */
1588 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1589 if (permitted_policies[i] == p) {
1590 good = true;
1591 break;
1592 }
1593
1594 if (good)
1595 continue;
1596
1597 /* Deny this policy */
1598 r = seccomp_rule_add_exact(
1599 seccomp,
1600 SCMP_ACT_ERRNO(EPERM),
1601 SCMP_SYS(sched_setscheduler),
1602 1,
1603 SCMP_A1(SCMP_CMP_EQ, p));
1604 if (r < 0) {
1605 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1606 continue;
1607 }
1608 }
1609
1610 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1611 * are unsigned here, hence no need no check for < 0 values. */
1612 r = seccomp_rule_add_exact(
1613 seccomp,
1614 SCMP_ACT_ERRNO(EPERM),
1615 SCMP_SYS(sched_setscheduler),
1616 1,
1617 SCMP_A1(SCMP_CMP_GT, max_policy));
1618 if (r < 0) {
1619 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1620 continue;
1621 }
1622
1623 r = seccomp_load(seccomp);
1624 if (ERRNO_IS_SECCOMP_FATAL(r))
1625 return r;
1626 if (r < 0)
1627 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1628 }
1629
1630 return 0;
1631 }
1632
1633 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1634 uint32_t arch,
1635 int nr,
1636 unsigned arg_cnt,
1637 const struct scmp_arg_cmp arg) {
1638 int r;
1639
1640 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1641 if (r < 0) {
1642 _cleanup_free_ char *n = NULL;
1643
1644 n = seccomp_syscall_resolve_num_arch(arch, nr);
1645 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1646 strna(n),
1647 seccomp_arch_to_string(arch));
1648 }
1649
1650 return r;
1651 }
1652
1653 /* For known architectures, check that syscalls are indeed defined or not. */
1654 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1655 assert_cc(SCMP_SYS(shmget) > 0);
1656 assert_cc(SCMP_SYS(shmat) > 0);
1657 assert_cc(SCMP_SYS(shmdt) > 0);
1658 #endif
1659
1660 int seccomp_memory_deny_write_execute(void) {
1661 uint32_t arch;
1662 unsigned loaded = 0;
1663
1664 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1665 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1666 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1667
1668 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1669
1670 switch (arch) {
1671
1672 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1673 * We ignore that here, which means there's still a way to get writable/executable
1674 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1675
1676 case SCMP_ARCH_X86:
1677 case SCMP_ARCH_S390:
1678 filter_syscall = SCMP_SYS(mmap2);
1679 block_syscall = SCMP_SYS(mmap);
1680 /* shmat multiplexed, see above */
1681 break;
1682
1683 case SCMP_ARCH_PPC:
1684 case SCMP_ARCH_PPC64:
1685 case SCMP_ARCH_PPC64LE:
1686 case SCMP_ARCH_S390X:
1687 filter_syscall = SCMP_SYS(mmap);
1688 /* shmat multiplexed, see above */
1689 break;
1690
1691 case SCMP_ARCH_ARM:
1692 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1693 shmat_syscall = SCMP_SYS(shmat);
1694 break;
1695
1696 case SCMP_ARCH_X86_64:
1697 case SCMP_ARCH_X32:
1698 case SCMP_ARCH_AARCH64:
1699 #ifdef SCMP_ARCH_RISCV64
1700 case SCMP_ARCH_RISCV64:
1701 #endif
1702 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1703 shmat_syscall = SCMP_SYS(shmat);
1704 break;
1705
1706 /* Please add more definitions here, if you port systemd to other architectures! */
1707
1708 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1709 #warning "Consider adding the right mmap() syscall definitions here!"
1710 #endif
1711 }
1712
1713 /* Can't filter mmap() on this arch, then skip it */
1714 if (filter_syscall == 0)
1715 continue;
1716
1717 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1718 if (r < 0)
1719 return r;
1720
1721 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1722 1,
1723 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1724 if (r < 0)
1725 continue;
1726
1727 if (block_syscall != 0) {
1728 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1729 if (r < 0)
1730 continue;
1731 }
1732
1733 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1734 1,
1735 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1736 if (r < 0)
1737 continue;
1738
1739 #ifdef __NR_pkey_mprotect
1740 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1741 1,
1742 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1743 if (r < 0)
1744 continue;
1745 #endif
1746
1747 if (shmat_syscall > 0) {
1748 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1749 1,
1750 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1751 if (r < 0)
1752 continue;
1753 }
1754
1755 r = seccomp_load(seccomp);
1756 if (ERRNO_IS_SECCOMP_FATAL(r))
1757 return r;
1758 if (r < 0)
1759 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1760 seccomp_arch_to_string(arch));
1761 loaded++;
1762 }
1763
1764 if (loaded == 0)
1765 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1766
1767 return loaded;
1768 }
1769
1770 int seccomp_restrict_archs(Set *archs) {
1771 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1772 int r;
1773 bool blocked_new = false;
1774
1775 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1776 * list.
1777 *
1778 * There are some qualifications. However the most important use is to stop processes from bypassing
1779 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1780 * in a non-native architecture. There are no holes in this use case, at least so far. */
1781
1782 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1783 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1784 * to run a program with the restrictions applied. */
1785 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1786 if (!seccomp)
1787 return -ENOMEM;
1788
1789 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1790 uint32_t arch = seccomp_local_archs[i];
1791
1792 /* See above comment, our "native" architecture is never blocked. */
1793 if (arch == seccomp_arch_native())
1794 continue;
1795
1796 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1797 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1798 continue;
1799
1800 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1801
1802 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1803 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1804 * The important thing is that you can block the old 32-bit x86 syscalls.
1805 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1806 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1807 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1808
1809 if (block) {
1810 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1811 blocked_new = true;
1812 } else {
1813 r = seccomp_arch_add(seccomp, arch);
1814 if (r < 0 && r != -EEXIST)
1815 return r;
1816 }
1817 }
1818
1819 /* All architectures that will be blocked by the seccomp program were
1820 * already blocked. */
1821 if (!blocked_new)
1822 return 0;
1823
1824 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1825 if (r < 0)
1826 return r;
1827
1828 r = seccomp_load(seccomp);
1829 if (ERRNO_IS_SECCOMP_FATAL(r))
1830 return r;
1831 if (r < 0)
1832 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1833
1834 return 0;
1835 }
1836
1837 int parse_syscall_archs(char **l, Set **ret_archs) {
1838 _cleanup_set_free_ Set *archs = NULL;
1839 char **s;
1840 int r;
1841
1842 assert(l);
1843 assert(ret_archs);
1844
1845 STRV_FOREACH(s, l) {
1846 uint32_t a;
1847
1848 r = seccomp_arch_from_string(*s, &a);
1849 if (r < 0)
1850 return -EINVAL;
1851
1852 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1853 if (r < 0)
1854 return -ENOMEM;
1855 }
1856
1857 *ret_archs = TAKE_PTR(archs);
1858 return 0;
1859 }
1860
1861 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1862 const char *i;
1863 int r;
1864
1865 assert(set);
1866
1867 NULSTR_FOREACH(i, set->value) {
1868
1869 if (i[0] == '@') {
1870 const SyscallFilterSet *more;
1871
1872 more = syscall_filter_set_find(i);
1873 if (!more)
1874 return -ENXIO;
1875
1876 r = seccomp_filter_set_add(filter, add, more);
1877 if (r < 0)
1878 return r;
1879 } else {
1880 int id;
1881
1882 id = seccomp_syscall_resolve_name(i);
1883 if (id == __NR_SCMP_ERROR) {
1884 log_debug("Couldn't resolve system call, ignoring: %s", i);
1885 continue;
1886 }
1887
1888 if (add) {
1889 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1890 if (r < 0)
1891 return r;
1892 } else
1893 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1894 }
1895 }
1896
1897 return 0;
1898 }
1899
1900 int seccomp_lock_personality(unsigned long personality) {
1901 uint32_t arch;
1902 int r;
1903
1904 if (personality >= PERSONALITY_INVALID)
1905 return -EINVAL;
1906
1907 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1908 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1909
1910 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1911 if (r < 0)
1912 return r;
1913
1914 r = seccomp_rule_add_exact(
1915 seccomp,
1916 SCMP_ACT_ERRNO(EPERM),
1917 SCMP_SYS(personality),
1918 1,
1919 SCMP_A0(SCMP_CMP_NE, personality));
1920 if (r < 0) {
1921 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1922 continue;
1923 }
1924
1925 r = seccomp_load(seccomp);
1926 if (ERRNO_IS_SECCOMP_FATAL(r))
1927 return r;
1928 if (r < 0)
1929 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1930 }
1931
1932 return 0;
1933 }
1934
1935 int seccomp_protect_hostname(void) {
1936 uint32_t arch;
1937 int r;
1938
1939 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1940 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1941
1942 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1943 if (r < 0)
1944 return r;
1945
1946 r = seccomp_rule_add_exact(
1947 seccomp,
1948 SCMP_ACT_ERRNO(EPERM),
1949 SCMP_SYS(sethostname),
1950 0);
1951 if (r < 0) {
1952 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1953 continue;
1954 }
1955
1956 r = seccomp_rule_add_exact(
1957 seccomp,
1958 SCMP_ACT_ERRNO(EPERM),
1959 SCMP_SYS(setdomainname),
1960 0);
1961 if (r < 0) {
1962 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1963 continue;
1964 }
1965
1966 r = seccomp_load(seccomp);
1967 if (ERRNO_IS_SECCOMP_FATAL(r))
1968 return r;
1969 if (r < 0)
1970 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1971 }
1972
1973 return 0;
1974 }
1975
1976 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1977 /* Checks the mode_t parameter of the following system calls:
1978 *
1979 * → chmod() + fchmod() + fchmodat()
1980 * → open() + creat() + openat()
1981 * → mkdir() + mkdirat()
1982 * → mknod() + mknodat()
1983 *
1984 * Returns error if *everything* failed, and 0 otherwise.
1985 */
1986 int r;
1987 bool any = false;
1988
1989 r = seccomp_rule_add_exact(
1990 seccomp,
1991 SCMP_ACT_ERRNO(EPERM),
1992 SCMP_SYS(chmod),
1993 1,
1994 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1995 if (r < 0)
1996 log_debug_errno(r, "Failed to add filter for chmod: %m");
1997 else
1998 any = true;
1999
2000 r = seccomp_rule_add_exact(
2001 seccomp,
2002 SCMP_ACT_ERRNO(EPERM),
2003 SCMP_SYS(fchmod),
2004 1,
2005 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2006 if (r < 0)
2007 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2008 else
2009 any = true;
2010
2011 r = seccomp_rule_add_exact(
2012 seccomp,
2013 SCMP_ACT_ERRNO(EPERM),
2014 SCMP_SYS(fchmodat),
2015 1,
2016 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2017 if (r < 0)
2018 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2019 else
2020 any = true;
2021
2022 r = seccomp_rule_add_exact(
2023 seccomp,
2024 SCMP_ACT_ERRNO(EPERM),
2025 SCMP_SYS(mkdir),
2026 1,
2027 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2028 if (r < 0)
2029 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2030 else
2031 any = true;
2032
2033 r = seccomp_rule_add_exact(
2034 seccomp,
2035 SCMP_ACT_ERRNO(EPERM),
2036 SCMP_SYS(mkdirat),
2037 1,
2038 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2039 if (r < 0)
2040 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2041 else
2042 any = true;
2043
2044 r = seccomp_rule_add_exact(
2045 seccomp,
2046 SCMP_ACT_ERRNO(EPERM),
2047 SCMP_SYS(mknod),
2048 1,
2049 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2050 if (r < 0)
2051 log_debug_errno(r, "Failed to add filter for mknod: %m");
2052 else
2053 any = true;
2054
2055 r = seccomp_rule_add_exact(
2056 seccomp,
2057 SCMP_ACT_ERRNO(EPERM),
2058 SCMP_SYS(mknodat),
2059 1,
2060 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2061 if (r < 0)
2062 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2063 else
2064 any = true;
2065
2066 #if SCMP_SYS(open) > 0
2067 r = seccomp_rule_add_exact(
2068 seccomp,
2069 SCMP_ACT_ERRNO(EPERM),
2070 SCMP_SYS(open),
2071 2,
2072 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2073 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2074 if (r < 0)
2075 log_debug_errno(r, "Failed to add filter for open: %m");
2076 else
2077 any = true;
2078 #endif
2079
2080 r = seccomp_rule_add_exact(
2081 seccomp,
2082 SCMP_ACT_ERRNO(EPERM),
2083 SCMP_SYS(openat),
2084 2,
2085 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2086 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2087 if (r < 0)
2088 log_debug_errno(r, "Failed to add filter for openat: %m");
2089 else
2090 any = true;
2091
2092 #if defined(__SNR_openat2)
2093 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2094 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2095 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2096 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2097 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2098 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2099 r = seccomp_rule_add_exact(
2100 seccomp,
2101 SCMP_ACT_ERRNO(ENOSYS),
2102 SCMP_SYS(openat2),
2103 0);
2104 if (r < 0)
2105 log_debug_errno(r, "Failed to add filter for openat2: %m");
2106 else
2107 any = true;
2108 #endif
2109
2110 r = seccomp_rule_add_exact(
2111 seccomp,
2112 SCMP_ACT_ERRNO(EPERM),
2113 SCMP_SYS(creat),
2114 1,
2115 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2116 if (r < 0)
2117 log_debug_errno(r, "Failed to add filter for creat: %m");
2118 else
2119 any = true;
2120
2121 return any ? 0 : r;
2122 }
2123
2124 int seccomp_restrict_suid_sgid(void) {
2125 uint32_t arch;
2126 int r, k;
2127
2128 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2129 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2130
2131 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2132 if (r < 0)
2133 return r;
2134
2135 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2136 if (r < 0)
2137 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2138
2139 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2140 if (k < 0)
2141 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2142
2143 if (r < 0 && k < 0)
2144 continue;
2145
2146 r = seccomp_load(seccomp);
2147 if (ERRNO_IS_SECCOMP_FATAL(r))
2148 return r;
2149 if (r < 0)
2150 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2151 }
2152
2153 return 0;
2154 }
2155
2156 uint32_t scmp_act_kill_process(void) {
2157
2158 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2159 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2160 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2161 * for single-threaded apps does the right thing. */
2162
2163 #ifdef SCMP_ACT_KILL_PROCESS
2164 if (seccomp_api_get() >= 3)
2165 return SCMP_ACT_KILL_PROCESS;
2166 #endif
2167
2168 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2169 }
2170
2171 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2172 _cleanup_free_ char *n = NULL;
2173 char *p;
2174 int e = -1;
2175
2176 assert(in);
2177 assert(name);
2178 assert(error);
2179
2180 /*
2181 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2182 * If errno is omitted, then error is set to -1.
2183 * Empty syscall name is not allowed.
2184 * Here, we do not check that the syscall name is valid or not.
2185 */
2186
2187 p = strchr(in, ':');
2188 if (p) {
2189 e = seccomp_parse_errno_or_action(p + 1);
2190 if (e < 0)
2191 return e;
2192
2193 n = strndup(in, p - in);
2194 } else
2195 n = strdup(in);
2196
2197 if (!n)
2198 return -ENOMEM;
2199
2200 if (isempty(n))
2201 return -EINVAL;
2202
2203 *error = e;
2204 *name = TAKE_PTR(n);
2205
2206 return 0;
2207 }
2208
2209 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2210 bool any = false;
2211 int r;
2212
2213 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2214 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2215
2216 #if SCMP_SYS(open) > 0
2217 r = seccomp_rule_add_exact(
2218 seccomp,
2219 SCMP_ACT_ERRNO(EINVAL),
2220 SCMP_SYS(open),
2221 1,
2222 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2223 if (r < 0)
2224 log_debug_errno(r, "Failed to add filter for open: %m");
2225 else
2226 any = true;
2227 #endif
2228
2229 r = seccomp_rule_add_exact(
2230 seccomp,
2231 SCMP_ACT_ERRNO(EINVAL),
2232 SCMP_SYS(openat),
2233 1,
2234 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2235 if (r < 0)
2236 log_debug_errno(r, "Failed to add filter for openat: %m");
2237 else
2238 any = true;
2239
2240 #if defined(__SNR_openat2)
2241 /* The new openat2() system call can't be filtered sensibly, see above. */
2242 r = seccomp_rule_add_exact(
2243 seccomp,
2244 SCMP_ACT_ERRNO(ENOSYS),
2245 SCMP_SYS(openat2),
2246 0);
2247 if (r < 0)
2248 log_debug_errno(r, "Failed to add filter for openat2: %m");
2249 else
2250 any = true;
2251 #endif
2252
2253 return any ? 0 : r;
2254 }
2255
2256 int seccomp_suppress_sync(void) {
2257 uint32_t arch;
2258 int r;
2259
2260 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2261 * manageable, and also masks O_SYNC/O_DSYNC */
2262
2263 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2264 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2265 const char *c;
2266
2267 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2268 if (r < 0)
2269 return r;
2270
2271 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2272 int id;
2273
2274 id = seccomp_syscall_resolve_name(c);
2275 if (id == __NR_SCMP_ERROR) {
2276 log_debug("System call %s is not known, ignoring.", c);
2277 continue;
2278 }
2279
2280 r = seccomp_rule_add_exact(
2281 seccomp,
2282 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2283 id,
2284 0);
2285 if (r < 0)
2286 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2287 }
2288
2289 (void) block_open_flag(seccomp, O_SYNC);
2290 #if O_DSYNC != O_SYNC
2291 (void) block_open_flag(seccomp, O_DSYNC);
2292 #endif
2293
2294 r = seccomp_load(seccomp);
2295 if (ERRNO_IS_SECCOMP_FATAL(r))
2296 return r;
2297 if (r < 0)
2298 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2299 }
2300
2301 return 0;
2302 }