]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
32bd8aa73bd78f89b7595eaed2b2f6e44cc67cb4
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11
12 /* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "nsflags.h"
22 #include "nulstr-util.h"
23 #include "process-util.h"
24 #include "seccomp-util.h"
25 #include "set.h"
26 #include "string-util.h"
27 #include "strv.h"
28
29 /* This array will be modified at runtime as seccomp_restrict_archs is called. */
30 uint32_t seccomp_local_archs[] = {
31
32 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
33
34 #if defined(__x86_64__) && defined(__ILP32__)
35 SCMP_ARCH_X86,
36 SCMP_ARCH_X86_64,
37 SCMP_ARCH_X32, /* native */
38 #elif defined(__x86_64__) && !defined(__ILP32__)
39 SCMP_ARCH_X86,
40 SCMP_ARCH_X32,
41 SCMP_ARCH_X86_64, /* native */
42 #elif defined(__i386__)
43 SCMP_ARCH_X86,
44 #elif defined(__aarch64__)
45 SCMP_ARCH_ARM,
46 SCMP_ARCH_AARCH64, /* native */
47 #elif defined(__arm__)
48 SCMP_ARCH_ARM,
49 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
50 SCMP_ARCH_MIPSEL,
51 SCMP_ARCH_MIPS, /* native */
52 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPSEL,
57 SCMP_ARCH_MIPS,
58 SCMP_ARCH_MIPSEL64N32,
59 SCMP_ARCH_MIPS64N32,
60 SCMP_ARCH_MIPSEL64,
61 SCMP_ARCH_MIPS64, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
63 SCMP_ARCH_MIPS,
64 SCMP_ARCH_MIPSEL,
65 SCMP_ARCH_MIPS64N32,
66 SCMP_ARCH_MIPSEL64N32,
67 SCMP_ARCH_MIPS64,
68 SCMP_ARCH_MIPSEL64, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPSEL,
71 SCMP_ARCH_MIPS,
72 SCMP_ARCH_MIPSEL64,
73 SCMP_ARCH_MIPS64,
74 SCMP_ARCH_MIPSEL64N32,
75 SCMP_ARCH_MIPS64N32, /* native */
76 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
77 SCMP_ARCH_MIPS,
78 SCMP_ARCH_MIPSEL,
79 SCMP_ARCH_MIPS64,
80 SCMP_ARCH_MIPSEL64,
81 SCMP_ARCH_MIPS64N32,
82 SCMP_ARCH_MIPSEL64N32, /* native */
83 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64LE,
86 SCMP_ARCH_PPC64, /* native */
87 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
88 SCMP_ARCH_PPC,
89 SCMP_ARCH_PPC64,
90 SCMP_ARCH_PPC64LE, /* native */
91 #elif defined(__powerpc__)
92 SCMP_ARCH_PPC,
93 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
94 SCMP_ARCH_RISCV64,
95 #elif defined(__s390x__)
96 SCMP_ARCH_S390,
97 SCMP_ARCH_S390X, /* native */
98 #elif defined(__s390__)
99 SCMP_ARCH_S390,
100 #endif
101 SECCOMP_LOCAL_ARCH_END
102 };
103
104 const char* seccomp_arch_to_string(uint32_t c) {
105 /* Maintain order used in <seccomp.h>.
106 *
107 * Names used here should be the same as those used for ConditionArchitecture=,
108 * except for "subarchitectures" like x32. */
109
110 switch(c) {
111 case SCMP_ARCH_NATIVE:
112 return "native";
113 case SCMP_ARCH_X86:
114 return "x86";
115 case SCMP_ARCH_X86_64:
116 return "x86-64";
117 case SCMP_ARCH_X32:
118 return "x32";
119 case SCMP_ARCH_ARM:
120 return "arm";
121 case SCMP_ARCH_AARCH64:
122 return "arm64";
123 case SCMP_ARCH_MIPS:
124 return "mips";
125 case SCMP_ARCH_MIPS64:
126 return "mips64";
127 case SCMP_ARCH_MIPS64N32:
128 return "mips64-n32";
129 case SCMP_ARCH_MIPSEL:
130 return "mips-le";
131 case SCMP_ARCH_MIPSEL64:
132 return "mips64-le";
133 case SCMP_ARCH_MIPSEL64N32:
134 return "mips64-le-n32";
135 case SCMP_ARCH_PPC:
136 return "ppc";
137 case SCMP_ARCH_PPC64:
138 return "ppc64";
139 case SCMP_ARCH_PPC64LE:
140 return "ppc64-le";
141 #ifdef SCMP_ARCH_RISCV64
142 case SCMP_ARCH_RISCV64:
143 return "riscv64";
144 #endif
145 case SCMP_ARCH_S390:
146 return "s390";
147 case SCMP_ARCH_S390X:
148 return "s390x";
149 default:
150 return NULL;
151 }
152 }
153
154 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
155 if (!n)
156 return -EINVAL;
157
158 assert(ret);
159
160 if (streq(n, "native"))
161 *ret = SCMP_ARCH_NATIVE;
162 else if (streq(n, "x86"))
163 *ret = SCMP_ARCH_X86;
164 else if (streq(n, "x86-64"))
165 *ret = SCMP_ARCH_X86_64;
166 else if (streq(n, "x32"))
167 *ret = SCMP_ARCH_X32;
168 else if (streq(n, "arm"))
169 *ret = SCMP_ARCH_ARM;
170 else if (streq(n, "arm64"))
171 *ret = SCMP_ARCH_AARCH64;
172 else if (streq(n, "mips"))
173 *ret = SCMP_ARCH_MIPS;
174 else if (streq(n, "mips64"))
175 *ret = SCMP_ARCH_MIPS64;
176 else if (streq(n, "mips64-n32"))
177 *ret = SCMP_ARCH_MIPS64N32;
178 else if (streq(n, "mips-le"))
179 *ret = SCMP_ARCH_MIPSEL;
180 else if (streq(n, "mips64-le"))
181 *ret = SCMP_ARCH_MIPSEL64;
182 else if (streq(n, "mips64-le-n32"))
183 *ret = SCMP_ARCH_MIPSEL64N32;
184 else if (streq(n, "ppc"))
185 *ret = SCMP_ARCH_PPC;
186 else if (streq(n, "ppc64"))
187 *ret = SCMP_ARCH_PPC64;
188 else if (streq(n, "ppc64-le"))
189 *ret = SCMP_ARCH_PPC64LE;
190 #ifdef SCMP_ARCH_RISCV64
191 else if (streq(n, "riscv64"))
192 *ret = SCMP_ARCH_RISCV64;
193 #endif
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
198 else
199 return -EINVAL;
200
201 return 0;
202 }
203
204 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
205 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
206 int r;
207
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
219 if (r < 0)
220 return r;
221
222 r = seccomp_arch_add(seccomp, arch);
223 if (r < 0)
224 return r;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
235 if (r < 0)
236 return r;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 return r;
241
242 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
243 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
244 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
245 if (r < 0)
246 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
247 }
248 #endif
249
250 *ret = TAKE_PTR(seccomp);
251 return 0;
252 }
253
/* Reports whether querying the current seccomp mode via prctl(PR_GET_SECCOMP) succeeds. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
257
/* Probes for seccomp filter mode by asking the kernel to install a NULL filter: the probe counts as
 * positive only if that request is refused with EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
262
bool is_seccomp_available(void) {
        /* Tells whether seccomp filtering may be used. The answer is computed on the first call and
         * cached in a static variable afterwards. Setting $SYSTEMD_SECCOMP to a false value turns
         * seccomp off without probing the kernel. */
        static int cached = -1;
        int env;

        if (cached >= 0)
                return cached;

        env = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (env == 0) {
                /* Explicitly disabled via the environment. */
                cached = false;
                return cached;
        }

        if (env < 0 && env != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(env, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached;
}
283
284 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
285 [SYSCALL_FILTER_SET_DEFAULT] = {
286 .name = "@default",
287 .help = "System calls that are always permitted",
288 .value =
289 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
290 "brk\0"
291 "cacheflush\0"
292 "clock_getres\0"
293 "clock_getres_time64\0"
294 "clock_gettime\0"
295 "clock_gettime64\0"
296 "clock_nanosleep\0"
297 "clock_nanosleep_time64\0"
298 "execve\0"
299 "exit\0"
300 "exit_group\0"
301 "futex\0"
302 "futex_time64\0"
303 "get_robust_list\0"
304 "get_thread_area\0"
305 "getegid\0"
306 "getegid32\0"
307 "geteuid\0"
308 "geteuid32\0"
309 "getgid\0"
310 "getgid32\0"
311 "getgroups\0"
312 "getgroups32\0"
313 "getpgid\0"
314 "getpgrp\0"
315 "getpid\0"
316 "getppid\0"
317 "getrandom\0"
318 "getresgid\0"
319 "getresgid32\0"
320 "getresuid\0"
321 "getresuid32\0"
322 "getrlimit\0" /* make sure processes can query stack size and such */
323 "getsid\0"
324 "gettid\0"
325 "gettimeofday\0"
326 "getuid\0"
327 "getuid32\0"
328 "membarrier\0"
329 "mmap\0"
330 "mmap2\0"
331 "mprotect\0"
332 "munmap\0"
333 "nanosleep\0"
334 "pause\0"
335 "prlimit64\0"
336 "restart_syscall\0"
337 "rseq\0"
338 "rt_sigreturn\0"
339 "sched_getaffinity\0"
340 "sched_yield\0"
341 "set_robust_list\0"
342 "set_thread_area\0"
343 "set_tid_address\0"
344 "set_tls\0"
345 "sigreturn\0"
346 "time\0"
347 "ugetrlimit\0"
348 },
349 [SYSCALL_FILTER_SET_AIO] = {
350 .name = "@aio",
351 .help = "Asynchronous IO",
352 .value =
353 "io_cancel\0"
354 "io_destroy\0"
355 "io_getevents\0"
356 "io_pgetevents\0"
357 "io_pgetevents_time64\0"
358 "io_setup\0"
359 "io_submit\0"
360 "io_uring_enter\0"
361 "io_uring_register\0"
362 "io_uring_setup\0"
363 },
364 [SYSCALL_FILTER_SET_BASIC_IO] = {
365 .name = "@basic-io",
366 .help = "Basic IO",
367 .value =
368 "_llseek\0"
369 "close\0"
370 "close_range\0"
371 "dup\0"
372 "dup2\0"
373 "dup3\0"
374 "lseek\0"
375 "pread64\0"
376 "preadv\0"
377 "preadv2\0"
378 "pwrite64\0"
379 "pwritev\0"
380 "pwritev2\0"
381 "read\0"
382 "readv\0"
383 "write\0"
384 "writev\0"
385 },
386 [SYSCALL_FILTER_SET_CHOWN] = {
387 .name = "@chown",
388 .help = "Change ownership of files and directories",
389 .value =
390 "chown\0"
391 "chown32\0"
392 "fchown\0"
393 "fchown32\0"
394 "fchownat\0"
395 "lchown\0"
396 "lchown32\0"
397 },
398 [SYSCALL_FILTER_SET_CLOCK] = {
399 .name = "@clock",
400 .help = "Change the system time",
401 .value =
402 "adjtimex\0"
403 "clock_adjtime\0"
404 "clock_adjtime64\0"
405 "clock_settime\0"
406 "clock_settime64\0"
407 "settimeofday\0"
408 },
409 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
410 .name = "@cpu-emulation",
411 .help = "System calls for CPU emulation functionality",
412 .value =
413 "modify_ldt\0"
414 "subpage_prot\0"
415 "switch_endian\0"
416 "vm86\0"
417 "vm86old\0"
418 },
419 [SYSCALL_FILTER_SET_DEBUG] = {
420 .name = "@debug",
421 .help = "Debugging, performance monitoring and tracing functionality",
422 .value =
423 "lookup_dcookie\0"
424 "perf_event_open\0"
425 "pidfd_getfd\0"
426 "ptrace\0"
427 "rtas\0"
428 #if defined __s390__ || defined __s390x__
429 "s390_runtime_instr\0"
430 #endif
431 "sys_debug_setcontext\0"
432 },
433 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
434 .name = "@file-system",
435 .help = "File system operations",
436 .value =
437 "access\0"
438 "chdir\0"
439 "chmod\0"
440 "close\0"
441 "creat\0"
442 "faccessat\0"
443 "faccessat2\0"
444 "fallocate\0"
445 "fchdir\0"
446 "fchmod\0"
447 "fchmodat\0"
448 "fcntl\0"
449 "fcntl64\0"
450 "fgetxattr\0"
451 "flistxattr\0"
452 "fremovexattr\0"
453 "fsetxattr\0"
454 "fstat\0"
455 "fstat64\0"
456 "fstatat64\0"
457 "fstatfs\0"
458 "fstatfs64\0"
459 "ftruncate\0"
460 "ftruncate64\0"
461 "futimesat\0"
462 "getcwd\0"
463 "getdents\0"
464 "getdents64\0"
465 "getxattr\0"
466 "inotify_add_watch\0"
467 "inotify_init\0"
468 "inotify_init1\0"
469 "inotify_rm_watch\0"
470 "lgetxattr\0"
471 "link\0"
472 "linkat\0"
473 "listxattr\0"
474 "llistxattr\0"
475 "lremovexattr\0"
476 "lsetxattr\0"
477 "lstat\0"
478 "lstat64\0"
479 "mkdir\0"
480 "mkdirat\0"
481 "mknod\0"
482 "mknodat\0"
483 "newfstatat\0"
484 "oldfstat\0"
485 "oldlstat\0"
486 "oldstat\0"
487 "open\0"
488 "openat\0"
489 "openat2\0"
490 "readlink\0"
491 "readlinkat\0"
492 "removexattr\0"
493 "rename\0"
494 "renameat\0"
495 "renameat2\0"
496 "rmdir\0"
497 "setxattr\0"
498 "stat\0"
499 "stat64\0"
500 "statfs\0"
501 "statfs64\0"
502 "statx\0"
503 "symlink\0"
504 "symlinkat\0"
505 "truncate\0"
506 "truncate64\0"
507 "unlink\0"
508 "unlinkat\0"
509 "utime\0"
510 "utimensat\0"
511 "utimensat_time64\0"
512 "utimes\0"
513 },
514 [SYSCALL_FILTER_SET_IO_EVENT] = {
515 .name = "@io-event",
516 .help = "Event loop system calls",
517 .value =
518 "_newselect\0"
519 "epoll_create\0"
520 "epoll_create1\0"
521 "epoll_ctl\0"
522 "epoll_ctl_old\0"
523 "epoll_pwait\0"
524 "epoll_pwait2\0"
525 "epoll_wait\0"
526 "epoll_wait_old\0"
527 "eventfd\0"
528 "eventfd2\0"
529 "poll\0"
530 "ppoll\0"
531 "ppoll_time64\0"
532 "pselect6\0"
533 "pselect6_time64\0"
534 "select\0"
535 },
536 [SYSCALL_FILTER_SET_IPC] = {
537 .name = "@ipc",
538 .help = "SysV IPC, POSIX Message Queues or other IPC",
539 .value =
540 "ipc\0"
541 "memfd_create\0"
542 "mq_getsetattr\0"
543 "mq_notify\0"
544 "mq_open\0"
545 "mq_timedreceive\0"
546 "mq_timedreceive_time64\0"
547 "mq_timedsend\0"
548 "mq_timedsend_time64\0"
549 "mq_unlink\0"
550 "msgctl\0"
551 "msgget\0"
552 "msgrcv\0"
553 "msgsnd\0"
554 "pipe\0"
555 "pipe2\0"
556 "process_madvise\0"
557 "process_vm_readv\0"
558 "process_vm_writev\0"
559 "semctl\0"
560 "semget\0"
561 "semop\0"
562 "semtimedop\0"
563 "semtimedop_time64\0"
564 "shmat\0"
565 "shmctl\0"
566 "shmdt\0"
567 "shmget\0"
568 },
569 [SYSCALL_FILTER_SET_KEYRING] = {
570 .name = "@keyring",
571 .help = "Kernel keyring access",
572 .value =
573 "add_key\0"
574 "keyctl\0"
575 "request_key\0"
576 },
577 [SYSCALL_FILTER_SET_MEMLOCK] = {
578 .name = "@memlock",
579 .help = "Memory locking control",
580 .value =
581 "mlock\0"
582 "mlock2\0"
583 "mlockall\0"
584 "munlock\0"
585 "munlockall\0"
586 },
587 [SYSCALL_FILTER_SET_MODULE] = {
588 .name = "@module",
589 .help = "Loading and unloading of kernel modules",
590 .value =
591 "delete_module\0"
592 "finit_module\0"
593 "init_module\0"
594 },
595 [SYSCALL_FILTER_SET_MOUNT] = {
596 .name = "@mount",
597 .help = "Mounting and unmounting of file systems",
598 .value =
599 "chroot\0"
600 "fsconfig\0"
601 "fsmount\0"
602 "fsopen\0"
603 "fspick\0"
604 "mount\0"
605 "mount_setattr\0"
606 "move_mount\0"
607 "open_tree\0"
608 "pivot_root\0"
609 "umount\0"
610 "umount2\0"
611 },
612 [SYSCALL_FILTER_SET_NETWORK_IO] = {
613 .name = "@network-io",
614 .help = "Network or Unix socket IO, should not be needed if not network facing",
615 .value =
616 "accept\0"
617 "accept4\0"
618 "bind\0"
619 "connect\0"
620 "getpeername\0"
621 "getsockname\0"
622 "getsockopt\0"
623 "listen\0"
624 "recv\0"
625 "recvfrom\0"
626 "recvmmsg\0"
627 "recvmmsg_time64\0"
628 "recvmsg\0"
629 "send\0"
630 "sendmmsg\0"
631 "sendmsg\0"
632 "sendto\0"
633 "setsockopt\0"
634 "shutdown\0"
635 "socket\0"
636 "socketcall\0"
637 "socketpair\0"
638 },
639 [SYSCALL_FILTER_SET_OBSOLETE] = {
640 /* some unknown even to libseccomp */
641 .name = "@obsolete",
642 .help = "Unusual, obsolete or unimplemented system calls",
643 .value =
644 "_sysctl\0"
645 "afs_syscall\0"
646 "bdflush\0"
647 "break\0"
648 "create_module\0"
649 "ftime\0"
650 "get_kernel_syms\0"
651 "getpmsg\0"
652 "gtty\0"
653 "idle\0"
654 "lock\0"
655 "mpx\0"
656 "prof\0"
657 "profil\0"
658 "putpmsg\0"
659 "query_module\0"
660 "security\0"
661 "sgetmask\0"
662 "ssetmask\0"
663 "stime\0"
664 "stty\0"
665 "sysfs\0"
666 "tuxcall\0"
667 "ulimit\0"
668 "uselib\0"
669 "ustat\0"
670 "vserver\0"
671 },
672 [SYSCALL_FILTER_SET_PKEY] = {
673 .name = "@pkey",
674 .help = "System calls used for memory protection keys",
675 .value =
676 "pkey_alloc\0"
677 "pkey_free\0"
678 "pkey_mprotect\0"
679 },
680 [SYSCALL_FILTER_SET_PRIVILEGED] = {
681 .name = "@privileged",
682 .help = "All system calls which need super-user capabilities",
683 .value =
684 "@chown\0"
685 "@clock\0"
686 "@module\0"
687 "@raw-io\0"
688 "@reboot\0"
689 "@swap\0"
690 "_sysctl\0"
691 "acct\0"
692 "bpf\0"
693 "capset\0"
694 "chroot\0"
695 "fanotify_init\0"
696 "fanotify_mark\0"
697 "nfsservctl\0"
698 "open_by_handle_at\0"
699 "pivot_root\0"
700 "quotactl\0"
701 "setdomainname\0"
702 "setfsuid\0"
703 "setfsuid32\0"
704 "setgroups\0"
705 "setgroups32\0"
706 "sethostname\0"
707 "setresuid\0"
708 "setresuid32\0"
709 "setreuid\0"
710 "setreuid32\0"
711 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
712 "setuid32\0"
713 "vhangup\0"
714 },
715 [SYSCALL_FILTER_SET_PROCESS] = {
716 .name = "@process",
717 .help = "Process control, execution, namespacing operations",
718 .value =
719 "capget\0" /* Able to query arbitrary processes */
720 "clone\0"
721 "clone3\0"
722 "execveat\0"
723 "fork\0"
724 "getrusage\0"
725 "kill\0"
726 "pidfd_open\0"
727 "pidfd_send_signal\0"
728 "prctl\0"
729 "rt_sigqueueinfo\0"
730 "rt_tgsigqueueinfo\0"
731 "setns\0"
732 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
733 "tgkill\0"
734 "times\0"
735 "tkill\0"
736 "unshare\0"
737 "vfork\0"
738 "wait4\0"
739 "waitid\0"
740 "waitpid\0"
741 },
742 [SYSCALL_FILTER_SET_RAW_IO] = {
743 .name = "@raw-io",
744 .help = "Raw I/O port access",
745 .value =
746 "ioperm\0"
747 "iopl\0"
748 "pciconfig_iobase\0"
749 "pciconfig_read\0"
750 "pciconfig_write\0"
751 #if defined __s390__ || defined __s390x__
752 "s390_pci_mmio_read\0"
753 "s390_pci_mmio_write\0"
754 #endif
755 },
756 [SYSCALL_FILTER_SET_REBOOT] = {
757 .name = "@reboot",
758 .help = "Reboot and reboot preparation/kexec",
759 .value =
760 "kexec_file_load\0"
761 "kexec_load\0"
762 "reboot\0"
763 },
764 [SYSCALL_FILTER_SET_RESOURCES] = {
765 .name = "@resources",
766 .help = "Alter resource settings",
767 .value =
768 "ioprio_set\0"
769 "mbind\0"
770 "migrate_pages\0"
771 "move_pages\0"
772 "nice\0"
773 "sched_setaffinity\0"
774 "sched_setattr\0"
775 "sched_setparam\0"
776 "sched_setscheduler\0"
777 "set_mempolicy\0"
778 "setpriority\0"
779 "setrlimit\0"
780 },
781 [SYSCALL_FILTER_SET_SETUID] = {
782 .name = "@setuid",
783 .help = "Operations for changing user/group credentials",
784 .value =
785 "setgid\0"
786 "setgid32\0"
787 "setgroups\0"
788 "setgroups32\0"
789 "setregid\0"
790 "setregid32\0"
791 "setresgid\0"
792 "setresgid32\0"
793 "setresuid\0"
794 "setresuid32\0"
795 "setreuid\0"
796 "setreuid32\0"
797 "setuid\0"
798 "setuid32\0"
799 },
800 [SYSCALL_FILTER_SET_SIGNAL] = {
801 .name = "@signal",
802 .help = "Process signal handling",
803 .value =
804 "rt_sigaction\0"
805 "rt_sigpending\0"
806 "rt_sigprocmask\0"
807 "rt_sigsuspend\0"
808 "rt_sigtimedwait\0"
809 "rt_sigtimedwait_time64\0"
810 "sigaction\0"
811 "sigaltstack\0"
812 "signal\0"
813 "signalfd\0"
814 "signalfd4\0"
815 "sigpending\0"
816 "sigprocmask\0"
817 "sigsuspend\0"
818 },
819 [SYSCALL_FILTER_SET_SWAP] = {
820 .name = "@swap",
821 .help = "Enable/disable swap devices",
822 .value =
823 "swapoff\0"
824 "swapon\0"
825 },
826 [SYSCALL_FILTER_SET_SYNC] = {
827 .name = "@sync",
828 .help = "Synchronize files and memory to storage",
829 .value =
830 "fdatasync\0"
831 "fsync\0"
832 "msync\0"
833 "sync\0"
834 "sync_file_range\0"
835 "sync_file_range2\0"
836 "syncfs\0"
837 },
838 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
839 .name = "@system-service",
840 .help = "General system service operations",
841 .value =
842 "@aio\0"
843 "@basic-io\0"
844 "@chown\0"
845 "@default\0"
846 "@file-system\0"
847 "@io-event\0"
848 "@ipc\0"
849 "@keyring\0"
850 "@memlock\0"
851 "@network-io\0"
852 "@process\0"
853 "@resources\0"
854 "@setuid\0"
855 "@signal\0"
856 "@sync\0"
857 "@timer\0"
858 "capget\0"
859 "capset\0"
860 "copy_file_range\0"
861 "fadvise64\0"
862 "fadvise64_64\0"
863 "flock\0"
864 "get_mempolicy\0"
865 "getcpu\0"
866 "getpriority\0"
867 "ioctl\0"
868 "ioprio_get\0"
869 "kcmp\0"
870 "madvise\0"
871 "mremap\0"
872 "name_to_handle_at\0"
873 "oldolduname\0"
874 "olduname\0"
875 "personality\0"
876 "readahead\0"
877 "readdir\0"
878 "remap_file_pages\0"
879 "sched_get_priority_max\0"
880 "sched_get_priority_min\0"
881 "sched_getattr\0"
882 "sched_getparam\0"
883 "sched_getscheduler\0"
884 "sched_rr_get_interval\0"
885 "sched_rr_get_interval_time64\0"
886 "sched_yield\0"
887 "sendfile\0"
888 "sendfile64\0"
889 "setfsgid\0"
890 "setfsgid32\0"
891 "setfsuid\0"
892 "setfsuid32\0"
893 "setpgid\0"
894 "setsid\0"
895 "splice\0"
896 "sysinfo\0"
897 "tee\0"
898 "umask\0"
899 "uname\0"
900 "userfaultfd\0"
901 "vmsplice\0"
902 },
903 [SYSCALL_FILTER_SET_TIMER] = {
904 .name = "@timer",
905 .help = "Schedule operations by time",
906 .value =
907 "alarm\0"
908 "getitimer\0"
909 "setitimer\0"
910 "timer_create\0"
911 "timer_delete\0"
912 "timer_getoverrun\0"
913 "timer_gettime\0"
914 "timer_gettime64\0"
915 "timer_settime\0"
916 "timer_settime64\0"
917 "timerfd_create\0"
918 "timerfd_gettime\0"
919 "timerfd_gettime64\0"
920 "timerfd_settime\0"
921 "timerfd_settime64\0"
922 "times\0"
923 },
924 [SYSCALL_FILTER_SET_KNOWN] = {
925 .name = "@known",
926 .help = "All known syscalls declared in the kernel",
927 .value =
928 #include "syscall-list.h"
929 },
930 };
931
932 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
933 if (isempty(name) || name[0] != '@')
934 return NULL;
935
936 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
937 if (streq(syscall_filter_sets[i].name, name))
938 return syscall_filter_sets + i;
939
940 return NULL;
941 }
942
943 static int add_syscall_filter_set(
944 scmp_filter_ctx seccomp,
945 const SyscallFilterSet *set,
946 uint32_t action,
947 char **exclude,
948 bool log_missing,
949 char ***added);
950
951 int seccomp_add_syscall_filter_item(
952 scmp_filter_ctx *seccomp,
953 const char *name,
954 uint32_t action,
955 char **exclude,
956 bool log_missing,
957 char ***added) {
958
959 assert(seccomp);
960 assert(name);
961
962 if (strv_contains(exclude, name))
963 return 0;
964
965 /* Any syscalls that are handled are added to the *added strv. The pointer
966 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
967
968 if (name[0] == '@') {
969 const SyscallFilterSet *other;
970
971 other = syscall_filter_set_find(name);
972 if (!other)
973 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
974 "Filter set %s is not known!",
975 name);
976
977 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
978
979 } else {
980 int id, r;
981
982 id = seccomp_syscall_resolve_name(name);
983 if (id == __NR_SCMP_ERROR) {
984 if (log_missing)
985 log_debug("System call %s is not known, ignoring.", name);
986 return 0;
987 }
988
989 r = seccomp_rule_add_exact(seccomp, action, id, 0);
990 if (r < 0) {
991 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
992 bool ignore = r == -EDOM;
993
994 if (!ignore || log_missing)
995 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
996 name, id, ignore ? ", ignoring" : "");
997 if (!ignore)
998 return r;
999 }
1000
1001 if (added) {
1002 r = strv_extend(added, name);
1003 if (r < 0)
1004 return r;
1005 }
1006
1007 return 0;
1008 }
1009 }
1010
1011 static int add_syscall_filter_set(
1012 scmp_filter_ctx seccomp,
1013 const SyscallFilterSet *set,
1014 uint32_t action,
1015 char **exclude,
1016 bool log_missing,
1017 char ***added) {
1018
1019 const char *sys;
1020 int r;
1021
1022 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1023
1024 assert(seccomp);
1025 assert(set);
1026
1027 NULSTR_FOREACH(sys, set->value) {
1028 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1029 if (r < 0)
1030 return r;
1031 }
1032
1033 return 0;
1034 }
1035
1036 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1037 uint32_t arch;
1038 int r;
1039
1040 assert(set);
1041
1042 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1043 * each local arch. */
1044
1045 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1046 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1047
1048 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1049
1050 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1051 if (r < 0)
1052 return r;
1053
1054 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1055 if (r < 0)
1056 return log_debug_errno(r, "Failed to add filter set: %m");
1057
1058 r = seccomp_load(seccomp);
1059 if (ERRNO_IS_SECCOMP_FATAL(r))
1060 return r;
1061 if (r < 0)
1062 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1063 }
1064
1065 return 0;
1066 }
1067
1068 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1069 uint32_t arch;
1070 int r;
1071
1072 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1073 * of a SyscallFilterSet* table. */
1074
1075 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1076 return 0;
1077
1078 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1079 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1080 void *syscall_id, *val;
1081
1082 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1083
1084 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1085 if (r < 0)
1086 return r;
1087
1088 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1089 uint32_t a = action;
1090 int id = PTR_TO_INT(syscall_id) - 1;
1091 int error = PTR_TO_INT(val);
1092
1093 if (error == SECCOMP_ERROR_NUMBER_KILL)
1094 a = scmp_act_kill_process();
1095 #ifdef SCMP_ACT_LOG
1096 else if (action == SCMP_ACT_LOG)
1097 a = SCMP_ACT_LOG;
1098 #endif
1099 else if (error >= 0)
1100 a = SCMP_ACT_ERRNO(error);
1101
1102 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1103 if (r < 0) {
1104 /* If the system call is not known on this architecture, then that's
1105 * fine, let's ignore it */
1106 _cleanup_free_ char *n = NULL;
1107 bool ignore;
1108
1109 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1110 ignore = r == -EDOM;
1111 if (!ignore || log_missing)
1112 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1113 strna(n), id, ignore ? ", ignoring" : "");
1114 if (!ignore)
1115 return r;
1116 }
1117 }
1118
1119 r = seccomp_load(seccomp);
1120 if (ERRNO_IS_SECCOMP_FATAL(r))
1121 return r;
1122 if (r < 0)
1123 log_debug_errno(r, "Failed to install systemc call filter for architecture %s, skipping: %m",
1124 seccomp_arch_to_string(arch));
1125 }
1126
1127 return 0;
1128 }
1129
1130 int seccomp_parse_syscall_filter(
1131 const char *name,
1132 int errno_num,
1133 Hashmap *filter,
1134 SeccompParseFlags flags,
1135 const char *unit,
1136 const char *filename,
1137 unsigned line) {
1138
1139 int r;
1140
1141 assert(name);
1142 assert(filter);
1143
1144 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1145 return -EINVAL;
1146
1147 if (name[0] == '@') {
1148 const SyscallFilterSet *set;
1149 const char *i;
1150
1151 set = syscall_filter_set_find(name);
1152 if (!set) {
1153 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1154 return -EINVAL;
1155
1156 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1157 "Unknown system call group, ignoring: %s", name);
1158 return 0;
1159 }
1160
1161 NULSTR_FOREACH(i, set->value) {
1162 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1163 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1164 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1165 * about them. */
1166 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1167 if (r < 0)
1168 return r;
1169 }
1170 } else {
1171 int id;
1172
1173 id = seccomp_syscall_resolve_name(name);
1174 if (id == __NR_SCMP_ERROR) {
1175 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1176 return -EINVAL;
1177
1178 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1179 "Failed to parse system call, ignoring: %s", name);
1180 return 0;
1181 }
1182
1183 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1184 * it from the list. The entries in allow-list with non-negative error value will be
1185 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1186 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1187 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1188 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1189 if (r < 0)
1190 switch (r) {
1191 case -ENOMEM:
1192 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1193 case -EEXIST:
1194 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1195 break;
1196 default:
1197 return r;
1198 }
1199 } else
1200 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1201 }
1202
1203 return 0;
1204 }
1205
1206 int seccomp_restrict_namespaces(unsigned long retain) {
1207 uint32_t arch;
1208 int r;
1209
1210 if (DEBUG_LOGGING) {
1211 _cleanup_free_ char *s = NULL;
1212
1213 (void) namespace_flags_to_string(retain, &s);
1214 log_debug("Restricting namespace to: %s.", strna(s));
1215 }
1216
1217 /* NOOP? */
1218 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1219 return 0;
1220
1221 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1222 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1223
1224 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1225
1226 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1227 if (r < 0)
1228 return r;
1229
1230 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1231 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1232 * altogether. */
1233 r = seccomp_rule_add_exact(
1234 seccomp,
1235 SCMP_ACT_ERRNO(EPERM),
1236 SCMP_SYS(setns),
1237 0);
1238 else
1239 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1240 * special invocation with a zero flags argument, right here. */
1241 r = seccomp_rule_add_exact(
1242 seccomp,
1243 SCMP_ACT_ERRNO(EPERM),
1244 SCMP_SYS(setns),
1245 1,
1246 SCMP_A1(SCMP_CMP_EQ, 0));
1247 if (r < 0) {
1248 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1249 continue;
1250 }
1251
1252 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1253 unsigned long f;
1254
1255 f = namespace_flag_map[i].flag;
1256 if (FLAGS_SET(retain, f)) {
1257 log_debug("Permitting %s.", namespace_flag_map[i].name);
1258 continue;
1259 }
1260
1261 log_debug("Blocking %s.", namespace_flag_map[i].name);
1262
1263 r = seccomp_rule_add_exact(
1264 seccomp,
1265 SCMP_ACT_ERRNO(EPERM),
1266 SCMP_SYS(unshare),
1267 1,
1268 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1269 if (r < 0) {
1270 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1271 break;
1272 }
1273
1274 /* On s390/s390x the first two parameters to clone are switched */
1275 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1276 r = seccomp_rule_add_exact(
1277 seccomp,
1278 SCMP_ACT_ERRNO(EPERM),
1279 SCMP_SYS(clone),
1280 1,
1281 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1282 else
1283 r = seccomp_rule_add_exact(
1284 seccomp,
1285 SCMP_ACT_ERRNO(EPERM),
1286 SCMP_SYS(clone),
1287 1,
1288 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1289 if (r < 0) {
1290 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1291 break;
1292 }
1293
1294 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1295 r = seccomp_rule_add_exact(
1296 seccomp,
1297 SCMP_ACT_ERRNO(EPERM),
1298 SCMP_SYS(setns),
1299 1,
1300 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1301 if (r < 0) {
1302 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1303 break;
1304 }
1305 }
1306 }
1307 if (r < 0)
1308 continue;
1309
1310 r = seccomp_load(seccomp);
1311 if (ERRNO_IS_SECCOMP_FATAL(r))
1312 return r;
1313 if (r < 0)
1314 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1315 }
1316
1317 return 0;
1318 }
1319
1320 int seccomp_protect_sysctl(void) {
1321 uint32_t arch;
1322 int r;
1323
1324 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1325 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1326
1327 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1328
1329 if (IN_SET(arch,
1330 SCMP_ARCH_AARCH64,
1331 #ifdef SCMP_ARCH_RISCV64
1332 SCMP_ARCH_RISCV64,
1333 #endif
1334 SCMP_ARCH_X32
1335 ))
1336 /* No _sysctl syscall */
1337 continue;
1338
1339 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1340 if (r < 0)
1341 return r;
1342
1343 r = seccomp_rule_add_exact(
1344 seccomp,
1345 SCMP_ACT_ERRNO(EPERM),
1346 SCMP_SYS(_sysctl),
1347 0);
1348 if (r < 0) {
1349 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1350 continue;
1351 }
1352
1353 r = seccomp_load(seccomp);
1354 if (ERRNO_IS_SECCOMP_FATAL(r))
1355 return r;
1356 if (r < 0)
1357 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1358 }
1359
1360 return 0;
1361 }
1362
1363 int seccomp_protect_syslog(void) {
1364 uint32_t arch;
1365 int r;
1366
1367 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1368 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1369
1370 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1371 if (r < 0)
1372 return r;
1373
1374 r = seccomp_rule_add_exact(
1375 seccomp,
1376 SCMP_ACT_ERRNO(EPERM),
1377 SCMP_SYS(syslog),
1378 0);
1379
1380 if (r < 0) {
1381 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1382 continue;
1383 }
1384
1385 r = seccomp_load(seccomp);
1386 if (ERRNO_IS_SECCOMP_FATAL(r))
1387 return r;
1388 if (r < 0)
1389 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1390 }
1391
1392 return 0;
1393 }
1394
1395 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1396 uint32_t arch;
1397 int r;
1398
1399 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1400 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1401 bool supported;
1402
1403 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1404
1405 switch (arch) {
1406
1407 case SCMP_ARCH_X86_64:
1408 case SCMP_ARCH_X32:
1409 case SCMP_ARCH_ARM:
1410 case SCMP_ARCH_AARCH64:
1411 case SCMP_ARCH_MIPSEL64N32:
1412 case SCMP_ARCH_MIPS64N32:
1413 case SCMP_ARCH_MIPSEL64:
1414 case SCMP_ARCH_MIPS64:
1415 #ifdef SCMP_ARCH_RISCV64
1416 case SCMP_ARCH_RISCV64:
1417 #endif
1418 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1419 supported = true;
1420 break;
1421
1422 case SCMP_ARCH_S390:
1423 case SCMP_ARCH_S390X:
1424 case SCMP_ARCH_X86:
1425 case SCMP_ARCH_MIPSEL:
1426 case SCMP_ARCH_MIPS:
1427 case SCMP_ARCH_PPC:
1428 case SCMP_ARCH_PPC64:
1429 case SCMP_ARCH_PPC64LE:
1430 default:
1431 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1432 * don't know */
1433 supported = false;
1434 break;
1435 }
1436
1437 if (!supported)
1438 continue;
1439
1440 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1441 if (r < 0)
1442 return r;
1443
1444 if (allow_list) {
1445 int first = 0, last = 0;
1446 void *afp;
1447
1448 /* If this is an allow list, we first block the address families that are out of
1449 * range and then everything that is not in the set. First, we find the lowest and
1450 * highest address family in the set. */
1451
1452 SET_FOREACH(afp, address_families) {
1453 int af = PTR_TO_INT(afp);
1454
1455 if (af <= 0 || af >= af_max())
1456 continue;
1457
1458 if (first == 0 || af < first)
1459 first = af;
1460
1461 if (last == 0 || af > last)
1462 last = af;
1463 }
1464
1465 assert((first == 0) == (last == 0));
1466
1467 if (first == 0) {
1468
1469 /* No entries in the valid range, block everything */
1470 r = seccomp_rule_add_exact(
1471 seccomp,
1472 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1473 SCMP_SYS(socket),
1474 0);
1475 if (r < 0) {
1476 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1477 continue;
1478 }
1479
1480 } else {
1481
1482 /* Block everything below the first entry */
1483 r = seccomp_rule_add_exact(
1484 seccomp,
1485 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1486 SCMP_SYS(socket),
1487 1,
1488 SCMP_A0(SCMP_CMP_LT, first));
1489 if (r < 0) {
1490 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1491 continue;
1492 }
1493
1494 /* Block everything above the last entry */
1495 r = seccomp_rule_add_exact(
1496 seccomp,
1497 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1498 SCMP_SYS(socket),
1499 1,
1500 SCMP_A0(SCMP_CMP_GT, last));
1501 if (r < 0) {
1502 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1503 continue;
1504 }
1505
1506 /* Block everything between the first and last entry */
1507 for (int af = 1; af < af_max(); af++) {
1508
1509 if (set_contains(address_families, INT_TO_PTR(af)))
1510 continue;
1511
1512 r = seccomp_rule_add_exact(
1513 seccomp,
1514 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1515 SCMP_SYS(socket),
1516 1,
1517 SCMP_A0(SCMP_CMP_EQ, af));
1518 if (r < 0)
1519 break;
1520 }
1521 if (r < 0) {
1522 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1523 continue;
1524 }
1525 }
1526
1527 } else {
1528 void *af;
1529
1530 /* If this is a deny list, then generate one rule for each address family that are
1531 * then combined in OR checks. */
1532
1533 SET_FOREACH(af, address_families) {
1534 r = seccomp_rule_add_exact(
1535 seccomp,
1536 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1537 SCMP_SYS(socket),
1538 1,
1539 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1540 if (r < 0)
1541 break;
1542 }
1543 if (r < 0) {
1544 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1545 continue;
1546 }
1547 }
1548
1549 r = seccomp_load(seccomp);
1550 if (ERRNO_IS_SECCOMP_FATAL(r))
1551 return r;
1552 if (r < 0)
1553 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1554 }
1555
1556 return 0;
1557 }
1558
1559 int seccomp_restrict_realtime(void) {
1560 static const int permitted_policies[] = {
1561 SCHED_OTHER,
1562 SCHED_BATCH,
1563 SCHED_IDLE,
1564 };
1565
1566 int r, max_policy = 0;
1567 uint32_t arch;
1568 unsigned i;
1569
1570 /* Determine the highest policy constant we want to allow */
1571 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1572 if (permitted_policies[i] > max_policy)
1573 max_policy = permitted_policies[i];
1574
1575 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1576 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1577 int p;
1578
1579 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1580
1581 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1582 if (r < 0)
1583 return r;
1584
1585 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1586 * allow list. */
1587 for (p = 0; p < max_policy; p++) {
1588 bool good = false;
1589
1590 /* Check if this is in the allow list. */
1591 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1592 if (permitted_policies[i] == p) {
1593 good = true;
1594 break;
1595 }
1596
1597 if (good)
1598 continue;
1599
1600 /* Deny this policy */
1601 r = seccomp_rule_add_exact(
1602 seccomp,
1603 SCMP_ACT_ERRNO(EPERM),
1604 SCMP_SYS(sched_setscheduler),
1605 1,
1606 SCMP_A1(SCMP_CMP_EQ, p));
1607 if (r < 0) {
1608 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 continue;
1610 }
1611 }
1612
1613 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1614 * are unsigned here, hence no need no check for < 0 values. */
1615 r = seccomp_rule_add_exact(
1616 seccomp,
1617 SCMP_ACT_ERRNO(EPERM),
1618 SCMP_SYS(sched_setscheduler),
1619 1,
1620 SCMP_A1(SCMP_CMP_GT, max_policy));
1621 if (r < 0) {
1622 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1623 continue;
1624 }
1625
1626 r = seccomp_load(seccomp);
1627 if (ERRNO_IS_SECCOMP_FATAL(r))
1628 return r;
1629 if (r < 0)
1630 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1631 }
1632
1633 return 0;
1634 }
1635
1636 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1637 uint32_t arch,
1638 int nr,
1639 unsigned arg_cnt,
1640 const struct scmp_arg_cmp arg) {
1641 int r;
1642
1643 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1644 if (r < 0) {
1645 _cleanup_free_ char *n = NULL;
1646
1647 n = seccomp_syscall_resolve_num_arch(arch, nr);
1648 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1649 strna(n),
1650 seccomp_arch_to_string(arch));
1651 }
1652
1653 return r;
1654 }
1655
1656 /* For known architectures, check that syscalls are indeed defined or not. */
1657 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1658 assert_cc(SCMP_SYS(shmget) > 0);
1659 assert_cc(SCMP_SYS(shmat) > 0);
1660 assert_cc(SCMP_SYS(shmdt) > 0);
1661 #endif
1662
1663 int seccomp_memory_deny_write_execute(void) {
1664 uint32_t arch;
1665 unsigned loaded = 0;
1666
1667 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1668 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1669 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1670
1671 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1672
1673 switch (arch) {
1674
1675 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1676 * We ignore that here, which means there's still a way to get writable/executable
1677 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1678
1679 case SCMP_ARCH_X86:
1680 case SCMP_ARCH_S390:
1681 filter_syscall = SCMP_SYS(mmap2);
1682 block_syscall = SCMP_SYS(mmap);
1683 /* shmat multiplexed, see above */
1684 break;
1685
1686 case SCMP_ARCH_PPC:
1687 case SCMP_ARCH_PPC64:
1688 case SCMP_ARCH_PPC64LE:
1689 case SCMP_ARCH_S390X:
1690 filter_syscall = SCMP_SYS(mmap);
1691 /* shmat multiplexed, see above */
1692 break;
1693
1694 case SCMP_ARCH_ARM:
1695 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1696 shmat_syscall = SCMP_SYS(shmat);
1697 break;
1698
1699 case SCMP_ARCH_X86_64:
1700 case SCMP_ARCH_X32:
1701 case SCMP_ARCH_AARCH64:
1702 #ifdef SCMP_ARCH_RISCV64
1703 case SCMP_ARCH_RISCV64:
1704 #endif
1705 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1706 shmat_syscall = SCMP_SYS(shmat);
1707 break;
1708
1709 /* Please add more definitions here, if you port systemd to other architectures! */
1710
1711 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1712 #warning "Consider adding the right mmap() syscall definitions here!"
1713 #endif
1714 }
1715
1716 /* Can't filter mmap() on this arch, then skip it */
1717 if (filter_syscall == 0)
1718 continue;
1719
1720 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1721 if (r < 0)
1722 return r;
1723
1724 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1725 1,
1726 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1727 if (r < 0)
1728 continue;
1729
1730 if (block_syscall != 0) {
1731 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1732 if (r < 0)
1733 continue;
1734 }
1735
1736 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1737 1,
1738 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1739 if (r < 0)
1740 continue;
1741
1742 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1743 1,
1744 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1745 if (r < 0)
1746 continue;
1747
1748 if (shmat_syscall > 0) {
1749 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1750 1,
1751 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1752 if (r < 0)
1753 continue;
1754 }
1755
1756 r = seccomp_load(seccomp);
1757 if (ERRNO_IS_SECCOMP_FATAL(r))
1758 return r;
1759 if (r < 0)
1760 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1761 seccomp_arch_to_string(arch));
1762 loaded++;
1763 }
1764
1765 if (loaded == 0)
1766 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1767
1768 return loaded;
1769 }
1770
1771 int seccomp_restrict_archs(Set *archs) {
1772 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1773 int r;
1774 bool blocked_new = false;
1775
1776 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1777 * list.
1778 *
1779 * There are some qualifications. However the most important use is to stop processes from bypassing
1780 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1781 * in a non-native architecture. There are no holes in this use case, at least so far. */
1782
1783 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1784 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1785 * to run a program with the restrictions applied. */
1786 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1787 if (!seccomp)
1788 return -ENOMEM;
1789
1790 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1791 uint32_t arch = seccomp_local_archs[i];
1792
1793 /* See above comment, our "native" architecture is never blocked. */
1794 if (arch == seccomp_arch_native())
1795 continue;
1796
1797 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1798 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1799 continue;
1800
1801 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1802
1803 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1804 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1805 * The important thing is that you can block the old 32-bit x86 syscalls.
1806 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1807 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1808 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1809
1810 if (block) {
1811 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1812 blocked_new = true;
1813 } else {
1814 r = seccomp_arch_add(seccomp, arch);
1815 if (r < 0 && r != -EEXIST)
1816 return r;
1817 }
1818 }
1819
1820 /* All architectures that will be blocked by the seccomp program were
1821 * already blocked. */
1822 if (!blocked_new)
1823 return 0;
1824
1825 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1826 if (r < 0)
1827 return r;
1828
1829 r = seccomp_load(seccomp);
1830 if (ERRNO_IS_SECCOMP_FATAL(r))
1831 return r;
1832 if (r < 0)
1833 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1834
1835 return 0;
1836 }
1837
1838 int parse_syscall_archs(char **l, Set **ret_archs) {
1839 _cleanup_set_free_ Set *archs = NULL;
1840 char **s;
1841 int r;
1842
1843 assert(l);
1844 assert(ret_archs);
1845
1846 STRV_FOREACH(s, l) {
1847 uint32_t a;
1848
1849 r = seccomp_arch_from_string(*s, &a);
1850 if (r < 0)
1851 return -EINVAL;
1852
1853 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1854 if (r < 0)
1855 return -ENOMEM;
1856 }
1857
1858 *ret_archs = TAKE_PTR(archs);
1859 return 0;
1860 }
1861
1862 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1863 const char *i;
1864 int r;
1865
1866 assert(set);
1867
1868 NULSTR_FOREACH(i, set->value) {
1869
1870 if (i[0] == '@') {
1871 const SyscallFilterSet *more;
1872
1873 more = syscall_filter_set_find(i);
1874 if (!more)
1875 return -ENXIO;
1876
1877 r = seccomp_filter_set_add(filter, add, more);
1878 if (r < 0)
1879 return r;
1880 } else {
1881 int id;
1882
1883 id = seccomp_syscall_resolve_name(i);
1884 if (id == __NR_SCMP_ERROR) {
1885 log_debug("Couldn't resolve system call, ignoring: %s", i);
1886 continue;
1887 }
1888
1889 if (add) {
1890 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1891 if (r < 0)
1892 return r;
1893 } else
1894 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1895 }
1896 }
1897
1898 return 0;
1899 }
1900
1901 int seccomp_lock_personality(unsigned long personality) {
1902 uint32_t arch;
1903 int r;
1904
1905 if (personality >= PERSONALITY_INVALID)
1906 return -EINVAL;
1907
1908 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1909 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1910
1911 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1912 if (r < 0)
1913 return r;
1914
1915 r = seccomp_rule_add_exact(
1916 seccomp,
1917 SCMP_ACT_ERRNO(EPERM),
1918 SCMP_SYS(personality),
1919 1,
1920 SCMP_A0(SCMP_CMP_NE, personality));
1921 if (r < 0) {
1922 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1923 continue;
1924 }
1925
1926 r = seccomp_load(seccomp);
1927 if (ERRNO_IS_SECCOMP_FATAL(r))
1928 return r;
1929 if (r < 0)
1930 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1931 }
1932
1933 return 0;
1934 }
1935
1936 int seccomp_protect_hostname(void) {
1937 uint32_t arch;
1938 int r;
1939
1940 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1941 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1942
1943 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1944 if (r < 0)
1945 return r;
1946
1947 r = seccomp_rule_add_exact(
1948 seccomp,
1949 SCMP_ACT_ERRNO(EPERM),
1950 SCMP_SYS(sethostname),
1951 0);
1952 if (r < 0) {
1953 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1954 continue;
1955 }
1956
1957 r = seccomp_rule_add_exact(
1958 seccomp,
1959 SCMP_ACT_ERRNO(EPERM),
1960 SCMP_SYS(setdomainname),
1961 0);
1962 if (r < 0) {
1963 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1964 continue;
1965 }
1966
1967 r = seccomp_load(seccomp);
1968 if (ERRNO_IS_SECCOMP_FATAL(r))
1969 return r;
1970 if (r < 0)
1971 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1972 }
1973
1974 return 0;
1975 }
1976
1977 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
1978 /* Checks the mode_t parameter of the following system calls:
1979 *
1980 * → chmod() + fchmod() + fchmodat()
1981 * → open() + creat() + openat()
1982 * → mkdir() + mkdirat()
1983 * → mknod() + mknodat()
1984 *
1985 * Returns error if *everything* failed, and 0 otherwise.
1986 */
1987 int r;
1988 bool any = false;
1989
1990 r = seccomp_rule_add_exact(
1991 seccomp,
1992 SCMP_ACT_ERRNO(EPERM),
1993 SCMP_SYS(chmod),
1994 1,
1995 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
1996 if (r < 0)
1997 log_debug_errno(r, "Failed to add filter for chmod: %m");
1998 else
1999 any = true;
2000
2001 r = seccomp_rule_add_exact(
2002 seccomp,
2003 SCMP_ACT_ERRNO(EPERM),
2004 SCMP_SYS(fchmod),
2005 1,
2006 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2007 if (r < 0)
2008 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2009 else
2010 any = true;
2011
2012 r = seccomp_rule_add_exact(
2013 seccomp,
2014 SCMP_ACT_ERRNO(EPERM),
2015 SCMP_SYS(fchmodat),
2016 1,
2017 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2018 if (r < 0)
2019 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2020 else
2021 any = true;
2022
2023 r = seccomp_rule_add_exact(
2024 seccomp,
2025 SCMP_ACT_ERRNO(EPERM),
2026 SCMP_SYS(mkdir),
2027 1,
2028 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2029 if (r < 0)
2030 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2031 else
2032 any = true;
2033
2034 r = seccomp_rule_add_exact(
2035 seccomp,
2036 SCMP_ACT_ERRNO(EPERM),
2037 SCMP_SYS(mkdirat),
2038 1,
2039 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2040 if (r < 0)
2041 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2042 else
2043 any = true;
2044
2045 r = seccomp_rule_add_exact(
2046 seccomp,
2047 SCMP_ACT_ERRNO(EPERM),
2048 SCMP_SYS(mknod),
2049 1,
2050 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2051 if (r < 0)
2052 log_debug_errno(r, "Failed to add filter for mknod: %m");
2053 else
2054 any = true;
2055
2056 r = seccomp_rule_add_exact(
2057 seccomp,
2058 SCMP_ACT_ERRNO(EPERM),
2059 SCMP_SYS(mknodat),
2060 1,
2061 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2062 if (r < 0)
2063 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2064 else
2065 any = true;
2066
2067 r = seccomp_rule_add_exact(
2068 seccomp,
2069 SCMP_ACT_ERRNO(EPERM),
2070 SCMP_SYS(open),
2071 2,
2072 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2073 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2074 if (r < 0)
2075 log_debug_errno(r, "Failed to add filter for open: %m");
2076 else
2077 any = true;
2078
2079 r = seccomp_rule_add_exact(
2080 seccomp,
2081 SCMP_ACT_ERRNO(EPERM),
2082 SCMP_SYS(openat),
2083 2,
2084 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2085 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2086 if (r < 0)
2087 log_debug_errno(r, "Failed to add filter for openat: %m");
2088 else
2089 any = true;
2090
2091 #if defined(__SNR_openat2)
2092 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2093 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2094 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2095 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2096 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2097 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2098 r = seccomp_rule_add_exact(
2099 seccomp,
2100 SCMP_ACT_ERRNO(ENOSYS),
2101 SCMP_SYS(openat2),
2102 0);
2103 if (r < 0)
2104 log_debug_errno(r, "Failed to add filter for openat2: %m");
2105 else
2106 any = true;
2107 #endif
2108
2109 r = seccomp_rule_add_exact(
2110 seccomp,
2111 SCMP_ACT_ERRNO(EPERM),
2112 SCMP_SYS(creat),
2113 1,
2114 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2115 if (r < 0)
2116 log_debug_errno(r, "Failed to add filter for creat: %m");
2117 else
2118 any = true;
2119
2120 return any ? 0 : r;
2121 }
2122
2123 int seccomp_restrict_suid_sgid(void) {
2124 uint32_t arch;
2125 int r, k;
2126
2127 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2128 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2129
2130 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2131 if (r < 0)
2132 return r;
2133
2134 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2135 if (r < 0)
2136 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2137
2138 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2139 if (k < 0)
2140 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2141
2142 if (r < 0 && k < 0)
2143 continue;
2144
2145 r = seccomp_load(seccomp);
2146 if (ERRNO_IS_SECCOMP_FATAL(r))
2147 return r;
2148 if (r < 0)
2149 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2150 }
2151
2152 return 0;
2153 }
2154
2155 uint32_t scmp_act_kill_process(void) {
2156
2157 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2158 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2159 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2160 * for single-threaded apps does the right thing. */
2161
2162 #ifdef SCMP_ACT_KILL_PROCESS
2163 if (seccomp_api_get() >= 3)
2164 return SCMP_ACT_KILL_PROCESS;
2165 #endif
2166
2167 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2168 }
2169
2170 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2171 _cleanup_free_ char *n = NULL;
2172 char *p;
2173 int e = -1;
2174
2175 assert(in);
2176 assert(name);
2177 assert(error);
2178
2179 /*
2180 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2181 * If errno is omitted, then error is set to -1.
2182 * Empty syscall name is not allowed.
2183 * Here, we do not check that the syscall name is valid or not.
2184 */
2185
2186 p = strchr(in, ':');
2187 if (p) {
2188 e = seccomp_parse_errno_or_action(p + 1);
2189 if (e < 0)
2190 return e;
2191
2192 n = strndup(in, p - in);
2193 } else
2194 n = strdup(in);
2195
2196 if (!n)
2197 return -ENOMEM;
2198
2199 if (isempty(n))
2200 return -EINVAL;
2201
2202 *error = e;
2203 *name = TAKE_PTR(n);
2204
2205 return 0;
2206 }
2207
2208 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2209 bool any = false;
2210 int r;
2211
2212 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2213 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2214
2215 r = seccomp_rule_add_exact(
2216 seccomp,
2217 SCMP_ACT_ERRNO(EINVAL),
2218 SCMP_SYS(open),
2219 1,
2220 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2221 if (r < 0)
2222 log_debug_errno(r, "Failed to add filter for open: %m");
2223 else
2224 any = true;
2225
2226 r = seccomp_rule_add_exact(
2227 seccomp,
2228 SCMP_ACT_ERRNO(EINVAL),
2229 SCMP_SYS(openat),
2230 1,
2231 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2232 if (r < 0)
2233 log_debug_errno(r, "Failed to add filter for openat: %m");
2234 else
2235 any = true;
2236
2237 #if defined(__SNR_openat2)
2238 /* The new openat2() system call can't be filtered sensibly, see above. */
2239 r = seccomp_rule_add_exact(
2240 seccomp,
2241 SCMP_ACT_ERRNO(ENOSYS),
2242 SCMP_SYS(openat2),
2243 0);
2244 if (r < 0)
2245 log_debug_errno(r, "Failed to add filter for openat2: %m");
2246 else
2247 any = true;
2248 #endif
2249
2250 return any ? 0 : r;
2251 }
2252
2253 int seccomp_suppress_sync(void) {
2254 uint32_t arch;
2255 int r;
2256
2257 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2258 * manageable, and also masks O_SYNC/O_DSYNC */
2259
2260 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2261 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2262 const char *c;
2263
2264 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2265 if (r < 0)
2266 return r;
2267
2268 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2269 int id;
2270
2271 id = seccomp_syscall_resolve_name(c);
2272 if (id == __NR_SCMP_ERROR) {
2273 log_debug("System call %s is not known, ignoring.", c);
2274 continue;
2275 }
2276
2277 r = seccomp_rule_add_exact(
2278 seccomp,
2279 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2280 id,
2281 0);
2282 if (r < 0)
2283 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2284 }
2285
2286 (void) block_open_flag(seccomp, O_SYNC);
2287 #if O_DSYNC != O_SYNC
2288 (void) block_open_flag(seccomp, O_DSYNC);
2289 #endif
2290
2291 r = seccomp_load(seccomp);
2292 if (ERRNO_IS_SECCOMP_FATAL(r))
2293 return r;
2294 if (r < 0)
2295 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2296 }
2297
2298 return 0;
2299 }