]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #28269 from yuwata/udev-builtin-net_id-cleanups-part1
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11
12 /* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "namespace-util.h"
22 #include "nsflags.h"
23 #include "nulstr-util.h"
24 #include "process-util.h"
25 #include "seccomp-util.h"
26 #include "set.h"
27 #include "string-util.h"
28 #include "strv.h"
29
30 /* This array will be modified at runtime as seccomp_restrict_archs is called.
 *
 * It holds the seccomp architecture identifiers relevant for the ABI this file is compiled for; which
 * entries are present is decided by the preprocessor conditions below. Regardless of the branch taken,
 * the final element is always SECCOMP_LOCAL_ARCH_END (presumably the terminator that
 * SECCOMP_FOREACH_LOCAL_ARCH stops at -- confirm in seccomp-util.h). */
31 uint32_t seccomp_local_archs[] = {
32
33 /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */
34
35 #if defined(__x86_64__) && defined(__ILP32__)
36 SCMP_ARCH_X86,
37 SCMP_ARCH_X86_64,
38 SCMP_ARCH_X32, /* native */
39 #elif defined(__x86_64__) && !defined(__ILP32__)
40 SCMP_ARCH_X86,
41 SCMP_ARCH_X32,
42 SCMP_ARCH_X86_64, /* native */
43 #elif defined(__i386__)
44 SCMP_ARCH_X86,
45 #elif defined(__aarch64__)
46 SCMP_ARCH_ARM,
47 SCMP_ARCH_AARCH64, /* native */
48 #elif defined(__arm__)
49 SCMP_ARCH_ARM,
50 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
51 SCMP_ARCH_MIPSEL,
52 SCMP_ARCH_MIPS, /* native */
53 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
54 SCMP_ARCH_MIPS,
55 SCMP_ARCH_MIPSEL, /* native */
56 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64N32,
61 SCMP_ARCH_MIPSEL64,
62 SCMP_ARCH_MIPS64, /* native */
63 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL,
66 SCMP_ARCH_MIPS64N32,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64,
69 SCMP_ARCH_MIPSEL64, /* native */
70 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64,
75 SCMP_ARCH_MIPSEL64N32,
76 SCMP_ARCH_MIPS64N32, /* native */
77 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
78 SCMP_ARCH_MIPS,
79 SCMP_ARCH_MIPSEL,
80 SCMP_ARCH_MIPS64,
81 SCMP_ARCH_MIPSEL64,
82 SCMP_ARCH_MIPS64N32,
83 SCMP_ARCH_MIPSEL64N32, /* native */
84 #elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
85 SCMP_ARCH_PARISC,
86 SCMP_ARCH_PARISC64, /* native */
87 #elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
88 SCMP_ARCH_PARISC,
89 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
90 SCMP_ARCH_PPC,
91 SCMP_ARCH_PPC64LE,
92 SCMP_ARCH_PPC64, /* native */
93 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
94 SCMP_ARCH_PPC,
95 SCMP_ARCH_PPC64,
96 SCMP_ARCH_PPC64LE, /* native */
97 #elif defined(__powerpc__)
98 SCMP_ARCH_PPC,
99 #elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
100 SCMP_ARCH_RISCV64,
101 #elif defined(__s390x__)
102 SCMP_ARCH_S390,
103 SCMP_ARCH_S390X, /* native */
104 #elif defined(__s390__)
105 SCMP_ARCH_S390,
/* If none of the conditions above matched, the array consists of the end marker only. */
106 #endif
107 SECCOMP_LOCAL_ARCH_END
108 };
109
110 const char* seccomp_arch_to_string(uint32_t c) {
111 /* Maintain order used in <seccomp.h>.
112 *
113 * Names used here should be the same as those used for ConditionArchitecture=,
114 * except for "subarchitectures" like x32. */
115
116 switch (c) {
117 case SCMP_ARCH_NATIVE:
118 return "native";
119 case SCMP_ARCH_X86:
120 return "x86";
121 case SCMP_ARCH_X86_64:
122 return "x86-64";
123 case SCMP_ARCH_X32:
124 return "x32";
125 case SCMP_ARCH_ARM:
126 return "arm";
127 case SCMP_ARCH_AARCH64:
128 return "arm64";
129 case SCMP_ARCH_MIPS:
130 return "mips";
131 case SCMP_ARCH_MIPS64:
132 return "mips64";
133 case SCMP_ARCH_MIPS64N32:
134 return "mips64-n32";
135 case SCMP_ARCH_MIPSEL:
136 return "mips-le";
137 case SCMP_ARCH_MIPSEL64:
138 return "mips64-le";
139 case SCMP_ARCH_MIPSEL64N32:
140 return "mips64-le-n32";
141 #ifdef SCMP_ARCH_PARISC
142 case SCMP_ARCH_PARISC:
143 return "parisc";
144 #endif
145 #ifdef SCMP_ARCH_PARISC64
146 case SCMP_ARCH_PARISC64:
147 return "parisc64";
148 #endif
149 case SCMP_ARCH_PPC:
150 return "ppc";
151 case SCMP_ARCH_PPC64:
152 return "ppc64";
153 case SCMP_ARCH_PPC64LE:
154 return "ppc64-le";
155 #ifdef SCMP_ARCH_RISCV64
156 case SCMP_ARCH_RISCV64:
157 return "riscv64";
158 #endif
159 case SCMP_ARCH_S390:
160 return "s390";
161 case SCMP_ARCH_S390X:
162 return "s390x";
163 default:
164 return NULL;
165 }
166 }
167
168 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
169 if (!n)
170 return -EINVAL;
171
172 assert(ret);
173
174 if (streq(n, "native"))
175 *ret = SCMP_ARCH_NATIVE;
176 else if (streq(n, "x86"))
177 *ret = SCMP_ARCH_X86;
178 else if (streq(n, "x86-64"))
179 *ret = SCMP_ARCH_X86_64;
180 else if (streq(n, "x32"))
181 *ret = SCMP_ARCH_X32;
182 else if (streq(n, "arm"))
183 *ret = SCMP_ARCH_ARM;
184 else if (streq(n, "arm64"))
185 *ret = SCMP_ARCH_AARCH64;
186 else if (streq(n, "mips"))
187 *ret = SCMP_ARCH_MIPS;
188 else if (streq(n, "mips64"))
189 *ret = SCMP_ARCH_MIPS64;
190 else if (streq(n, "mips64-n32"))
191 *ret = SCMP_ARCH_MIPS64N32;
192 else if (streq(n, "mips-le"))
193 *ret = SCMP_ARCH_MIPSEL;
194 else if (streq(n, "mips64-le"))
195 *ret = SCMP_ARCH_MIPSEL64;
196 else if (streq(n, "mips64-le-n32"))
197 *ret = SCMP_ARCH_MIPSEL64N32;
198 #ifdef SCMP_ARCH_PARISC
199 else if (streq(n, "parisc"))
200 *ret = SCMP_ARCH_PARISC;
201 #endif
202 #ifdef SCMP_ARCH_PARISC64
203 else if (streq(n, "parisc64"))
204 *ret = SCMP_ARCH_PARISC64;
205 #endif
206 else if (streq(n, "ppc"))
207 *ret = SCMP_ARCH_PPC;
208 else if (streq(n, "ppc64"))
209 *ret = SCMP_ARCH_PPC64;
210 else if (streq(n, "ppc64-le"))
211 *ret = SCMP_ARCH_PPC64LE;
212 #ifdef SCMP_ARCH_RISCV64
213 else if (streq(n, "riscv64"))
214 *ret = SCMP_ARCH_RISCV64;
215 #endif
216 else if (streq(n, "s390"))
217 *ret = SCMP_ARCH_S390;
218 else if (streq(n, "s390x"))
219 *ret = SCMP_ARCH_S390X;
220 else
221 return -EINVAL;
222
223 return 0;
224 }
225
226 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
227 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
228 int r;
229
230 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
231 * any others. Also, turns off the NNP fiddling. */
232
233 seccomp = seccomp_init(default_action);
234 if (!seccomp)
235 return -ENOMEM;
236
237 if (arch != SCMP_ARCH_NATIVE &&
238 arch != seccomp_arch_native()) {
239
240 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
241 if (r < 0)
242 return r;
243
244 r = seccomp_arch_add(seccomp, arch);
245 if (r < 0)
246 return r;
247
248 assert(seccomp_arch_exist(seccomp, arch) >= 0);
249 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
250 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
251 } else {
252 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
253 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
254 }
255
256 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
257 if (r < 0)
258 return r;
259
260 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
261 if (r < 0)
262 return r;
263
264 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
265 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
266 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
267 if (r < 0)
268 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
269 }
270 #endif
271
272 *ret = TAKE_PTR(seccomp);
273 return 0;
274 }
275
static bool is_basic_seccomp_available(void) {
        /* Probes for seccomp support by querying the current seccomp mode: a negative return value from
         * the PR_GET_SECCOMP prctl is taken to mean that seccomp is not available. */

        if (prctl(PR_GET_SECCOMP, 0, 0, 0, 0) < 0)
                return false;

        return true;
}
279
static bool is_seccomp_filter_available(void) {
        /* Probes for seccomp filter mode by asking for a filter to be installed from a NULL address. The
         * feature is considered present only if that request fails and the failure is EFAULT; success or
         * any other error is reported as "not available". */

        int k;

        k = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0);
        if (k >= 0)
                return false;

        return errno == EFAULT;
}
284
bool is_seccomp_available(void) {
        /* Reports whether seccomp filtering can be used. The answer is computed on the first call and
         * served from a static cache afterwards. Setting $SYSTEMD_SECCOMP to a false value forces the
         * answer to "no"; in every other case (unset, true, or unparsable) the kernel is probed. */

        static int cached_enabled = -1;
        int b;

        if (cached_enabled >= 0)
                return cached_enabled;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly turned off via the environment. */
                cached_enabled = false;
                return cached_enabled;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cached_enabled =
                is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cached_enabled;
}
305
/* Table of the predefined system call filter sets, indexed by the SYSCALL_FILTER_SET_* constants.
 *
 * Every entry carries the set's name (always beginning with "@"), a one-line help text, and a value made
 * up of NUL-terminated strings concatenated into a single literal. Each string in a value is either a
 * system call name or, when it begins with "@", the name of another set in this table (for example
 * "@privileged" pulls in "@chown", "@clock", ...). "@known" consists of "@obsolete" followed by whatever
 * syscall-list.h expands to. */
306 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
307 [SYSCALL_FILTER_SET_DEFAULT] = {
308 .name = "@default",
309 .help = "System calls that are always permitted",
310 .value =
311 "arch_prctl\0" /* Used during platform-specific initialization by ld-linux.so. */
312 "brk\0"
313 "cacheflush\0"
314 "clock_getres\0"
315 "clock_getres_time64\0"
316 "clock_gettime\0"
317 "clock_gettime64\0"
318 "clock_nanosleep\0"
319 "clock_nanosleep_time64\0"
320 "execve\0"
321 "exit\0"
322 "exit_group\0"
323 "futex\0"
324 "futex_time64\0"
325 "futex_waitv\0"
326 "get_robust_list\0"
327 "get_thread_area\0"
328 "getegid\0"
329 "getegid32\0"
330 "geteuid\0"
331 "geteuid32\0"
332 "getgid\0"
333 "getgid32\0"
334 "getgroups\0"
335 "getgroups32\0"
336 "getpgid\0"
337 "getpgrp\0"
338 "getpid\0"
339 "getppid\0"
340 "getrandom\0"
341 "getresgid\0"
342 "getresgid32\0"
343 "getresuid\0"
344 "getresuid32\0"
345 "getrlimit\0" /* make sure processes can query stack size and such */
346 "getsid\0"
347 "gettid\0"
348 "gettimeofday\0"
349 "getuid\0"
350 "getuid32\0"
351 "membarrier\0"
352 "mmap\0"
353 "mmap2\0"
354 "mprotect\0"
355 "munmap\0"
356 "nanosleep\0"
357 "pause\0"
358 "prlimit64\0"
359 "restart_syscall\0"
360 "riscv_flush_icache\0"
361 "riscv_hwprobe\0"
362 "rseq\0"
363 "rt_sigreturn\0"
364 "sched_getaffinity\0"
365 "sched_yield\0"
366 "set_robust_list\0"
367 "set_thread_area\0"
368 "set_tid_address\0"
369 "set_tls\0"
370 "sigreturn\0"
371 "time\0"
372 "ugetrlimit\0"
373 },
374 [SYSCALL_FILTER_SET_AIO] = {
375 .name = "@aio",
376 .help = "Asynchronous IO",
377 .value =
378 "io_cancel\0"
379 "io_destroy\0"
380 "io_getevents\0"
381 "io_pgetevents\0"
382 "io_pgetevents_time64\0"
383 "io_setup\0"
384 "io_submit\0"
385 "io_uring_enter\0"
386 "io_uring_register\0"
387 "io_uring_setup\0"
388 },
389 [SYSCALL_FILTER_SET_BASIC_IO] = {
390 .name = "@basic-io",
391 .help = "Basic IO",
392 .value =
393 "_llseek\0"
394 "close\0"
395 "close_range\0"
396 "dup\0"
397 "dup2\0"
398 "dup3\0"
399 "lseek\0"
400 "pread64\0"
401 "preadv\0"
402 "preadv2\0"
403 "pwrite64\0"
404 "pwritev\0"
405 "pwritev2\0"
406 "read\0"
407 "readv\0"
408 "write\0"
409 "writev\0"
410 },
411 [SYSCALL_FILTER_SET_CHOWN] = {
412 .name = "@chown",
413 .help = "Change ownership of files and directories",
414 .value =
415 "chown\0"
416 "chown32\0"
417 "fchown\0"
418 "fchown32\0"
419 "fchownat\0"
420 "lchown\0"
421 "lchown32\0"
422 },
423 [SYSCALL_FILTER_SET_CLOCK] = {
424 .name = "@clock",
425 .help = "Change the system time",
426 .value =
427 "adjtimex\0"
428 "clock_adjtime\0"
429 "clock_adjtime64\0"
430 "clock_settime\0"
431 "clock_settime64\0"
432 "settimeofday\0"
433 },
434 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
435 .name = "@cpu-emulation",
436 .help = "System calls for CPU emulation functionality",
437 .value =
438 "modify_ldt\0"
439 "subpage_prot\0"
440 "switch_endian\0"
441 "vm86\0"
442 "vm86old\0"
443 },
444 [SYSCALL_FILTER_SET_DEBUG] = {
445 .name = "@debug",
446 .help = "Debugging, performance monitoring and tracing functionality",
447 .value =
448 "lookup_dcookie\0"
449 "perf_event_open\0"
450 "pidfd_getfd\0"
451 "ptrace\0"
452 "rtas\0"
453 "s390_runtime_instr\0"
454 "sys_debug_setcontext\0"
455 },
456 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
457 .name = "@file-system",
458 .help = "File system operations",
459 .value =
460 "access\0"
461 "chdir\0"
462 "chmod\0"
463 "close\0"
464 "creat\0"
465 "faccessat\0"
466 "faccessat2\0"
467 "fallocate\0"
468 "fchdir\0"
469 "fchmod\0"
470 "fchmodat\0"
471 "fcntl\0"
472 "fcntl64\0"
473 "fgetxattr\0"
474 "flistxattr\0"
475 "fremovexattr\0"
476 "fsetxattr\0"
477 "fstat\0"
478 "fstat64\0"
479 "fstatat64\0"
480 "fstatfs\0"
481 "fstatfs64\0"
482 "ftruncate\0"
483 "ftruncate64\0"
484 "futimesat\0"
485 "getcwd\0"
486 "getdents\0"
487 "getdents64\0"
488 "getxattr\0"
489 "inotify_add_watch\0"
490 "inotify_init\0"
491 "inotify_init1\0"
492 "inotify_rm_watch\0"
493 "lgetxattr\0"
494 "link\0"
495 "linkat\0"
496 "listxattr\0"
497 "llistxattr\0"
498 "lremovexattr\0"
499 "lsetxattr\0"
500 "lstat\0"
501 "lstat64\0"
502 "mkdir\0"
503 "mkdirat\0"
504 "mknod\0"
505 "mknodat\0"
506 "newfstatat\0"
507 "oldfstat\0"
508 "oldlstat\0"
509 "oldstat\0"
510 "open\0"
511 "openat\0"
512 "openat2\0"
513 "readlink\0"
514 "readlinkat\0"
515 "removexattr\0"
516 "rename\0"
517 "renameat\0"
518 "renameat2\0"
519 "rmdir\0"
520 "setxattr\0"
521 "stat\0"
522 "stat64\0"
523 "statfs\0"
524 "statfs64\0"
525 "statx\0"
526 "symlink\0"
527 "symlinkat\0"
528 "truncate\0"
529 "truncate64\0"
530 "unlink\0"
531 "unlinkat\0"
532 "utime\0"
533 "utimensat\0"
534 "utimensat_time64\0"
535 "utimes\0"
536 },
537 [SYSCALL_FILTER_SET_IO_EVENT] = {
538 .name = "@io-event",
539 .help = "Event loop system calls",
540 .value =
541 "_newselect\0"
542 "epoll_create\0"
543 "epoll_create1\0"
544 "epoll_ctl\0"
545 "epoll_ctl_old\0"
546 "epoll_pwait\0"
547 "epoll_pwait2\0"
548 "epoll_wait\0"
549 "epoll_wait_old\0"
550 "eventfd\0"
551 "eventfd2\0"
552 "poll\0"
553 "ppoll\0"
554 "ppoll_time64\0"
555 "pselect6\0"
556 "pselect6_time64\0"
557 "select\0"
558 },
559 [SYSCALL_FILTER_SET_IPC] = {
560 .name = "@ipc",
561 .help = "SysV IPC, POSIX Message Queues or other IPC",
562 .value =
563 "ipc\0"
564 "memfd_create\0"
565 "mq_getsetattr\0"
566 "mq_notify\0"
567 "mq_open\0"
568 "mq_timedreceive\0"
569 "mq_timedreceive_time64\0"
570 "mq_timedsend\0"
571 "mq_timedsend_time64\0"
572 "mq_unlink\0"
573 "msgctl\0"
574 "msgget\0"
575 "msgrcv\0"
576 "msgsnd\0"
577 "pipe\0"
578 "pipe2\0"
579 "process_madvise\0"
580 "process_vm_readv\0"
581 "process_vm_writev\0"
582 "semctl\0"
583 "semget\0"
584 "semop\0"
585 "semtimedop\0"
586 "semtimedop_time64\0"
587 "shmat\0"
588 "shmctl\0"
589 "shmdt\0"
590 "shmget\0"
591 },
592 [SYSCALL_FILTER_SET_KEYRING] = {
593 .name = "@keyring",
594 .help = "Kernel keyring access",
595 .value =
596 "add_key\0"
597 "keyctl\0"
598 "request_key\0"
599 },
600 [SYSCALL_FILTER_SET_MEMLOCK] = {
601 .name = "@memlock",
602 .help = "Memory locking control",
603 .value =
604 "mlock\0"
605 "mlock2\0"
606 "mlockall\0"
607 "munlock\0"
608 "munlockall\0"
609 },
610 [SYSCALL_FILTER_SET_MODULE] = {
611 .name = "@module",
612 .help = "Loading and unloading of kernel modules",
613 .value =
614 "delete_module\0"
615 "finit_module\0"
616 "init_module\0"
617 },
618 [SYSCALL_FILTER_SET_MOUNT] = {
619 .name = "@mount",
620 .help = "Mounting and unmounting of file systems",
621 .value =
622 "chroot\0"
623 "fsconfig\0"
624 "fsmount\0"
625 "fsopen\0"
626 "fspick\0"
627 "mount\0"
628 "mount_setattr\0"
629 "move_mount\0"
630 "open_tree\0"
631 "pivot_root\0"
632 "umount\0"
633 "umount2\0"
634 },
635 [SYSCALL_FILTER_SET_NETWORK_IO] = {
636 .name = "@network-io",
637 .help = "Network or Unix socket IO, should not be needed if not network facing",
638 .value =
639 "accept\0"
640 "accept4\0"
641 "bind\0"
642 "connect\0"
643 "getpeername\0"
644 "getsockname\0"
645 "getsockopt\0"
646 "listen\0"
647 "recv\0"
648 "recvfrom\0"
649 "recvmmsg\0"
650 "recvmmsg_time64\0"
651 "recvmsg\0"
652 "send\0"
653 "sendmmsg\0"
654 "sendmsg\0"
655 "sendto\0"
656 "setsockopt\0"
657 "shutdown\0"
658 "socket\0"
659 "socketcall\0"
660 "socketpair\0"
661 },
662 [SYSCALL_FILTER_SET_OBSOLETE] = {
663 /* some unknown even to libseccomp */
664 .name = "@obsolete",
665 .help = "Unusual, obsolete or unimplemented system calls",
666 .value =
667 "_sysctl\0"
668 "afs_syscall\0"
669 "bdflush\0"
670 "break\0"
671 "create_module\0"
672 "ftime\0"
673 "get_kernel_syms\0"
674 "getpmsg\0"
675 "gtty\0"
676 "idle\0"
677 "lock\0"
678 "mpx\0"
679 "prof\0"
680 "profil\0"
681 "putpmsg\0"
682 "query_module\0"
683 "security\0"
684 "sgetmask\0"
685 "ssetmask\0"
686 "stime\0"
687 "stty\0"
688 "sysfs\0"
689 "tuxcall\0"
690 "ulimit\0"
691 "uselib\0"
692 "ustat\0"
693 "vserver\0"
694 },
695 [SYSCALL_FILTER_SET_PKEY] = {
696 .name = "@pkey",
697 .help = "System calls used for memory protection keys",
698 .value =
699 "pkey_alloc\0"
700 "pkey_free\0"
701 "pkey_mprotect\0"
702 },
703 [SYSCALL_FILTER_SET_PRIVILEGED] = {
704 .name = "@privileged",
705 .help = "All system calls which need super-user capabilities",
706 .value =
707 "@chown\0"
708 "@clock\0"
709 "@module\0"
710 "@raw-io\0"
711 "@reboot\0"
712 "@swap\0"
713 "_sysctl\0"
714 "acct\0"
715 "bpf\0"
716 "capset\0"
717 "chroot\0"
718 "fanotify_init\0"
719 "fanotify_mark\0"
720 "nfsservctl\0"
721 "open_by_handle_at\0"
722 "pivot_root\0"
723 "quotactl\0"
724 "quotactl_fd\0"
725 "setdomainname\0"
726 "setfsuid\0"
727 "setfsuid32\0"
728 "setgroups\0"
729 "setgroups32\0"
730 "sethostname\0"
731 "setresuid\0"
732 "setresuid32\0"
733 "setreuid\0"
734 "setreuid32\0"
735 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
736 "setuid32\0"
737 "vhangup\0"
738 },
739 [SYSCALL_FILTER_SET_PROCESS] = {
740 .name = "@process",
741 .help = "Process control, execution, namespacing operations",
742 .value =
743 "capget\0" /* Able to query arbitrary processes */
744 "clone\0"
745 /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
746 * implement seccomp, so we don't need to list it at all. C.f.
747 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
748 "clone3\0"
749 "execveat\0"
750 "fork\0"
751 "getrusage\0"
752 "kill\0"
753 "pidfd_open\0"
754 "pidfd_send_signal\0"
755 "prctl\0"
756 "rt_sigqueueinfo\0"
757 "rt_tgsigqueueinfo\0"
758 "setns\0"
759 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
760 "tgkill\0"
761 "times\0"
762 "tkill\0"
763 "unshare\0"
764 "vfork\0"
765 "wait4\0"
766 "waitid\0"
767 "waitpid\0"
768 },
769 [SYSCALL_FILTER_SET_RAW_IO] = {
770 .name = "@raw-io",
771 .help = "Raw I/O port access",
772 .value =
773 "ioperm\0"
774 "iopl\0"
775 "pciconfig_iobase\0"
776 "pciconfig_read\0"
777 "pciconfig_write\0"
778 "s390_pci_mmio_read\0"
779 "s390_pci_mmio_write\0"
780 },
781 [SYSCALL_FILTER_SET_REBOOT] = {
782 .name = "@reboot",
783 .help = "Reboot and reboot preparation/kexec",
784 .value =
785 "kexec_file_load\0"
786 "kexec_load\0"
787 "reboot\0"
788 },
789 [SYSCALL_FILTER_SET_RESOURCES] = {
790 .name = "@resources",
791 .help = "Alter resource settings",
792 .value =
793 "ioprio_set\0"
794 "mbind\0"
795 "migrate_pages\0"
796 "move_pages\0"
797 "nice\0"
798 "sched_setaffinity\0"
799 "sched_setattr\0"
800 "sched_setparam\0"
801 "sched_setscheduler\0"
802 "set_mempolicy\0"
803 "set_mempolicy_home_node\0"
804 "setpriority\0"
805 "setrlimit\0"
806 },
807 [SYSCALL_FILTER_SET_SANDBOX] = {
808 .name = "@sandbox",
809 .help = "Sandbox functionality",
810 .value =
811 "landlock_add_rule\0"
812 "landlock_create_ruleset\0"
813 "landlock_restrict_self\0"
814 "seccomp\0"
815 },
816 [SYSCALL_FILTER_SET_SETUID] = {
817 .name = "@setuid",
818 .help = "Operations for changing user/group credentials",
819 .value =
820 "setgid\0"
821 "setgid32\0"
822 "setgroups\0"
823 "setgroups32\0"
824 "setregid\0"
825 "setregid32\0"
826 "setresgid\0"
827 "setresgid32\0"
828 "setresuid\0"
829 "setresuid32\0"
830 "setreuid\0"
831 "setreuid32\0"
832 "setuid\0"
833 "setuid32\0"
834 },
835 [SYSCALL_FILTER_SET_SIGNAL] = {
836 .name = "@signal",
837 .help = "Process signal handling",
838 .value =
839 "rt_sigaction\0"
840 "rt_sigpending\0"
841 "rt_sigprocmask\0"
842 "rt_sigsuspend\0"
843 "rt_sigtimedwait\0"
844 "rt_sigtimedwait_time64\0"
845 "sigaction\0"
846 "sigaltstack\0"
847 "signal\0"
848 "signalfd\0"
849 "signalfd4\0"
850 "sigpending\0"
851 "sigprocmask\0"
852 "sigsuspend\0"
853 },
854 [SYSCALL_FILTER_SET_SWAP] = {
855 .name = "@swap",
856 .help = "Enable/disable swap devices",
857 .value =
858 "swapoff\0"
859 "swapon\0"
860 },
861 [SYSCALL_FILTER_SET_SYNC] = {
862 .name = "@sync",
863 .help = "Synchronize files and memory to storage",
864 .value =
865 "fdatasync\0"
866 "fsync\0"
867 "msync\0"
868 "sync\0"
869 "sync_file_range\0"
870 "sync_file_range2\0"
871 "syncfs\0"
872 },
873 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
874 .name = "@system-service",
875 .help = "General system service operations",
876 .value =
877 "@aio\0"
878 "@basic-io\0"
879 "@chown\0"
880 "@default\0"
881 "@file-system\0"
882 "@io-event\0"
883 "@ipc\0"
884 "@keyring\0"
885 "@memlock\0"
886 "@network-io\0"
887 "@process\0"
888 "@resources\0"
889 "@setuid\0"
890 "@signal\0"
891 "@sync\0"
892 "@timer\0"
893 "arm_fadvise64_64\0"
894 "capget\0"
895 "capset\0"
896 "copy_file_range\0"
897 "fadvise64\0"
898 "fadvise64_64\0"
899 "flock\0"
900 "get_mempolicy\0"
901 "getcpu\0"
902 "getpriority\0"
903 "ioctl\0"
904 "ioprio_get\0"
905 "kcmp\0"
906 "madvise\0"
907 "mremap\0"
908 "name_to_handle_at\0"
909 "oldolduname\0"
910 "olduname\0"
911 "personality\0"
912 "readahead\0"
913 "readdir\0"
914 "remap_file_pages\0"
915 "sched_get_priority_max\0"
916 "sched_get_priority_min\0"
917 "sched_getattr\0"
918 "sched_getparam\0"
919 "sched_getscheduler\0"
920 "sched_rr_get_interval\0"
921 "sched_rr_get_interval_time64\0"
922 "sched_yield\0"
923 "sendfile\0"
924 "sendfile64\0"
925 "setfsgid\0"
926 "setfsgid32\0"
927 "setfsuid\0"
928 "setfsuid32\0"
929 "setpgid\0"
930 "setsid\0"
931 "splice\0"
932 "sysinfo\0"
933 "tee\0"
934 "umask\0"
935 "uname\0"
936 "userfaultfd\0"
937 "vmsplice\0"
938 },
939 [SYSCALL_FILTER_SET_TIMER] = {
940 .name = "@timer",
941 .help = "Schedule operations by time",
942 .value =
943 "alarm\0"
944 "getitimer\0"
945 "setitimer\0"
946 "timer_create\0"
947 "timer_delete\0"
948 "timer_getoverrun\0"
949 "timer_gettime\0"
950 "timer_gettime64\0"
951 "timer_settime\0"
952 "timer_settime64\0"
953 "timerfd_create\0"
954 "timerfd_gettime\0"
955 "timerfd_gettime64\0"
956 "timerfd_settime\0"
957 "timerfd_settime64\0"
958 "times\0"
959 },
960 [SYSCALL_FILTER_SET_KNOWN] = {
961 .name = "@known",
962 .help = "All known syscalls declared in the kernel",
963 .value =
964 "@obsolete\0"
965 #include "syscall-list.h"
966 },
967 };
968
969 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
970 if (isempty(name) || name[0] != '@')
971 return NULL;
972
973 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
974 if (streq(syscall_filter_sets[i].name, name))
975 return syscall_filter_sets + i;
976
977 return NULL;
978 }
979
980 static int add_syscall_filter_set(
981 scmp_filter_ctx seccomp,
982 const SyscallFilterSet *set,
983 uint32_t action,
984 char **exclude,
985 bool log_missing,
986 char ***added);
987
988 int seccomp_add_syscall_filter_item(
989 scmp_filter_ctx *seccomp,
990 const char *name,
991 uint32_t action,
992 char **exclude,
993 bool log_missing,
994 char ***added) {
995
996 assert(seccomp);
997 assert(name);
998
999 if (strv_contains(exclude, name))
1000 return 0;
1001
1002 /* Any syscalls that are handled are added to the *added strv. The pointer
1003 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
1004
1005 if (name[0] == '@') {
1006 const SyscallFilterSet *other;
1007
1008 other = syscall_filter_set_find(name);
1009 if (!other)
1010 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
1011 "Filter set %s is not known!",
1012 name);
1013
1014 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
1015
1016 } else {
1017 int id, r;
1018
1019 id = seccomp_syscall_resolve_name(name);
1020 if (id == __NR_SCMP_ERROR) {
1021 if (log_missing)
1022 log_debug("System call %s is not known, ignoring.", name);
1023 return 0;
1024 }
1025
1026 r = seccomp_rule_add_exact(seccomp, action, id, 0);
1027 if (r < 0) {
1028 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1029 bool ignore = r == -EDOM;
1030
1031 if (!ignore || log_missing)
1032 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1033 name, id, ignore ? ", ignoring" : "");
1034 if (!ignore)
1035 return r;
1036 }
1037
1038 if (added) {
1039 r = strv_extend(added, name);
1040 if (r < 0)
1041 return r;
1042 }
1043
1044 return 0;
1045 }
1046 }
1047
1048 static int add_syscall_filter_set(
1049 scmp_filter_ctx seccomp,
1050 const SyscallFilterSet *set,
1051 uint32_t action,
1052 char **exclude,
1053 bool log_missing,
1054 char ***added) {
1055
1056 int r;
1057
1058 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1059
1060 assert(seccomp);
1061 assert(set);
1062
1063 NULSTR_FOREACH(sys, set->value) {
1064 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1065 if (r < 0)
1066 return r;
1067 }
1068
1069 return 0;
1070 }
1071
1072 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1073 uint32_t arch;
1074 int r;
1075
1076 assert(set);
1077
1078 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1079 * each local arch. */
1080
1081 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1082 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1083
1084 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1085
1086 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1087 if (r < 0)
1088 return r;
1089
1090 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1091 if (r < 0)
1092 return log_debug_errno(r, "Failed to add filter set: %m");
1093
1094 r = seccomp_load(seccomp);
1095 if (r < 0) {
1096 if (ERRNO_IS_SECCOMP_FATAL(r))
1097 return r;
1098 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1099 }
1100 }
1101
1102 return 0;
1103 }
1104
1105 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1106 uint32_t arch;
1107 int r;
1108
1109 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1110 * of a SyscallFilterSet* table. */
1111
1112 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1113 return 0;
1114
1115 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1116 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1117 void *syscall_id, *val;
1118
1119 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1120
1121 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1122 if (r < 0)
1123 return r;
1124
1125 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1126 uint32_t a = action;
1127 int id = PTR_TO_INT(syscall_id) - 1;
1128 int error = PTR_TO_INT(val);
1129
1130 if (error == SECCOMP_ERROR_NUMBER_KILL)
1131 a = scmp_act_kill_process();
1132 #ifdef SCMP_ACT_LOG
1133 else if (action == SCMP_ACT_LOG)
1134 a = SCMP_ACT_LOG;
1135 #endif
1136 else if (error >= 0)
1137 a = SCMP_ACT_ERRNO(error);
1138
1139 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1140 if (r < 0) {
1141 /* If the system call is not known on this architecture, then that's
1142 * fine, let's ignore it */
1143 _cleanup_free_ char *n = NULL;
1144 bool ignore;
1145
1146 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1147 ignore = r == -EDOM;
1148 if (!ignore || log_missing)
1149 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1150 strna(n), id, ignore ? ", ignoring" : "");
1151 if (!ignore)
1152 return r;
1153 }
1154 }
1155
1156 r = seccomp_load(seccomp);
1157 if (r < 0) {
1158 if (ERRNO_IS_SECCOMP_FATAL(r))
1159 return r;
1160 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1161 seccomp_arch_to_string(arch));
1162 }
1163 }
1164
1165 return 0;
1166 }
1167
1168 int seccomp_parse_syscall_filter(
1169 const char *name,
1170 int errno_num,
1171 Hashmap *filter,
1172 SeccompParseFlags flags,
1173 const char *unit,
1174 const char *filename,
1175 unsigned line) {
1176
1177 int r;
1178
1179 assert(name);
1180 assert(filter);
1181
1182 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1183 return -EINVAL;
1184
1185 if (name[0] == '@') {
1186 const SyscallFilterSet *set;
1187
1188 set = syscall_filter_set_find(name);
1189 if (!set) {
1190 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1191 return -EINVAL;
1192
1193 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1194 "Unknown system call group, ignoring: %s", name);
1195 return 0;
1196 }
1197
1198 NULSTR_FOREACH(i, set->value) {
1199 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1200 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1201 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1202 * about them. */
1203 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1204 if (r < 0)
1205 return r;
1206 }
1207 } else {
1208 int id;
1209
1210 id = seccomp_syscall_resolve_name(name);
1211 if (id == __NR_SCMP_ERROR) {
1212 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1213 return -EINVAL;
1214
1215 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1216 "Failed to parse system call, ignoring: %s", name);
1217 return 0;
1218 }
1219
1220 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1221 * it from the list. The entries in allow-list with non-negative error value will be
1222 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1223 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1224 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1225 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1226 if (r < 0)
1227 switch (r) {
1228 case -ENOMEM:
1229 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1230 case -EEXIST:
1231 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1232 break;
1233 default:
1234 return r;
1235 }
1236 } else
1237 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1238 }
1239
1240 return 0;
1241 }
1242
1243 int seccomp_restrict_namespaces(unsigned long retain) {
1244 uint32_t arch;
1245 int r;
1246
1247 if (DEBUG_LOGGING) {
1248 _cleanup_free_ char *s = NULL;
1249
1250 (void) namespace_flags_to_string(retain, &s);
1251 log_debug("Restricting namespace to: %s.", strna(s));
1252 }
1253
1254 /* NOOP? */
1255 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1256 return 0;
1257
1258 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1259 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1260
1261 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1262
1263 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1264 if (r < 0)
1265 return r;
1266
1267 /* We cannot filter on individual flags to clone3(), and we need to disable the
1268 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1269 * users shall fall back to clone(), as if on an older kernel.
1270 *
1271 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1272 * https://github.com/moby/moby/issues/42680. */
1273
1274 r = seccomp_rule_add_exact(
1275 seccomp,
1276 SCMP_ACT_ERRNO(ENOSYS),
1277 SCMP_SYS(clone3),
1278 0);
1279 if (r < 0)
1280 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1281
1282 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1283 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1284 * altogether. */
1285 r = seccomp_rule_add_exact(
1286 seccomp,
1287 SCMP_ACT_ERRNO(EPERM),
1288 SCMP_SYS(setns),
1289 0);
1290 else
1291 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1292 * special invocation with a zero flags argument, right here. */
1293 r = seccomp_rule_add_exact(
1294 seccomp,
1295 SCMP_ACT_ERRNO(EPERM),
1296 SCMP_SYS(setns),
1297 1,
1298 SCMP_A1(SCMP_CMP_EQ, 0));
1299 if (r < 0) {
1300 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1301 continue;
1302 }
1303
1304 for (unsigned i = 0; namespace_info[i].proc_name; i++) {
1305 unsigned long f;
1306
1307 f = namespace_info[i].clone_flag;
1308 if (FLAGS_SET(retain, f)) {
1309 log_debug("Permitting %s.", namespace_info[i].proc_name);
1310 continue;
1311 }
1312
1313 log_trace("Blocking %s.", namespace_info[i].proc_name);
1314
1315 r = seccomp_rule_add_exact(
1316 seccomp,
1317 SCMP_ACT_ERRNO(EPERM),
1318 SCMP_SYS(unshare),
1319 1,
1320 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1321 if (r < 0) {
1322 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1323 break;
1324 }
1325
1326 /* On s390/s390x the first two parameters to clone are switched */
1327 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1328 r = seccomp_rule_add_exact(
1329 seccomp,
1330 SCMP_ACT_ERRNO(EPERM),
1331 SCMP_SYS(clone),
1332 1,
1333 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1334 else
1335 r = seccomp_rule_add_exact(
1336 seccomp,
1337 SCMP_ACT_ERRNO(EPERM),
1338 SCMP_SYS(clone),
1339 1,
1340 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1341 if (r < 0) {
1342 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1343 break;
1344 }
1345
1346 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1347 r = seccomp_rule_add_exact(
1348 seccomp,
1349 SCMP_ACT_ERRNO(EPERM),
1350 SCMP_SYS(setns),
1351 1,
1352 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1353 if (r < 0) {
1354 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1355 break;
1356 }
1357 }
1358 }
1359 if (r < 0)
1360 continue;
1361
1362 r = seccomp_load(seccomp);
1363 if (r < 0) {
1364 if (ERRNO_IS_SECCOMP_FATAL(r))
1365 return r;
1366 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1367 }
1368 }
1369
1370 return 0;
1371 }
1372
1373 int seccomp_protect_sysctl(void) {
1374 uint32_t arch;
1375 int r;
1376
1377 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1378 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1379
1380 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1381
1382 if (IN_SET(arch,
1383 SCMP_ARCH_AARCH64,
1384 #ifdef SCMP_ARCH_RISCV64
1385 SCMP_ARCH_RISCV64,
1386 #endif
1387 SCMP_ARCH_X32
1388 ))
1389 /* No _sysctl syscall */
1390 continue;
1391
1392 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1393 if (r < 0)
1394 return r;
1395
1396 r = seccomp_rule_add_exact(
1397 seccomp,
1398 SCMP_ACT_ERRNO(EPERM),
1399 SCMP_SYS(_sysctl),
1400 0);
1401 if (r < 0) {
1402 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1403 continue;
1404 }
1405
1406 r = seccomp_load(seccomp);
1407 if (r < 0) {
1408 if (ERRNO_IS_SECCOMP_FATAL(r))
1409 return r;
1410 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1411 }
1412 }
1413
1414 return 0;
1415 }
1416
1417 int seccomp_protect_syslog(void) {
1418 uint32_t arch;
1419 int r;
1420
1421 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1422 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1423
1424 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1425 if (r < 0)
1426 return r;
1427
1428 r = seccomp_rule_add_exact(
1429 seccomp,
1430 SCMP_ACT_ERRNO(EPERM),
1431 SCMP_SYS(syslog),
1432 0);
1433
1434 if (r < 0) {
1435 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1436 continue;
1437 }
1438
1439 r = seccomp_load(seccomp);
1440 if (r < 0) {
1441 if (ERRNO_IS_SECCOMP_FATAL(r))
1442 return r;
1443 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1444 }
1445 }
1446
1447 return 0;
1448 }
1449
1450 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1451 uint32_t arch;
1452 int r;
1453
1454 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1455 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1456 bool supported;
1457
1458 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1459
1460 switch (arch) {
1461
1462 case SCMP_ARCH_X86_64:
1463 case SCMP_ARCH_X32:
1464 case SCMP_ARCH_ARM:
1465 case SCMP_ARCH_AARCH64:
1466 case SCMP_ARCH_MIPSEL64N32:
1467 case SCMP_ARCH_MIPS64N32:
1468 case SCMP_ARCH_MIPSEL64:
1469 case SCMP_ARCH_MIPS64:
1470 #ifdef SCMP_ARCH_RISCV64
1471 case SCMP_ARCH_RISCV64:
1472 #endif
1473 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1474 supported = true;
1475 break;
1476
1477 case SCMP_ARCH_S390:
1478 case SCMP_ARCH_S390X:
1479 case SCMP_ARCH_X86:
1480 case SCMP_ARCH_MIPSEL:
1481 case SCMP_ARCH_MIPS:
1482 #ifdef SCMP_ARCH_PARISC
1483 case SCMP_ARCH_PARISC:
1484 #endif
1485 #ifdef SCMP_ARCH_PARISC64
1486 case SCMP_ARCH_PARISC64:
1487 #endif
1488 case SCMP_ARCH_PPC:
1489 case SCMP_ARCH_PPC64:
1490 case SCMP_ARCH_PPC64LE:
1491 default:
1492 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1493 * don't know */
1494 supported = false;
1495 break;
1496 }
1497
1498 if (!supported)
1499 continue;
1500
1501 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1502 if (r < 0)
1503 return r;
1504
1505 if (allow_list) {
1506 int first = 0, last = 0;
1507 void *afp;
1508
1509 /* If this is an allow list, we first block the address families that are out of
1510 * range and then everything that is not in the set. First, we find the lowest and
1511 * highest address family in the set. */
1512
1513 SET_FOREACH(afp, address_families) {
1514 int af = PTR_TO_INT(afp);
1515
1516 if (af <= 0 || af >= af_max())
1517 continue;
1518
1519 if (first == 0 || af < first)
1520 first = af;
1521
1522 if (last == 0 || af > last)
1523 last = af;
1524 }
1525
1526 assert((first == 0) == (last == 0));
1527
1528 if (first == 0) {
1529
1530 /* No entries in the valid range, block everything */
1531 r = seccomp_rule_add_exact(
1532 seccomp,
1533 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1534 SCMP_SYS(socket),
1535 0);
1536 if (r < 0) {
1537 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1538 continue;
1539 }
1540
1541 } else {
1542
1543 /* Block everything below the first entry */
1544 r = seccomp_rule_add_exact(
1545 seccomp,
1546 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1547 SCMP_SYS(socket),
1548 1,
1549 SCMP_A0(SCMP_CMP_LT, first));
1550 if (r < 0) {
1551 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1552 continue;
1553 }
1554
1555 /* Block everything above the last entry */
1556 r = seccomp_rule_add_exact(
1557 seccomp,
1558 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1559 SCMP_SYS(socket),
1560 1,
1561 SCMP_A0(SCMP_CMP_GT, last));
1562 if (r < 0) {
1563 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1564 continue;
1565 }
1566
1567 /* Block everything between the first and last entry */
1568 for (int af = 1; af < af_max(); af++) {
1569
1570 if (set_contains(address_families, INT_TO_PTR(af)))
1571 continue;
1572
1573 r = seccomp_rule_add_exact(
1574 seccomp,
1575 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1576 SCMP_SYS(socket),
1577 1,
1578 SCMP_A0(SCMP_CMP_EQ, af));
1579 if (r < 0)
1580 break;
1581 }
1582 if (r < 0) {
1583 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1584 continue;
1585 }
1586 }
1587
1588 } else {
1589 void *af;
1590
1591 /* If this is a deny list, then generate one rule for each address family that are
1592 * then combined in OR checks. */
1593
1594 SET_FOREACH(af, address_families) {
1595 r = seccomp_rule_add_exact(
1596 seccomp,
1597 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1598 SCMP_SYS(socket),
1599 1,
1600 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1601 if (r < 0)
1602 break;
1603 }
1604 if (r < 0) {
1605 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1606 continue;
1607 }
1608 }
1609
1610 r = seccomp_load(seccomp);
1611 if (r < 0) {
1612 if (ERRNO_IS_SECCOMP_FATAL(r))
1613 return r;
1614 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1615 }
1616 }
1617
1618 return 0;
1619 }
1620
1621 int seccomp_restrict_realtime_full(int error_code) {
1622 static const int permitted_policies[] = {
1623 SCHED_OTHER,
1624 SCHED_BATCH,
1625 SCHED_IDLE,
1626 };
1627
1628 int r, max_policy = 0;
1629 uint32_t arch;
1630 unsigned i;
1631
1632 assert(error_code > 0);
1633
1634 /* Determine the highest policy constant we want to allow */
1635 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1636 if (permitted_policies[i] > max_policy)
1637 max_policy = permitted_policies[i];
1638
1639 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1640 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1641 int p;
1642
1643 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1644
1645 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1646 if (r < 0)
1647 return r;
1648
1649 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1650 * allow list. */
1651 for (p = 0; p < max_policy; p++) {
1652 bool good = false;
1653
1654 /* Check if this is in the allow list. */
1655 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1656 if (permitted_policies[i] == p) {
1657 good = true;
1658 break;
1659 }
1660
1661 if (good)
1662 continue;
1663
1664 /* Deny this policy */
1665 r = seccomp_rule_add_exact(
1666 seccomp,
1667 SCMP_ACT_ERRNO(error_code),
1668 SCMP_SYS(sched_setscheduler),
1669 1,
1670 SCMP_A1(SCMP_CMP_EQ, p));
1671 if (r < 0) {
1672 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1673 continue;
1674 }
1675 }
1676
1677 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1678 * are unsigned here, hence no need no check for < 0 values. */
1679 r = seccomp_rule_add_exact(
1680 seccomp,
1681 SCMP_ACT_ERRNO(error_code),
1682 SCMP_SYS(sched_setscheduler),
1683 1,
1684 SCMP_A1(SCMP_CMP_GT, max_policy));
1685 if (r < 0) {
1686 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1687 continue;
1688 }
1689
1690 r = seccomp_load(seccomp);
1691 if (r < 0) {
1692 if (ERRNO_IS_SECCOMP_FATAL(r))
1693 return r;
1694 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1695 }
1696 }
1697
1698 return 0;
1699 }
1700
1701 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1702 uint32_t arch,
1703 int nr,
1704 unsigned arg_cnt,
1705 const struct scmp_arg_cmp arg) {
1706 int r;
1707
1708 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1709 if (r < 0) {
1710 _cleanup_free_ char *n = NULL;
1711
1712 n = seccomp_syscall_resolve_num_arch(arch, nr);
1713 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1714 strna(n),
1715 seccomp_arch_to_string(arch));
1716 }
1717
1718 return r;
1719 }
1720
1721 /* For known architectures, check that syscalls are indeed defined or not. */
1722 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
1723 assert_cc(SCMP_SYS(shmget) > 0);
1724 assert_cc(SCMP_SYS(shmat) > 0);
1725 assert_cc(SCMP_SYS(shmdt) > 0);
1726 #endif
1727
1728 int seccomp_memory_deny_write_execute(void) {
1729 uint32_t arch;
1730 unsigned loaded = 0;
1731
1732 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1733 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1734 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1735
1736 log_trace("Operating on architecture: %s", seccomp_arch_to_string(arch));
1737
1738 switch (arch) {
1739
1740 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1741 * We ignore that here, which means there's still a way to get writable/executable
1742 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1743 *
1744 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1745 * on that front (kernel work done in 5.18).
1746 */
1747
1748 case SCMP_ARCH_X86:
1749 case SCMP_ARCH_S390:
1750 filter_syscall = SCMP_SYS(mmap2);
1751 block_syscall = SCMP_SYS(mmap);
1752 /* shmat multiplexed, see above */
1753 break;
1754
1755 case SCMP_ARCH_PPC:
1756 case SCMP_ARCH_PPC64:
1757 case SCMP_ARCH_PPC64LE:
1758 case SCMP_ARCH_S390X:
1759 filter_syscall = SCMP_SYS(mmap);
1760 /* shmat multiplexed, see above */
1761 break;
1762
1763 case SCMP_ARCH_ARM:
1764 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1765 shmat_syscall = SCMP_SYS(shmat);
1766 break;
1767
1768 case SCMP_ARCH_X86_64:
1769 case SCMP_ARCH_X32:
1770 case SCMP_ARCH_AARCH64:
1771 #ifdef SCMP_ARCH_RISCV64
1772 case SCMP_ARCH_RISCV64:
1773 #endif
1774 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1775 shmat_syscall = SCMP_SYS(shmat);
1776 break;
1777
1778 /* Please add more definitions here, if you port systemd to other architectures! */
1779
1780 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1781 #warning "Consider adding the right mmap() syscall definitions here!"
1782 #endif
1783 }
1784
1785 /* Can't filter mmap() on this arch, then skip it */
1786 if (filter_syscall == 0)
1787 continue;
1788
1789 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1790 if (r < 0)
1791 return r;
1792
1793 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1794 1,
1795 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1796 if (r < 0)
1797 continue;
1798
1799 if (block_syscall != 0) {
1800 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1801 if (r < 0)
1802 continue;
1803 }
1804
1805 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1806 1,
1807 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1808 if (r < 0)
1809 continue;
1810
1811 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1812 1,
1813 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1814 if (r < 0)
1815 continue;
1816
1817 if (shmat_syscall > 0) {
1818 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1819 1,
1820 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1821 if (r < 0)
1822 continue;
1823 }
1824
1825 r = seccomp_load(seccomp);
1826 if (r < 0) {
1827 if (ERRNO_IS_SECCOMP_FATAL(r))
1828 return r;
1829 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1830 seccomp_arch_to_string(arch));
1831 }
1832 loaded++;
1833 }
1834
1835 if (loaded == 0)
1836 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1837
1838 return loaded;
1839 }
1840
1841 int seccomp_restrict_archs(Set *archs) {
1842 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1843 int r;
1844 bool blocked_new = false;
1845
1846 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1847 * list.
1848 *
1849 * There are some qualifications. However the most important use is to stop processes from bypassing
1850 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1851 * in a non-native architecture. There are no holes in this use case, at least so far. */
1852
1853 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1854 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1855 * to run a program with the restrictions applied. */
1856 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1857 if (!seccomp)
1858 return -ENOMEM;
1859
1860 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1861 uint32_t arch = seccomp_local_archs[i];
1862
1863 /* See above comment, our "native" architecture is never blocked. */
1864 if (arch == seccomp_arch_native())
1865 continue;
1866
1867 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1868 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1869 continue;
1870
1871 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1872
1873 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1874 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1875 * The important thing is that you can block the old 32-bit x86 syscalls.
1876 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1877 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1878 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1879
1880 if (block) {
1881 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1882 blocked_new = true;
1883 } else {
1884 r = seccomp_arch_add(seccomp, arch);
1885 if (r < 0 && r != -EEXIST)
1886 return r;
1887 }
1888 }
1889
1890 /* All architectures that will be blocked by the seccomp program were
1891 * already blocked. */
1892 if (!blocked_new)
1893 return 0;
1894
1895 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1896 if (r < 0)
1897 return r;
1898
1899 r = seccomp_load(seccomp);
1900 if (r < 0) {
1901 if (ERRNO_IS_SECCOMP_FATAL(r))
1902 return r;
1903 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1904 }
1905
1906 return 0;
1907 }
1908
1909 int parse_syscall_archs(char **l, Set **ret_archs) {
1910 _cleanup_set_free_ Set *archs = NULL;
1911 int r;
1912
1913 assert(l);
1914 assert(ret_archs);
1915
1916 STRV_FOREACH(s, l) {
1917 uint32_t a;
1918
1919 r = seccomp_arch_from_string(*s, &a);
1920 if (r < 0)
1921 return -EINVAL;
1922
1923 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1924 if (r < 0)
1925 return -ENOMEM;
1926 }
1927
1928 *ret_archs = TAKE_PTR(archs);
1929 return 0;
1930 }
1931
1932 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1933 int r;
1934
1935 assert(set);
1936
1937 NULSTR_FOREACH(i, set->value) {
1938
1939 if (i[0] == '@') {
1940 const SyscallFilterSet *more;
1941
1942 more = syscall_filter_set_find(i);
1943 if (!more)
1944 return -ENXIO;
1945
1946 r = seccomp_filter_set_add(filter, add, more);
1947 if (r < 0)
1948 return r;
1949 } else {
1950 int id;
1951
1952 id = seccomp_syscall_resolve_name(i);
1953 if (id == __NR_SCMP_ERROR) {
1954 log_debug("Couldn't resolve system call, ignoring: %s", i);
1955 continue;
1956 }
1957
1958 if (add) {
1959 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1960 if (r < 0)
1961 return r;
1962 } else
1963 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1964 }
1965 }
1966
1967 return 0;
1968 }
1969
1970 int seccomp_lock_personality(unsigned long personality) {
1971 uint32_t arch;
1972 int r;
1973
1974 if (personality >= PERSONALITY_INVALID)
1975 return -EINVAL;
1976
1977 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1978 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1979
1980 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1981 if (r < 0)
1982 return r;
1983
1984 r = seccomp_rule_add_exact(
1985 seccomp,
1986 SCMP_ACT_ERRNO(EPERM),
1987 SCMP_SYS(personality),
1988 1,
1989 SCMP_A0(SCMP_CMP_NE, personality));
1990 if (r < 0) {
1991 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1992 continue;
1993 }
1994
1995 r = seccomp_load(seccomp);
1996 if (r < 0) {
1997 if (ERRNO_IS_SECCOMP_FATAL(r))
1998 return r;
1999 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2000 }
2001 }
2002
2003 return 0;
2004 }
2005
2006 int seccomp_protect_hostname(void) {
2007 uint32_t arch;
2008 int r;
2009
2010 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2011 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2012
2013 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2014 if (r < 0)
2015 return r;
2016
2017 r = seccomp_rule_add_exact(
2018 seccomp,
2019 SCMP_ACT_ERRNO(EPERM),
2020 SCMP_SYS(sethostname),
2021 0);
2022 if (r < 0) {
2023 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2024 continue;
2025 }
2026
2027 r = seccomp_rule_add_exact(
2028 seccomp,
2029 SCMP_ACT_ERRNO(EPERM),
2030 SCMP_SYS(setdomainname),
2031 0);
2032 if (r < 0) {
2033 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2034 continue;
2035 }
2036
2037 r = seccomp_load(seccomp);
2038 if (r < 0) {
2039 if (ERRNO_IS_SECCOMP_FATAL(r))
2040 return r;
2041 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2042 }
2043 }
2044
2045 return 0;
2046 }
2047
2048 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2049 /* Checks the mode_t parameter of the following system calls:
2050 *
2051 * → chmod() + fchmod() + fchmodat()
2052 * → open() + creat() + openat()
2053 * → mkdir() + mkdirat()
2054 * → mknod() + mknodat()
2055 *
2056 * Returns error if *everything* failed, and 0 otherwise.
2057 */
2058 int r;
2059 bool any = false;
2060
2061 r = seccomp_rule_add_exact(
2062 seccomp,
2063 SCMP_ACT_ERRNO(EPERM),
2064 SCMP_SYS(chmod),
2065 1,
2066 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2067 if (r < 0)
2068 log_debug_errno(r, "Failed to add filter for chmod: %m");
2069 else
2070 any = true;
2071
2072 r = seccomp_rule_add_exact(
2073 seccomp,
2074 SCMP_ACT_ERRNO(EPERM),
2075 SCMP_SYS(fchmod),
2076 1,
2077 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2078 if (r < 0)
2079 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2080 else
2081 any = true;
2082
2083 r = seccomp_rule_add_exact(
2084 seccomp,
2085 SCMP_ACT_ERRNO(EPERM),
2086 SCMP_SYS(fchmodat),
2087 1,
2088 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2089 if (r < 0)
2090 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2091 else
2092 any = true;
2093
2094 r = seccomp_rule_add_exact(
2095 seccomp,
2096 SCMP_ACT_ERRNO(EPERM),
2097 SCMP_SYS(mkdir),
2098 1,
2099 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2100 if (r < 0)
2101 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2102 else
2103 any = true;
2104
2105 r = seccomp_rule_add_exact(
2106 seccomp,
2107 SCMP_ACT_ERRNO(EPERM),
2108 SCMP_SYS(mkdirat),
2109 1,
2110 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2111 if (r < 0)
2112 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2113 else
2114 any = true;
2115
2116 r = seccomp_rule_add_exact(
2117 seccomp,
2118 SCMP_ACT_ERRNO(EPERM),
2119 SCMP_SYS(mknod),
2120 1,
2121 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2122 if (r < 0)
2123 log_debug_errno(r, "Failed to add filter for mknod: %m");
2124 else
2125 any = true;
2126
2127 r = seccomp_rule_add_exact(
2128 seccomp,
2129 SCMP_ACT_ERRNO(EPERM),
2130 SCMP_SYS(mknodat),
2131 1,
2132 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2133 if (r < 0)
2134 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2135 else
2136 any = true;
2137
2138 r = seccomp_rule_add_exact(
2139 seccomp,
2140 SCMP_ACT_ERRNO(EPERM),
2141 SCMP_SYS(open),
2142 2,
2143 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2144 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2145 if (r < 0)
2146 log_debug_errno(r, "Failed to add filter for open: %m");
2147 else
2148 any = true;
2149
2150 r = seccomp_rule_add_exact(
2151 seccomp,
2152 SCMP_ACT_ERRNO(EPERM),
2153 SCMP_SYS(openat),
2154 2,
2155 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2156 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2157 if (r < 0)
2158 log_debug_errno(r, "Failed to add filter for openat: %m");
2159 else
2160 any = true;
2161
2162 #if defined(__SNR_openat2)
2163 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2164 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2165 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2166 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2167 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2168 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2169 r = seccomp_rule_add_exact(
2170 seccomp,
2171 SCMP_ACT_ERRNO(ENOSYS),
2172 SCMP_SYS(openat2),
2173 0);
2174 if (r < 0)
2175 log_debug_errno(r, "Failed to add filter for openat2: %m");
2176 else
2177 any = true;
2178 #endif
2179
2180 r = seccomp_rule_add_exact(
2181 seccomp,
2182 SCMP_ACT_ERRNO(EPERM),
2183 SCMP_SYS(creat),
2184 1,
2185 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2186 if (r < 0)
2187 log_debug_errno(r, "Failed to add filter for creat: %m");
2188 else
2189 any = true;
2190
2191 return any ? 0 : r;
2192 }
2193
2194 int seccomp_restrict_suid_sgid(void) {
2195 uint32_t arch;
2196 int r, k;
2197
2198 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2199 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2200
2201 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2202 if (r < 0)
2203 return r;
2204
2205 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2206 if (r < 0)
2207 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2208
2209 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2210 if (k < 0)
2211 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2212
2213 if (r < 0 && k < 0)
2214 continue;
2215
2216 r = seccomp_load(seccomp);
2217 if (r < 0) {
2218 if (ERRNO_IS_SECCOMP_FATAL(r))
2219 return r;
2220 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2221 }
2222 }
2223
2224 return 0;
2225 }
2226
2227 uint32_t scmp_act_kill_process(void) {
2228
2229 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2230 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2231 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2232 * for single-threaded apps does the right thing. */
2233
2234 #ifdef SCMP_ACT_KILL_PROCESS
2235 if (seccomp_api_get() >= 3)
2236 return SCMP_ACT_KILL_PROCESS;
2237 #endif
2238
2239 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2240 }
2241
2242 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2243 _cleanup_free_ char *n = NULL;
2244 char *p;
2245 int e = -1;
2246
2247 assert(in);
2248 assert(name);
2249 assert(error);
2250
2251 /*
2252 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2253 * If errno is omitted, then error is set to -1.
2254 * Empty syscall name is not allowed.
2255 * Here, we do not check that the syscall name is valid or not.
2256 */
2257
2258 p = strchr(in, ':');
2259 if (p) {
2260 e = seccomp_parse_errno_or_action(p + 1);
2261 if (e < 0)
2262 return e;
2263
2264 n = strndup(in, p - in);
2265 } else
2266 n = strdup(in);
2267
2268 if (!n)
2269 return -ENOMEM;
2270
2271 if (isempty(n))
2272 return -EINVAL;
2273
2274 *error = e;
2275 *name = TAKE_PTR(n);
2276
2277 return 0;
2278 }
2279
2280 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2281 bool any = false;
2282 int r;
2283
2284 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2285 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2286
2287 r = seccomp_rule_add_exact(
2288 seccomp,
2289 SCMP_ACT_ERRNO(EINVAL),
2290 SCMP_SYS(open),
2291 1,
2292 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2293 if (r < 0)
2294 log_debug_errno(r, "Failed to add filter for open: %m");
2295 else
2296 any = true;
2297
2298 r = seccomp_rule_add_exact(
2299 seccomp,
2300 SCMP_ACT_ERRNO(EINVAL),
2301 SCMP_SYS(openat),
2302 1,
2303 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2304 if (r < 0)
2305 log_debug_errno(r, "Failed to add filter for openat: %m");
2306 else
2307 any = true;
2308
2309 #if defined(__SNR_openat2)
2310 /* The new openat2() system call can't be filtered sensibly, see above. */
2311 r = seccomp_rule_add_exact(
2312 seccomp,
2313 SCMP_ACT_ERRNO(ENOSYS),
2314 SCMP_SYS(openat2),
2315 0);
2316 if (r < 0)
2317 log_debug_errno(r, "Failed to add filter for openat2: %m");
2318 else
2319 any = true;
2320 #endif
2321
2322 return any ? 0 : r;
2323 }
2324
2325 int seccomp_suppress_sync(void) {
2326 uint32_t arch;
2327 int r;
2328
2329 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2330 * manageable, and also masks O_SYNC/O_DSYNC */
2331
2332 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2333 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2334
2335 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2336 if (r < 0)
2337 return r;
2338
2339 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2340 int id;
2341
2342 id = seccomp_syscall_resolve_name(c);
2343 if (id == __NR_SCMP_ERROR) {
2344 log_debug("System call %s is not known, ignoring.", c);
2345 continue;
2346 }
2347
2348 r = seccomp_rule_add_exact(
2349 seccomp,
2350 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2351 id,
2352 0);
2353 if (r < 0)
2354 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2355 }
2356
2357 (void) block_open_flag(seccomp, O_SYNC);
2358 #if O_DSYNC != O_SYNC
2359 (void) block_open_flag(seccomp, O_DSYNC);
2360 #endif
2361
2362 r = seccomp_load(seccomp);
2363 if (r < 0) {
2364 if (ERRNO_IS_SECCOMP_FATAL(r))
2365 return r;
2366 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2367 }
2368 }
2369
2370 return 0;
2371 }