]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #24670 from keszybz/early-boot-ordering
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10 #include <sys/stat.h>
11
12 /* include missing_syscall_def.h earlier to make __SNR_foo mapped to __NR_foo. */
13 #include "missing_syscall_def.h"
14 #include <seccomp.h>
15
16 #include "af-list.h"
17 #include "alloc-util.h"
18 #include "env-util.h"
19 #include "errno-list.h"
20 #include "macro.h"
21 #include "nsflags.h"
22 #include "nulstr-util.h"
23 #include "process-util.h"
24 #include "seccomp-util.h"
25 #include "set.h"
26 #include "string-util.h"
27 #include "strv.h"
28
/* The list of seccomp architectures handled on this build, one variant picked at compile time by the
 * preprocessor conditionals below. The list is always terminated by SECCOMP_LOCAL_ARCH_END and is what
 * the SECCOMP_FOREACH_LOCAL_ARCH() loops in this file walk over.
 *
 * This array will be modified at runtime as seccomp_restrict_archs is called. */
uint32_t seccomp_local_archs[] = {

        /* Note: always list the native arch we are compiled as last, so that users can deny-list seccomp(), but our own calls to it still succeed */

#if defined(__x86_64__) && defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X86_64,
                SCMP_ARCH_X32,         /* native */
#elif defined(__x86_64__) && !defined(__ILP32__)
                SCMP_ARCH_X86,
                SCMP_ARCH_X32,
                SCMP_ARCH_X86_64,      /* native */
#elif defined(__i386__)
                /* single entry: only the native architecture is listed */
                SCMP_ARCH_X86,
#elif defined(__aarch64__)
                SCMP_ARCH_ARM,
                SCMP_ARCH_AARCH64,     /* native */
#elif defined(__arm__)
                /* single entry: only the native architecture is listed */
                SCMP_ARCH_ARM,
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,        /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,      /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,    /* native */
#elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64N32,
                SCMP_ARCH_MIPS64N32,   /* native */
#elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
                SCMP_ARCH_MIPS,
                SCMP_ARCH_MIPSEL,
                SCMP_ARCH_MIPS64,
                SCMP_ARCH_MIPSEL64,
                SCMP_ARCH_MIPS64N32,
                SCMP_ARCH_MIPSEL64N32, /* native */
#elif defined(__hppa64__) && defined(SCMP_ARCH_PARISC) && defined(SCMP_ARCH_PARISC64)
                /* The PARISC tokens are only used if the libseccomp headers define them. */
                SCMP_ARCH_PARISC,
                SCMP_ARCH_PARISC64,    /* native */
#elif defined(__hppa__) && defined(SCMP_ARCH_PARISC)
                SCMP_ARCH_PARISC,
#elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64LE,
                SCMP_ARCH_PPC64,       /* native */
#elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
                SCMP_ARCH_PPC,
                SCMP_ARCH_PPC64,
                SCMP_ARCH_PPC64LE,     /* native */
#elif defined(__powerpc__)
                SCMP_ARCH_PPC,
#elif defined(__riscv) && __riscv_xlen == 64 && defined(SCMP_ARCH_RISCV64)
                /* Only used if the libseccomp headers define SCMP_ARCH_RISCV64. */
                SCMP_ARCH_RISCV64,
#elif defined(__s390x__)
                SCMP_ARCH_S390,
                SCMP_ARCH_S390X,       /* native */
#elif defined(__s390__)
                SCMP_ARCH_S390,
#endif
                /* Terminator; on a platform matching none of the branches above the list holds only this. */
                SECCOMP_LOCAL_ARCH_END
        };
108
109 const char* seccomp_arch_to_string(uint32_t c) {
110 /* Maintain order used in <seccomp.h>.
111 *
112 * Names used here should be the same as those used for ConditionArchitecture=,
113 * except for "subarchitectures" like x32. */
114
115 switch (c) {
116 case SCMP_ARCH_NATIVE:
117 return "native";
118 case SCMP_ARCH_X86:
119 return "x86";
120 case SCMP_ARCH_X86_64:
121 return "x86-64";
122 case SCMP_ARCH_X32:
123 return "x32";
124 case SCMP_ARCH_ARM:
125 return "arm";
126 case SCMP_ARCH_AARCH64:
127 return "arm64";
128 case SCMP_ARCH_MIPS:
129 return "mips";
130 case SCMP_ARCH_MIPS64:
131 return "mips64";
132 case SCMP_ARCH_MIPS64N32:
133 return "mips64-n32";
134 case SCMP_ARCH_MIPSEL:
135 return "mips-le";
136 case SCMP_ARCH_MIPSEL64:
137 return "mips64-le";
138 case SCMP_ARCH_MIPSEL64N32:
139 return "mips64-le-n32";
140 #ifdef SCMP_ARCH_PARISC
141 case SCMP_ARCH_PARISC:
142 return "parisc";
143 #endif
144 #ifdef SCMP_ARCH_PARISC64
145 case SCMP_ARCH_PARISC64:
146 return "parisc64";
147 #endif
148 case SCMP_ARCH_PPC:
149 return "ppc";
150 case SCMP_ARCH_PPC64:
151 return "ppc64";
152 case SCMP_ARCH_PPC64LE:
153 return "ppc64-le";
154 #ifdef SCMP_ARCH_RISCV64
155 case SCMP_ARCH_RISCV64:
156 return "riscv64";
157 #endif
158 case SCMP_ARCH_S390:
159 return "s390";
160 case SCMP_ARCH_S390X:
161 return "s390x";
162 default:
163 return NULL;
164 }
165 }
166
167 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
168 if (!n)
169 return -EINVAL;
170
171 assert(ret);
172
173 if (streq(n, "native"))
174 *ret = SCMP_ARCH_NATIVE;
175 else if (streq(n, "x86"))
176 *ret = SCMP_ARCH_X86;
177 else if (streq(n, "x86-64"))
178 *ret = SCMP_ARCH_X86_64;
179 else if (streq(n, "x32"))
180 *ret = SCMP_ARCH_X32;
181 else if (streq(n, "arm"))
182 *ret = SCMP_ARCH_ARM;
183 else if (streq(n, "arm64"))
184 *ret = SCMP_ARCH_AARCH64;
185 else if (streq(n, "mips"))
186 *ret = SCMP_ARCH_MIPS;
187 else if (streq(n, "mips64"))
188 *ret = SCMP_ARCH_MIPS64;
189 else if (streq(n, "mips64-n32"))
190 *ret = SCMP_ARCH_MIPS64N32;
191 else if (streq(n, "mips-le"))
192 *ret = SCMP_ARCH_MIPSEL;
193 else if (streq(n, "mips64-le"))
194 *ret = SCMP_ARCH_MIPSEL64;
195 else if (streq(n, "mips64-le-n32"))
196 *ret = SCMP_ARCH_MIPSEL64N32;
197 #ifdef SCMP_ARCH_PARISC
198 else if (streq(n, "parisc"))
199 *ret = SCMP_ARCH_PARISC;
200 #endif
201 #ifdef SCMP_ARCH_PARISC64
202 else if (streq(n, "parisc64"))
203 *ret = SCMP_ARCH_PARISC64;
204 #endif
205 else if (streq(n, "ppc"))
206 *ret = SCMP_ARCH_PPC;
207 else if (streq(n, "ppc64"))
208 *ret = SCMP_ARCH_PPC64;
209 else if (streq(n, "ppc64-le"))
210 *ret = SCMP_ARCH_PPC64LE;
211 #ifdef SCMP_ARCH_RISCV64
212 else if (streq(n, "riscv64"))
213 *ret = SCMP_ARCH_RISCV64;
214 #endif
215 else if (streq(n, "s390"))
216 *ret = SCMP_ARCH_S390;
217 else if (streq(n, "s390x"))
218 *ret = SCMP_ARCH_S390X;
219 else
220 return -EINVAL;
221
222 return 0;
223 }
224
225 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
226 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
227 int r;
228
229 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
230 * any others. Also, turns off the NNP fiddling. */
231
232 seccomp = seccomp_init(default_action);
233 if (!seccomp)
234 return -ENOMEM;
235
236 if (arch != SCMP_ARCH_NATIVE &&
237 arch != seccomp_arch_native()) {
238
239 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
240 if (r < 0)
241 return r;
242
243 r = seccomp_arch_add(seccomp, arch);
244 if (r < 0)
245 return r;
246
247 assert(seccomp_arch_exist(seccomp, arch) >= 0);
248 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
249 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
250 } else {
251 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
252 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
253 }
254
255 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
256 if (r < 0)
257 return r;
258
259 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
260 if (r < 0)
261 return r;
262
263 #if SCMP_VER_MAJOR >= 3 || (SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 4)
264 if (getenv_bool("SYSTEMD_LOG_SECCOMP") > 0) {
265 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_LOG, 1);
266 if (r < 0)
267 log_debug_errno(r, "Failed to enable seccomp event logging: %m");
268 }
269 #endif
270
271 *ret = TAKE_PTR(seccomp);
272 return 0;
273 }
274
/* Checks whether querying the seccomp mode via prctl(PR_GET_SECCOMP) succeeds, i.e. returns a
 * non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
278
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter program: the probe counts
 * as positive only if the call fails and errno is EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
283
bool is_seccomp_available(void) {
        /* Reports whether seccomp filtering can be used. The answer is computed on the first call and
         * remembered in a static variable afterwards. Setting $SYSTEMD_SECCOMP to a false value
         * disables seccomp without probing the kernel. */

        static int cache = -1;
        int b;

        if (cache >= 0)
                return cache;

        b = getenv_bool_secure("SYSTEMD_SECCOMP");
        if (b == 0) {
                /* Explicitly turned off via the environment. */
                cache = false;
                return cache;
        }

        if (b < 0 && b != -ENXIO) /* ENXIO: env var unset */
                log_debug_errno(b, "Failed to parse $SYSTEMD_SECCOMP value, ignoring.");

        cache = is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cache;
}
304
/* Table of the named system call groups ("@name"), indexed by the SYSCALL_FILTER_SET_* constants.
 *
 * Each entry carries the group name (always starting with '@'), a short help text, and .value: a list of
 * member names where every member is terminated by "\0" (the users in this file walk it with
 * NULSTR_FOREACH()). A member that itself starts with '@' refers to another group of this table and is
 * resolved through syscall_filter_set_find(). */
const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
        [SYSCALL_FILTER_SET_DEFAULT] = {
                .name = "@default",
                .help = "System calls that are always permitted",
                .value =
                "arch_prctl\0"      /* Used during platform-specific initialization by ld-linux.so. */
                "brk\0"
                "cacheflush\0"
                "clock_getres\0"
                "clock_getres_time64\0"
                "clock_gettime\0"
                "clock_gettime64\0"
                "clock_nanosleep\0"
                "clock_nanosleep_time64\0"
                "execve\0"
                "exit\0"
                "exit_group\0"
                "futex\0"
                "futex_time64\0"
                "get_robust_list\0"
                "get_thread_area\0"
                "getegid\0"
                "getegid32\0"
                "geteuid\0"
                "geteuid32\0"
                "getgid\0"
                "getgid32\0"
                "getgroups\0"
                "getgroups32\0"
                "getpgid\0"
                "getpgrp\0"
                "getpid\0"
                "getppid\0"
                "getrandom\0"
                "getresgid\0"
                "getresgid32\0"
                "getresuid\0"
                "getresuid32\0"
                "getrlimit\0"      /* make sure processes can query stack size and such */
                "getsid\0"
                "gettid\0"
                "gettimeofday\0"
                "getuid\0"
                "getuid32\0"
                "membarrier\0"
                "mmap\0"
                "mmap2\0"
                "mprotect\0"
                "munmap\0"
                "nanosleep\0"
                "pause\0"
                "prlimit64\0"
                "restart_syscall\0"
                "rseq\0"
                "rt_sigreturn\0"
                "sched_getaffinity\0"
                "sched_yield\0"
                "set_robust_list\0"
                "set_thread_area\0"
                "set_tid_address\0"
                "set_tls\0"
                "sigreturn\0"
                "time\0"
                "ugetrlimit\0"
        },
        [SYSCALL_FILTER_SET_AIO] = {
                .name = "@aio",
                .help = "Asynchronous IO",
                .value =
                "io_cancel\0"
                "io_destroy\0"
                "io_getevents\0"
                "io_pgetevents\0"
                "io_pgetevents_time64\0"
                "io_setup\0"
                "io_submit\0"
                "io_uring_enter\0"
                "io_uring_register\0"
                "io_uring_setup\0"
        },
        [SYSCALL_FILTER_SET_BASIC_IO] = {
                .name = "@basic-io",
                .help = "Basic IO",
                .value =
                "_llseek\0"
                "close\0"
                "close_range\0"
                "dup\0"
                "dup2\0"
                "dup3\0"
                "lseek\0"
                "pread64\0"
                "preadv\0"
                "preadv2\0"
                "pwrite64\0"
                "pwritev\0"
                "pwritev2\0"
                "read\0"
                "readv\0"
                "write\0"
                "writev\0"
        },
        [SYSCALL_FILTER_SET_CHOWN] = {
                .name = "@chown",
                .help = "Change ownership of files and directories",
                .value =
                "chown\0"
                "chown32\0"
                "fchown\0"
                "fchown32\0"
                "fchownat\0"
                "lchown\0"
                "lchown32\0"
        },
        [SYSCALL_FILTER_SET_CLOCK] = {
                .name = "@clock",
                .help = "Change the system time",
                .value =
                "adjtimex\0"
                "clock_adjtime\0"
                "clock_adjtime64\0"
                "clock_settime\0"
                "clock_settime64\0"
                "settimeofday\0"
        },
        [SYSCALL_FILTER_SET_CPU_EMULATION] = {
                .name = "@cpu-emulation",
                .help = "System calls for CPU emulation functionality",
                .value =
                "modify_ldt\0"
                "subpage_prot\0"
                "switch_endian\0"
                "vm86\0"
                "vm86old\0"
        },
        [SYSCALL_FILTER_SET_DEBUG] = {
                .name = "@debug",
                .help = "Debugging, performance monitoring and tracing functionality",
                .value =
                "lookup_dcookie\0"
                "perf_event_open\0"
                "pidfd_getfd\0"
                "ptrace\0"
                "rtas\0"
#if defined __s390__ || defined __s390x__
                "s390_runtime_instr\0"
#endif
                "sys_debug_setcontext\0"
        },
        [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
                .name = "@file-system",
                .help = "File system operations",
                .value =
                "access\0"
                "chdir\0"
                "chmod\0"
                "close\0"
                "creat\0"
                "faccessat\0"
                "faccessat2\0"
                "fallocate\0"
                "fchdir\0"
                "fchmod\0"
                "fchmodat\0"
                "fcntl\0"
                "fcntl64\0"
                "fgetxattr\0"
                "flistxattr\0"
                "fremovexattr\0"
                "fsetxattr\0"
                "fstat\0"
                "fstat64\0"
                "fstatat64\0"
                "fstatfs\0"
                "fstatfs64\0"
                "ftruncate\0"
                "ftruncate64\0"
                "futimesat\0"
                "getcwd\0"
                "getdents\0"
                "getdents64\0"
                "getxattr\0"
                "inotify_add_watch\0"
                "inotify_init\0"
                "inotify_init1\0"
                "inotify_rm_watch\0"
                "lgetxattr\0"
                "link\0"
                "linkat\0"
                "listxattr\0"
                "llistxattr\0"
                "lremovexattr\0"
                "lsetxattr\0"
                "lstat\0"
                "lstat64\0"
                "mkdir\0"
                "mkdirat\0"
                "mknod\0"
                "mknodat\0"
                "newfstatat\0"
                "oldfstat\0"
                "oldlstat\0"
                "oldstat\0"
                "open\0"
                "openat\0"
                "openat2\0"
                "readlink\0"
                "readlinkat\0"
                "removexattr\0"
                "rename\0"
                "renameat\0"
                "renameat2\0"
                "rmdir\0"
                "setxattr\0"
                "stat\0"
                "stat64\0"
                "statfs\0"
                "statfs64\0"
                "statx\0"
                "symlink\0"
                "symlinkat\0"
                "truncate\0"
                "truncate64\0"
                "unlink\0"
                "unlinkat\0"
                "utime\0"
                "utimensat\0"
                "utimensat_time64\0"
                "utimes\0"
        },
        [SYSCALL_FILTER_SET_IO_EVENT] = {
                .name = "@io-event",
                .help = "Event loop system calls",
                .value =
                "_newselect\0"
                "epoll_create\0"
                "epoll_create1\0"
                "epoll_ctl\0"
                "epoll_ctl_old\0"
                "epoll_pwait\0"
                "epoll_pwait2\0"
                "epoll_wait\0"
                "epoll_wait_old\0"
                "eventfd\0"
                "eventfd2\0"
                "poll\0"
                "ppoll\0"
                "ppoll_time64\0"
                "pselect6\0"
                "pselect6_time64\0"
                "select\0"
        },
        [SYSCALL_FILTER_SET_IPC] = {
                .name = "@ipc",
                .help = "SysV IPC, POSIX Message Queues or other IPC",
                .value =
                "ipc\0"
                "memfd_create\0"
                "mq_getsetattr\0"
                "mq_notify\0"
                "mq_open\0"
                "mq_timedreceive\0"
                "mq_timedreceive_time64\0"
                "mq_timedsend\0"
                "mq_timedsend_time64\0"
                "mq_unlink\0"
                "msgctl\0"
                "msgget\0"
                "msgrcv\0"
                "msgsnd\0"
                "pipe\0"
                "pipe2\0"
                "process_madvise\0"
                "process_vm_readv\0"
                "process_vm_writev\0"
                "semctl\0"
                "semget\0"
                "semop\0"
                "semtimedop\0"
                "semtimedop_time64\0"
                "shmat\0"
                "shmctl\0"
                "shmdt\0"
                "shmget\0"
        },
        [SYSCALL_FILTER_SET_KEYRING] = {
                .name = "@keyring",
                .help = "Kernel keyring access",
                .value =
                "add_key\0"
                "keyctl\0"
                "request_key\0"
        },
        [SYSCALL_FILTER_SET_MEMLOCK] = {
                .name = "@memlock",
                .help = "Memory locking control",
                .value =
                "mlock\0"
                "mlock2\0"
                "mlockall\0"
                "munlock\0"
                "munlockall\0"
        },
        [SYSCALL_FILTER_SET_MODULE] = {
                .name = "@module",
                .help = "Loading and unloading of kernel modules",
                .value =
                "delete_module\0"
                "finit_module\0"
                "init_module\0"
        },
        [SYSCALL_FILTER_SET_MOUNT] = {
                .name = "@mount",
                .help = "Mounting and unmounting of file systems",
                .value =
                "chroot\0"
                "fsconfig\0"
                "fsmount\0"
                "fsopen\0"
                "fspick\0"
                "mount\0"
                "mount_setattr\0"
                "move_mount\0"
                "open_tree\0"
                "pivot_root\0"
                "umount\0"
                "umount2\0"
        },
        [SYSCALL_FILTER_SET_NETWORK_IO] = {
                .name = "@network-io",
                .help = "Network or Unix socket IO, should not be needed if not network facing",
                .value =
                "accept\0"
                "accept4\0"
                "bind\0"
                "connect\0"
                "getpeername\0"
                "getsockname\0"
                "getsockopt\0"
                "listen\0"
                "recv\0"
                "recvfrom\0"
                "recvmmsg\0"
                "recvmmsg_time64\0"
                "recvmsg\0"
                "send\0"
                "sendmmsg\0"
                "sendmsg\0"
                "sendto\0"
                "setsockopt\0"
                "shutdown\0"
                "socket\0"
                "socketcall\0"
                "socketpair\0"
        },
        [SYSCALL_FILTER_SET_OBSOLETE] = {
                /* some unknown even to libseccomp */
                .name = "@obsolete",
                .help = "Unusual, obsolete or unimplemented system calls",
                .value =
                "_sysctl\0"
                "afs_syscall\0"
                "bdflush\0"
                "break\0"
                "create_module\0"
                "ftime\0"
                "get_kernel_syms\0"
                "getpmsg\0"
                "gtty\0"
                "idle\0"
                "lock\0"
                "mpx\0"
                "prof\0"
                "profil\0"
                "putpmsg\0"
                "query_module\0"
                "security\0"
                "sgetmask\0"
                "ssetmask\0"
                "stime\0"
                "stty\0"
                "sysfs\0"
                "tuxcall\0"
                "ulimit\0"
                "uselib\0"
                "ustat\0"
                "vserver\0"
        },
        [SYSCALL_FILTER_SET_PKEY] = {
                .name = "@pkey",
                .help = "System calls used for memory protection keys",
                .value =
                "pkey_alloc\0"
                "pkey_free\0"
                "pkey_mprotect\0"
        },
        [SYSCALL_FILTER_SET_PRIVILEGED] = {
                .name = "@privileged",
                .help = "All system calls which need super-user capabilities",
                .value =
                "@chown\0"
                "@clock\0"
                "@module\0"
                "@raw-io\0"
                "@reboot\0"
                "@swap\0"
                "_sysctl\0"
                "acct\0"
                "bpf\0"
                "capset\0"
                "chroot\0"
                "fanotify_init\0"
                "fanotify_mark\0"
                "nfsservctl\0"
                "open_by_handle_at\0"
                "pivot_root\0"
                "quotactl\0"
                "setdomainname\0"
                "setfsuid\0"
                "setfsuid32\0"
                "setgroups\0"
                "setgroups32\0"
                "sethostname\0"
                "setresuid\0"
                "setresuid32\0"
                "setreuid\0"
                "setreuid32\0"
                "setuid\0"      /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
                "setuid32\0"
                "vhangup\0"
        },
        [SYSCALL_FILTER_SET_PROCESS] = {
                .name = "@process",
                .help = "Process control, execution, namespacing operations",
                .value =
                "capget\0"      /* Able to query arbitrary processes */
                "clone\0"
                /* ia64 as the only architecture has clone2, a replacement for clone, but ia64 doesn't
                 * implement seccomp, so we don't need to list it at all. C.f.
                 * acce2f71779c54086962fefce3833d886c655f62 in the kernel. */
                "clone3\0"
                "execveat\0"
                "fork\0"
                "getrusage\0"
                "kill\0"
                "pidfd_open\0"
                "pidfd_send_signal\0"
                "prctl\0"
                "rt_sigqueueinfo\0"
                "rt_tgsigqueueinfo\0"
                "setns\0"
                "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
                "tgkill\0"
                "times\0"
                "tkill\0"
                "unshare\0"
                "vfork\0"
                "wait4\0"
                "waitid\0"
                "waitpid\0"
        },
        [SYSCALL_FILTER_SET_RAW_IO] = {
                .name = "@raw-io",
                .help = "Raw I/O port access",
                .value =
                "ioperm\0"
                "iopl\0"
                "pciconfig_iobase\0"
                "pciconfig_read\0"
                "pciconfig_write\0"
#if defined __s390__ || defined __s390x__
                "s390_pci_mmio_read\0"
                "s390_pci_mmio_write\0"
#endif
        },
        [SYSCALL_FILTER_SET_REBOOT] = {
                .name = "@reboot",
                .help = "Reboot and reboot preparation/kexec",
                .value =
                "kexec_file_load\0"
                "kexec_load\0"
                "reboot\0"
        },
        [SYSCALL_FILTER_SET_RESOURCES] = {
                .name = "@resources",
                .help = "Alter resource settings",
                .value =
                "ioprio_set\0"
                "mbind\0"
                "migrate_pages\0"
                "move_pages\0"
                "nice\0"
                "sched_setaffinity\0"
                "sched_setattr\0"
                "sched_setparam\0"
                "sched_setscheduler\0"
                "set_mempolicy\0"
                "setpriority\0"
                "setrlimit\0"
        },
        [SYSCALL_FILTER_SET_SETUID] = {
                .name = "@setuid",
                .help = "Operations for changing user/group credentials",
                .value =
                "setgid\0"
                "setgid32\0"
                "setgroups\0"
                "setgroups32\0"
                "setregid\0"
                "setregid32\0"
                "setresgid\0"
                "setresgid32\0"
                "setresuid\0"
                "setresuid32\0"
                "setreuid\0"
                "setreuid32\0"
                "setuid\0"
                "setuid32\0"
        },
        [SYSCALL_FILTER_SET_SIGNAL] = {
                .name = "@signal",
                .help = "Process signal handling",
                .value =
                "rt_sigaction\0"
                "rt_sigpending\0"
                "rt_sigprocmask\0"
                "rt_sigsuspend\0"
                "rt_sigtimedwait\0"
                "rt_sigtimedwait_time64\0"
                "sigaction\0"
                "sigaltstack\0"
                "signal\0"
                "signalfd\0"
                "signalfd4\0"
                "sigpending\0"
                "sigprocmask\0"
                "sigsuspend\0"
        },
        [SYSCALL_FILTER_SET_SWAP] = {
                .name = "@swap",
                .help = "Enable/disable swap devices",
                .value =
                "swapoff\0"
                "swapon\0"
        },
        [SYSCALL_FILTER_SET_SYNC] = {
                .name = "@sync",
                .help = "Synchronize files and memory to storage",
                .value =
                "fdatasync\0"
                "fsync\0"
                "msync\0"
                "sync\0"
                "sync_file_range\0"
                "sync_file_range2\0"
                "syncfs\0"
        },
        [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
                .name = "@system-service",
                .help = "General system service operations",
                .value =
                "@aio\0"
                "@basic-io\0"
                "@chown\0"
                "@default\0"
                "@file-system\0"
                "@io-event\0"
                "@ipc\0"
                "@keyring\0"
                "@memlock\0"
                "@network-io\0"
                "@process\0"
                "@resources\0"
                "@setuid\0"
                "@signal\0"
                "@sync\0"
                "@timer\0"
                "capget\0"
                "capset\0"
                "copy_file_range\0"
                "fadvise64\0"
                "fadvise64_64\0"
                "flock\0"
                "get_mempolicy\0"
                "getcpu\0"
                "getpriority\0"
                "ioctl\0"
                "ioprio_get\0"
                "kcmp\0"
                "madvise\0"
                "mremap\0"
                "name_to_handle_at\0"
                "oldolduname\0"
                "olduname\0"
                "personality\0"
                "readahead\0"
                "readdir\0"
                "remap_file_pages\0"
                "sched_get_priority_max\0"
                "sched_get_priority_min\0"
                "sched_getattr\0"
                "sched_getparam\0"
                "sched_getscheduler\0"
                "sched_rr_get_interval\0"
                "sched_rr_get_interval_time64\0"
                "sched_yield\0"
                "sendfile\0"
                "sendfile64\0"
                "setfsgid\0"
                "setfsgid32\0"
                "setfsuid\0"
                "setfsuid32\0"
                "setpgid\0"
                "setsid\0"
                "splice\0"
                "sysinfo\0"
                "tee\0"
                "umask\0"
                "uname\0"
                "userfaultfd\0"
                "vmsplice\0"
        },
        [SYSCALL_FILTER_SET_TIMER] = {
                .name = "@timer",
                .help = "Schedule operations by time",
                .value =
                "alarm\0"
                "getitimer\0"
                "setitimer\0"
                "timer_create\0"
                "timer_delete\0"
                "timer_getoverrun\0"
                "timer_gettime\0"
                "timer_gettime64\0"
                "timer_settime\0"
                "timer_settime64\0"
                "timerfd_create\0"
                "timerfd_gettime\0"
                "timerfd_gettime64\0"
                "timerfd_settime\0"
                "timerfd_settime64\0"
                "times\0"
        },
        [SYSCALL_FILTER_SET_KNOWN] = {
                .name = "@known",
                .help = "All known syscalls declared in the kernel",
                .value =
                "@obsolete\0"
                /* The remaining members come from the included header (presumably a generated list of
                 * NUL-terminated syscall names; its contents are not visible here). */
#include "syscall-list.h"
        },
};
956
957 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
958 if (isempty(name) || name[0] != '@')
959 return NULL;
960
961 for (unsigned i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
962 if (streq(syscall_filter_sets[i].name, name))
963 return syscall_filter_sets + i;
964
965 return NULL;
966 }
967
/* Forward declaration: seccomp_add_syscall_filter_item() below calls add_syscall_filter_set() for
 * '@'-prefixed names, and add_syscall_filter_set() in turn calls seccomp_add_syscall_filter_item() for
 * each member of the set, so one of the two has to be declared ahead of its definition. */
static int add_syscall_filter_set(
                scmp_filter_ctx seccomp,
                const SyscallFilterSet *set,
                uint32_t action,
                char **exclude,
                bool log_missing,
                char ***added);
975
976 int seccomp_add_syscall_filter_item(
977 scmp_filter_ctx *seccomp,
978 const char *name,
979 uint32_t action,
980 char **exclude,
981 bool log_missing,
982 char ***added) {
983
984 assert(seccomp);
985 assert(name);
986
987 if (strv_contains(exclude, name))
988 return 0;
989
990 /* Any syscalls that are handled are added to the *added strv. The pointer
991 * must be either NULL or point to a valid pre-initialized possibly-empty strv. */
992
993 if (name[0] == '@') {
994 const SyscallFilterSet *other;
995
996 other = syscall_filter_set_find(name);
997 if (!other)
998 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
999 "Filter set %s is not known!",
1000 name);
1001
1002 return add_syscall_filter_set(seccomp, other, action, exclude, log_missing, added);
1003
1004 } else {
1005 int id, r;
1006
1007 id = seccomp_syscall_resolve_name(name);
1008 if (id == __NR_SCMP_ERROR) {
1009 if (log_missing)
1010 log_debug("System call %s is not known, ignoring.", name);
1011 return 0;
1012 }
1013
1014 r = seccomp_rule_add_exact(seccomp, action, id, 0);
1015 if (r < 0) {
1016 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
1017 bool ignore = r == -EDOM;
1018
1019 if (!ignore || log_missing)
1020 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1021 name, id, ignore ? ", ignoring" : "");
1022 if (!ignore)
1023 return r;
1024 }
1025
1026 if (added) {
1027 r = strv_extend(added, name);
1028 if (r < 0)
1029 return r;
1030 }
1031
1032 return 0;
1033 }
1034 }
1035
1036 static int add_syscall_filter_set(
1037 scmp_filter_ctx seccomp,
1038 const SyscallFilterSet *set,
1039 uint32_t action,
1040 char **exclude,
1041 bool log_missing,
1042 char ***added) {
1043
1044 const char *sys;
1045 int r;
1046
1047 /* Any syscalls that are handled are added to the *added strv. It needs to be initialized. */
1048
1049 assert(seccomp);
1050 assert(set);
1051
1052 NULSTR_FOREACH(sys, set->value) {
1053 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing, added);
1054 if (r < 0)
1055 return r;
1056 }
1057
1058 return 0;
1059 }
1060
1061 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
1062 uint32_t arch;
1063 int r;
1064
1065 assert(set);
1066
1067 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
1068 * each local arch. */
1069
1070 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1071 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1072
1073 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1074
1075 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1076 if (r < 0)
1077 return r;
1078
1079 r = add_syscall_filter_set(seccomp, set, action, NULL, log_missing, NULL);
1080 if (r < 0)
1081 return log_debug_errno(r, "Failed to add filter set: %m");
1082
1083 r = seccomp_load(seccomp);
1084 if (ERRNO_IS_SECCOMP_FATAL(r))
1085 return r;
1086 if (r < 0)
1087 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1088 }
1089
1090 return 0;
1091 }
1092
1093 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* filter, uint32_t action, bool log_missing) {
1094 uint32_t arch;
1095 int r;
1096
1097 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Hashmap* of syscalls, instead
1098 * of a SyscallFilterSet* table. */
1099
1100 if (hashmap_isempty(filter) && default_action == SCMP_ACT_ALLOW)
1101 return 0;
1102
1103 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1104 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1105 void *syscall_id, *val;
1106
1107 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1108
1109 r = seccomp_init_for_arch(&seccomp, arch, default_action);
1110 if (r < 0)
1111 return r;
1112
1113 HASHMAP_FOREACH_KEY(val, syscall_id, filter) {
1114 uint32_t a = action;
1115 int id = PTR_TO_INT(syscall_id) - 1;
1116 int error = PTR_TO_INT(val);
1117
1118 if (error == SECCOMP_ERROR_NUMBER_KILL)
1119 a = scmp_act_kill_process();
1120 #ifdef SCMP_ACT_LOG
1121 else if (action == SCMP_ACT_LOG)
1122 a = SCMP_ACT_LOG;
1123 #endif
1124 else if (error >= 0)
1125 a = SCMP_ACT_ERRNO(error);
1126
1127 r = seccomp_rule_add_exact(seccomp, a, id, 0);
1128 if (r < 0) {
1129 /* If the system call is not known on this architecture, then that's
1130 * fine, let's ignore it */
1131 _cleanup_free_ char *n = NULL;
1132 bool ignore;
1133
1134 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1135 ignore = r == -EDOM;
1136 if (!ignore || log_missing)
1137 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1138 strna(n), id, ignore ? ", ignoring" : "");
1139 if (!ignore)
1140 return r;
1141 }
1142 }
1143
1144 r = seccomp_load(seccomp);
1145 if (ERRNO_IS_SECCOMP_FATAL(r))
1146 return r;
1147 if (r < 0)
1148 log_debug_errno(r, "Failed to install system call filter for architecture %s, skipping: %m",
1149 seccomp_arch_to_string(arch));
1150 }
1151
1152 return 0;
1153 }
1154
1155 int seccomp_parse_syscall_filter(
1156 const char *name,
1157 int errno_num,
1158 Hashmap *filter,
1159 SeccompParseFlags flags,
1160 const char *unit,
1161 const char *filename,
1162 unsigned line) {
1163
1164 int r;
1165
1166 assert(name);
1167 assert(filter);
1168
1169 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) && errno_num >= 0)
1170 return -EINVAL;
1171
1172 if (name[0] == '@') {
1173 const SyscallFilterSet *set;
1174 const char *i;
1175
1176 set = syscall_filter_set_find(name);
1177 if (!set) {
1178 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1179 return -EINVAL;
1180
1181 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1182 "Unknown system call group, ignoring: %s", name);
1183 return 0;
1184 }
1185
1186 NULSTR_FOREACH(i, set->value) {
1187 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1188 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1189 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1190 * about them. */
1191 r = seccomp_parse_syscall_filter(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1192 if (r < 0)
1193 return r;
1194 }
1195 } else {
1196 int id;
1197
1198 id = seccomp_syscall_resolve_name(name);
1199 if (id == __NR_SCMP_ERROR) {
1200 if (!FLAGS_SET(flags, SECCOMP_PARSE_PERMISSIVE))
1201 return -EINVAL;
1202
1203 log_syntax(unit, FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1204 "Failed to parse system call, ignoring: %s", name);
1205 return 0;
1206 }
1207
1208 /* If we previously wanted to forbid a syscall and now we want to allow it, then remove
1209 * it from the list. The entries in allow-list with non-negative error value will be
1210 * handled with SCMP_ACT_ERRNO() instead of the default action. */
1211 if (!FLAGS_SET(flags, SECCOMP_PARSE_INVERT) == FLAGS_SET(flags, SECCOMP_PARSE_ALLOW_LIST) ||
1212 (FLAGS_SET(flags, SECCOMP_PARSE_INVERT | SECCOMP_PARSE_ALLOW_LIST) && errno_num >= 0)) {
1213 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1214 if (r < 0)
1215 switch (r) {
1216 case -ENOMEM:
1217 return FLAGS_SET(flags, SECCOMP_PARSE_LOG) ? log_oom() : -ENOMEM;
1218 case -EEXIST:
1219 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1220 break;
1221 default:
1222 return r;
1223 }
1224 } else
1225 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1226 }
1227
1228 return 0;
1229 }
1230
1231 int seccomp_restrict_namespaces(unsigned long retain) {
1232 uint32_t arch;
1233 int r;
1234
1235 if (DEBUG_LOGGING) {
1236 _cleanup_free_ char *s = NULL;
1237
1238 (void) namespace_flags_to_string(retain, &s);
1239 log_debug("Restricting namespace to: %s.", strna(s));
1240 }
1241
1242 /* NOOP? */
1243 if (FLAGS_SET(retain, NAMESPACE_FLAGS_ALL))
1244 return 0;
1245
1246 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1247 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1248
1249 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1250
1251 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1252 if (r < 0)
1253 return r;
1254
1255 /* We cannot filter on individual flags to clone3(), and we need to disable the
1256 * syscall altogether. ENOSYS is used instead of EPERM, so that glibc and other
1257 * users shall fall back to clone(), as if on an older kernel.
1258 *
1259 * C.f. https://github.com/flatpak/flatpak/commit/a10f52a7565c549612c92b8e736a6698a53db330,
1260 * https://github.com/moby/moby/issues/42680. */
1261
1262 r = seccomp_rule_add_exact(
1263 seccomp,
1264 SCMP_ACT_ERRNO(ENOSYS),
1265 SCMP_SYS(clone3),
1266 0);
1267 if (r < 0)
1268 log_debug_errno(r, "Failed to add clone3() rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
1269
1270 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1271 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1272 * altogether. */
1273 r = seccomp_rule_add_exact(
1274 seccomp,
1275 SCMP_ACT_ERRNO(EPERM),
1276 SCMP_SYS(setns),
1277 0);
1278 else
1279 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1280 * special invocation with a zero flags argument, right here. */
1281 r = seccomp_rule_add_exact(
1282 seccomp,
1283 SCMP_ACT_ERRNO(EPERM),
1284 SCMP_SYS(setns),
1285 1,
1286 SCMP_A1(SCMP_CMP_EQ, 0));
1287 if (r < 0) {
1288 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1289 continue;
1290 }
1291
1292 for (unsigned i = 0; namespace_flag_map[i].name; i++) {
1293 unsigned long f;
1294
1295 f = namespace_flag_map[i].flag;
1296 if (FLAGS_SET(retain, f)) {
1297 log_debug("Permitting %s.", namespace_flag_map[i].name);
1298 continue;
1299 }
1300
1301 log_debug("Blocking %s.", namespace_flag_map[i].name);
1302
1303 r = seccomp_rule_add_exact(
1304 seccomp,
1305 SCMP_ACT_ERRNO(EPERM),
1306 SCMP_SYS(unshare),
1307 1,
1308 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1309 if (r < 0) {
1310 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1311 break;
1312 }
1313
1314 /* On s390/s390x the first two parameters to clone are switched */
1315 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1316 r = seccomp_rule_add_exact(
1317 seccomp,
1318 SCMP_ACT_ERRNO(EPERM),
1319 SCMP_SYS(clone),
1320 1,
1321 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1322 else
1323 r = seccomp_rule_add_exact(
1324 seccomp,
1325 SCMP_ACT_ERRNO(EPERM),
1326 SCMP_SYS(clone),
1327 1,
1328 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1329 if (r < 0) {
1330 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1331 break;
1332 }
1333
1334 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1335 r = seccomp_rule_add_exact(
1336 seccomp,
1337 SCMP_ACT_ERRNO(EPERM),
1338 SCMP_SYS(setns),
1339 1,
1340 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1341 if (r < 0) {
1342 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1343 break;
1344 }
1345 }
1346 }
1347 if (r < 0)
1348 continue;
1349
1350 r = seccomp_load(seccomp);
1351 if (ERRNO_IS_SECCOMP_FATAL(r))
1352 return r;
1353 if (r < 0)
1354 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1355 }
1356
1357 return 0;
1358 }
1359
1360 int seccomp_protect_sysctl(void) {
1361 uint32_t arch;
1362 int r;
1363
1364 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1365 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1366
1367 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1368
1369 if (IN_SET(arch,
1370 SCMP_ARCH_AARCH64,
1371 #ifdef SCMP_ARCH_RISCV64
1372 SCMP_ARCH_RISCV64,
1373 #endif
1374 SCMP_ARCH_X32
1375 ))
1376 /* No _sysctl syscall */
1377 continue;
1378
1379 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1380 if (r < 0)
1381 return r;
1382
1383 r = seccomp_rule_add_exact(
1384 seccomp,
1385 SCMP_ACT_ERRNO(EPERM),
1386 SCMP_SYS(_sysctl),
1387 0);
1388 if (r < 0) {
1389 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1390 continue;
1391 }
1392
1393 r = seccomp_load(seccomp);
1394 if (ERRNO_IS_SECCOMP_FATAL(r))
1395 return r;
1396 if (r < 0)
1397 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 }
1399
1400 return 0;
1401 }
1402
1403 int seccomp_protect_syslog(void) {
1404 uint32_t arch;
1405 int r;
1406
1407 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1408 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1409
1410 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1411 if (r < 0)
1412 return r;
1413
1414 r = seccomp_rule_add_exact(
1415 seccomp,
1416 SCMP_ACT_ERRNO(EPERM),
1417 SCMP_SYS(syslog),
1418 0);
1419
1420 if (r < 0) {
1421 log_debug_errno(r, "Failed to add syslog() rule for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1422 continue;
1423 }
1424
1425 r = seccomp_load(seccomp);
1426 if (ERRNO_IS_SECCOMP_FATAL(r))
1427 return r;
1428 if (r < 0)
1429 log_debug_errno(r, "Failed to install syslog protection rules for architecture %s, skipping %m", seccomp_arch_to_string(arch));
1430 }
1431
1432 return 0;
1433 }
1434
1435 int seccomp_restrict_address_families(Set *address_families, bool allow_list) {
1436 uint32_t arch;
1437 int r;
1438
1439 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1440 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1441 bool supported;
1442
1443 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1444
1445 switch (arch) {
1446
1447 case SCMP_ARCH_X86_64:
1448 case SCMP_ARCH_X32:
1449 case SCMP_ARCH_ARM:
1450 case SCMP_ARCH_AARCH64:
1451 case SCMP_ARCH_MIPSEL64N32:
1452 case SCMP_ARCH_MIPS64N32:
1453 case SCMP_ARCH_MIPSEL64:
1454 case SCMP_ARCH_MIPS64:
1455 #ifdef SCMP_ARCH_RISCV64
1456 case SCMP_ARCH_RISCV64:
1457 #endif
1458 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1459 supported = true;
1460 break;
1461
1462 case SCMP_ARCH_S390:
1463 case SCMP_ARCH_S390X:
1464 case SCMP_ARCH_X86:
1465 case SCMP_ARCH_MIPSEL:
1466 case SCMP_ARCH_MIPS:
1467 #ifdef SCMP_ARCH_PARISC
1468 case SCMP_ARCH_PARISC:
1469 #endif
1470 #ifdef SCMP_ARCH_PARISC64
1471 case SCMP_ARCH_PARISC64:
1472 #endif
1473 case SCMP_ARCH_PPC:
1474 case SCMP_ARCH_PPC64:
1475 case SCMP_ARCH_PPC64LE:
1476 default:
1477 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1478 * don't know */
1479 supported = false;
1480 break;
1481 }
1482
1483 if (!supported)
1484 continue;
1485
1486 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1487 if (r < 0)
1488 return r;
1489
1490 if (allow_list) {
1491 int first = 0, last = 0;
1492 void *afp;
1493
1494 /* If this is an allow list, we first block the address families that are out of
1495 * range and then everything that is not in the set. First, we find the lowest and
1496 * highest address family in the set. */
1497
1498 SET_FOREACH(afp, address_families) {
1499 int af = PTR_TO_INT(afp);
1500
1501 if (af <= 0 || af >= af_max())
1502 continue;
1503
1504 if (first == 0 || af < first)
1505 first = af;
1506
1507 if (last == 0 || af > last)
1508 last = af;
1509 }
1510
1511 assert((first == 0) == (last == 0));
1512
1513 if (first == 0) {
1514
1515 /* No entries in the valid range, block everything */
1516 r = seccomp_rule_add_exact(
1517 seccomp,
1518 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1519 SCMP_SYS(socket),
1520 0);
1521 if (r < 0) {
1522 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1523 continue;
1524 }
1525
1526 } else {
1527
1528 /* Block everything below the first entry */
1529 r = seccomp_rule_add_exact(
1530 seccomp,
1531 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1532 SCMP_SYS(socket),
1533 1,
1534 SCMP_A0(SCMP_CMP_LT, first));
1535 if (r < 0) {
1536 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1537 continue;
1538 }
1539
1540 /* Block everything above the last entry */
1541 r = seccomp_rule_add_exact(
1542 seccomp,
1543 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1544 SCMP_SYS(socket),
1545 1,
1546 SCMP_A0(SCMP_CMP_GT, last));
1547 if (r < 0) {
1548 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1549 continue;
1550 }
1551
1552 /* Block everything between the first and last entry */
1553 for (int af = 1; af < af_max(); af++) {
1554
1555 if (set_contains(address_families, INT_TO_PTR(af)))
1556 continue;
1557
1558 r = seccomp_rule_add_exact(
1559 seccomp,
1560 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1561 SCMP_SYS(socket),
1562 1,
1563 SCMP_A0(SCMP_CMP_EQ, af));
1564 if (r < 0)
1565 break;
1566 }
1567 if (r < 0) {
1568 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1569 continue;
1570 }
1571 }
1572
1573 } else {
1574 void *af;
1575
1576 /* If this is a deny list, then generate one rule for each address family that are
1577 * then combined in OR checks. */
1578
1579 SET_FOREACH(af, address_families) {
1580 r = seccomp_rule_add_exact(
1581 seccomp,
1582 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1583 SCMP_SYS(socket),
1584 1,
1585 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1586 if (r < 0)
1587 break;
1588 }
1589 if (r < 0) {
1590 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1591 continue;
1592 }
1593 }
1594
1595 r = seccomp_load(seccomp);
1596 if (ERRNO_IS_SECCOMP_FATAL(r))
1597 return r;
1598 if (r < 0)
1599 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1600 }
1601
1602 return 0;
1603 }
1604
1605 int seccomp_restrict_realtime_full(int error_code) {
1606 static const int permitted_policies[] = {
1607 SCHED_OTHER,
1608 SCHED_BATCH,
1609 SCHED_IDLE,
1610 };
1611
1612 int r, max_policy = 0;
1613 uint32_t arch;
1614 unsigned i;
1615
1616 assert(error_code > 0);
1617
1618 /* Determine the highest policy constant we want to allow */
1619 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1620 if (permitted_policies[i] > max_policy)
1621 max_policy = permitted_policies[i];
1622
1623 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1624 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1625 int p;
1626
1627 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1628
1629 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1630 if (r < 0)
1631 return r;
1632
1633 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1634 * allow list. */
1635 for (p = 0; p < max_policy; p++) {
1636 bool good = false;
1637
1638 /* Check if this is in the allow list. */
1639 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1640 if (permitted_policies[i] == p) {
1641 good = true;
1642 break;
1643 }
1644
1645 if (good)
1646 continue;
1647
1648 /* Deny this policy */
1649 r = seccomp_rule_add_exact(
1650 seccomp,
1651 SCMP_ACT_ERRNO(error_code),
1652 SCMP_SYS(sched_setscheduler),
1653 1,
1654 SCMP_A1(SCMP_CMP_EQ, p));
1655 if (r < 0) {
1656 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1657 continue;
1658 }
1659 }
1660
1661 /* Deny-list all other policies, i.e. the ones with higher values. Note that all comparisons
1662 * are unsigned here, hence no need no check for < 0 values. */
1663 r = seccomp_rule_add_exact(
1664 seccomp,
1665 SCMP_ACT_ERRNO(error_code),
1666 SCMP_SYS(sched_setscheduler),
1667 1,
1668 SCMP_A1(SCMP_CMP_GT, max_policy));
1669 if (r < 0) {
1670 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1671 continue;
1672 }
1673
1674 r = seccomp_load(seccomp);
1675 if (ERRNO_IS_SECCOMP_FATAL(r))
1676 return r;
1677 if (r < 0)
1678 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1679 }
1680
1681 return 0;
1682 }
1683
1684 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1685 uint32_t arch,
1686 int nr,
1687 unsigned arg_cnt,
1688 const struct scmp_arg_cmp arg) {
1689 int r;
1690
1691 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1692 if (r < 0) {
1693 _cleanup_free_ char *n = NULL;
1694
1695 n = seccomp_syscall_resolve_num_arch(arch, nr);
1696 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1697 strna(n),
1698 seccomp_arch_to_string(arch));
1699 }
1700
1701 return r;
1702 }
1703
/* Compile-time sanity check: when building for one of the architectures listed below, the shmget(),
 * shmat() and shmdt() system call numbers must resolve to positive values. The build fails otherwise. */
#if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__) || (defined(__riscv) && __riscv_xlen == 64)
assert_cc(SCMP_SYS(shmget) > 0);
assert_cc(SCMP_SYS(shmat) > 0);
assert_cc(SCMP_SYS(shmdt) > 0);
#endif
1710
1711 int seccomp_memory_deny_write_execute(void) {
1712 uint32_t arch;
1713 unsigned loaded = 0;
1714
1715 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1716 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1717 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0, r;
1718
1719 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1720
1721 switch (arch) {
1722
1723 /* Note that on some architectures shmat() isn't available, and the call is multiplexed through ipc().
1724 * We ignore that here, which means there's still a way to get writable/executable
1725 * memory, if an IPC key is mapped like this. That's a pity, but no total loss.
1726 *
1727 * Also, PARISC isn't here right now because it still needs executable memory, but work is in progress
1728 * on that front (kernel work done in 5.18).
1729 */
1730
1731 case SCMP_ARCH_X86:
1732 case SCMP_ARCH_S390:
1733 filter_syscall = SCMP_SYS(mmap2);
1734 block_syscall = SCMP_SYS(mmap);
1735 /* shmat multiplexed, see above */
1736 break;
1737
1738 case SCMP_ARCH_PPC:
1739 case SCMP_ARCH_PPC64:
1740 case SCMP_ARCH_PPC64LE:
1741 case SCMP_ARCH_S390X:
1742 filter_syscall = SCMP_SYS(mmap);
1743 /* shmat multiplexed, see above */
1744 break;
1745
1746 case SCMP_ARCH_ARM:
1747 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1748 shmat_syscall = SCMP_SYS(shmat);
1749 break;
1750
1751 case SCMP_ARCH_X86_64:
1752 case SCMP_ARCH_X32:
1753 case SCMP_ARCH_AARCH64:
1754 #ifdef SCMP_ARCH_RISCV64
1755 case SCMP_ARCH_RISCV64:
1756 #endif
1757 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, arm64 and riscv64 have only mmap */
1758 shmat_syscall = SCMP_SYS(shmat);
1759 break;
1760
1761 /* Please add more definitions here, if you port systemd to other architectures! */
1762
1763 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__hppa__) && !defined(__hppa64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__) && !defined(__s390__) && !defined(__s390x__) && !(defined(__riscv) && __riscv_xlen == 64)
1764 #warning "Consider adding the right mmap() syscall definitions here!"
1765 #endif
1766 }
1767
1768 /* Can't filter mmap() on this arch, then skip it */
1769 if (filter_syscall == 0)
1770 continue;
1771
1772 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1773 if (r < 0)
1774 return r;
1775
1776 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1777 1,
1778 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1779 if (r < 0)
1780 continue;
1781
1782 if (block_syscall != 0) {
1783 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1784 if (r < 0)
1785 continue;
1786 }
1787
1788 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1789 1,
1790 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1791 if (r < 0)
1792 continue;
1793
1794 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1795 1,
1796 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1797 if (r < 0)
1798 continue;
1799
1800 if (shmat_syscall > 0) {
1801 r = add_seccomp_syscall_filter(seccomp, arch, shmat_syscall,
1802 1,
1803 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1804 if (r < 0)
1805 continue;
1806 }
1807
1808 r = seccomp_load(seccomp);
1809 if (ERRNO_IS_SECCOMP_FATAL(r))
1810 return r;
1811 if (r < 0)
1812 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m",
1813 seccomp_arch_to_string(arch));
1814 loaded++;
1815 }
1816
1817 if (loaded == 0)
1818 log_debug("Failed to install any seccomp rules for MemoryDenyWriteExecute=.");
1819
1820 return loaded;
1821 }
1822
1823 int seccomp_restrict_archs(Set *archs) {
1824 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1825 int r;
1826 bool blocked_new = false;
1827
1828 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1829 * list.
1830 *
1831 * There are some qualifications. However the most important use is to stop processes from bypassing
1832 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1833 * in a non-native architecture. There are no holes in this use case, at least so far. */
1834
1835 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1836 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1837 * to run a program with the restrictions applied. */
1838 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1839 if (!seccomp)
1840 return -ENOMEM;
1841
1842 for (unsigned i = 0; seccomp_local_archs[i] != SECCOMP_LOCAL_ARCH_END; ++i) {
1843 uint32_t arch = seccomp_local_archs[i];
1844
1845 /* See above comment, our "native" architecture is never blocked. */
1846 if (arch == seccomp_arch_native())
1847 continue;
1848
1849 /* That architecture might have already been blocked by a previous call to seccomp_restrict_archs. */
1850 if (arch == SECCOMP_LOCAL_ARCH_BLOCKED)
1851 continue;
1852
1853 bool block = !set_contains(archs, UINT32_TO_PTR(arch + 1));
1854
1855 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1856 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1857 * The important thing is that you can block the old 32-bit x86 syscalls.
1858 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1859 if (block && arch == SCMP_ARCH_X86_64 && seccomp_arch_native() == SCMP_ARCH_X32)
1860 block = !set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1));
1861
1862 if (block) {
1863 seccomp_local_archs[i] = SECCOMP_LOCAL_ARCH_BLOCKED;
1864 blocked_new = true;
1865 } else {
1866 r = seccomp_arch_add(seccomp, arch);
1867 if (r < 0 && r != -EEXIST)
1868 return r;
1869 }
1870 }
1871
1872 /* All architectures that will be blocked by the seccomp program were
1873 * already blocked. */
1874 if (!blocked_new)
1875 return 0;
1876
1877 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1878 if (r < 0)
1879 return r;
1880
1881 r = seccomp_load(seccomp);
1882 if (ERRNO_IS_SECCOMP_FATAL(r))
1883 return r;
1884 if (r < 0)
1885 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1886
1887 return 0;
1888 }
1889
1890 int parse_syscall_archs(char **l, Set **ret_archs) {
1891 _cleanup_set_free_ Set *archs = NULL;
1892 int r;
1893
1894 assert(l);
1895 assert(ret_archs);
1896
1897 STRV_FOREACH(s, l) {
1898 uint32_t a;
1899
1900 r = seccomp_arch_from_string(*s, &a);
1901 if (r < 0)
1902 return -EINVAL;
1903
1904 r = set_ensure_put(&archs, NULL, UINT32_TO_PTR(a + 1));
1905 if (r < 0)
1906 return -ENOMEM;
1907 }
1908
1909 *ret_archs = TAKE_PTR(archs);
1910 return 0;
1911 }
1912
1913 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1914 const char *i;
1915 int r;
1916
1917 assert(set);
1918
1919 NULSTR_FOREACH(i, set->value) {
1920
1921 if (i[0] == '@') {
1922 const SyscallFilterSet *more;
1923
1924 more = syscall_filter_set_find(i);
1925 if (!more)
1926 return -ENXIO;
1927
1928 r = seccomp_filter_set_add(filter, add, more);
1929 if (r < 0)
1930 return r;
1931 } else {
1932 int id;
1933
1934 id = seccomp_syscall_resolve_name(i);
1935 if (id == __NR_SCMP_ERROR) {
1936 log_debug("Couldn't resolve system call, ignoring: %s", i);
1937 continue;
1938 }
1939
1940 if (add) {
1941 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1942 if (r < 0)
1943 return r;
1944 } else
1945 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1946 }
1947 }
1948
1949 return 0;
1950 }
1951
1952 int seccomp_lock_personality(unsigned long personality) {
1953 uint32_t arch;
1954 int r;
1955
1956 if (personality >= PERSONALITY_INVALID)
1957 return -EINVAL;
1958
1959 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1960 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1961
1962 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1963 if (r < 0)
1964 return r;
1965
1966 r = seccomp_rule_add_exact(
1967 seccomp,
1968 SCMP_ACT_ERRNO(EPERM),
1969 SCMP_SYS(personality),
1970 1,
1971 SCMP_A0(SCMP_CMP_NE, personality));
1972 if (r < 0) {
1973 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1974 continue;
1975 }
1976
1977 r = seccomp_load(seccomp);
1978 if (ERRNO_IS_SECCOMP_FATAL(r))
1979 return r;
1980 if (r < 0)
1981 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1982 }
1983
1984 return 0;
1985 }
1986
1987 int seccomp_protect_hostname(void) {
1988 uint32_t arch;
1989 int r;
1990
1991 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1992 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1993
1994 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1995 if (r < 0)
1996 return r;
1997
1998 r = seccomp_rule_add_exact(
1999 seccomp,
2000 SCMP_ACT_ERRNO(EPERM),
2001 SCMP_SYS(sethostname),
2002 0);
2003 if (r < 0) {
2004 log_debug_errno(r, "Failed to add sethostname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2005 continue;
2006 }
2007
2008 r = seccomp_rule_add_exact(
2009 seccomp,
2010 SCMP_ACT_ERRNO(EPERM),
2011 SCMP_SYS(setdomainname),
2012 0);
2013 if (r < 0) {
2014 log_debug_errno(r, "Failed to add setdomainname() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2015 continue;
2016 }
2017
2018 r = seccomp_load(seccomp);
2019 if (ERRNO_IS_SECCOMP_FATAL(r))
2020 return r;
2021 if (r < 0)
2022 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2023 }
2024
2025 return 0;
2026 }
2027
2028 static int seccomp_restrict_sxid(scmp_filter_ctx seccomp, mode_t m) {
2029 /* Checks the mode_t parameter of the following system calls:
2030 *
2031 * → chmod() + fchmod() + fchmodat()
2032 * → open() + creat() + openat()
2033 * → mkdir() + mkdirat()
2034 * → mknod() + mknodat()
2035 *
2036 * Returns error if *everything* failed, and 0 otherwise.
2037 */
2038 int r;
2039 bool any = false;
2040
2041 r = seccomp_rule_add_exact(
2042 seccomp,
2043 SCMP_ACT_ERRNO(EPERM),
2044 SCMP_SYS(chmod),
2045 1,
2046 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2047 if (r < 0)
2048 log_debug_errno(r, "Failed to add filter for chmod: %m");
2049 else
2050 any = true;
2051
2052 r = seccomp_rule_add_exact(
2053 seccomp,
2054 SCMP_ACT_ERRNO(EPERM),
2055 SCMP_SYS(fchmod),
2056 1,
2057 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2058 if (r < 0)
2059 log_debug_errno(r, "Failed to add filter for fchmod: %m");
2060 else
2061 any = true;
2062
2063 r = seccomp_rule_add_exact(
2064 seccomp,
2065 SCMP_ACT_ERRNO(EPERM),
2066 SCMP_SYS(fchmodat),
2067 1,
2068 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2069 if (r < 0)
2070 log_debug_errno(r, "Failed to add filter for fchmodat: %m");
2071 else
2072 any = true;
2073
2074 r = seccomp_rule_add_exact(
2075 seccomp,
2076 SCMP_ACT_ERRNO(EPERM),
2077 SCMP_SYS(mkdir),
2078 1,
2079 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2080 if (r < 0)
2081 log_debug_errno(r, "Failed to add filter for mkdir: %m");
2082 else
2083 any = true;
2084
2085 r = seccomp_rule_add_exact(
2086 seccomp,
2087 SCMP_ACT_ERRNO(EPERM),
2088 SCMP_SYS(mkdirat),
2089 1,
2090 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2091 if (r < 0)
2092 log_debug_errno(r, "Failed to add filter for mkdirat: %m");
2093 else
2094 any = true;
2095
2096 r = seccomp_rule_add_exact(
2097 seccomp,
2098 SCMP_ACT_ERRNO(EPERM),
2099 SCMP_SYS(mknod),
2100 1,
2101 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2102 if (r < 0)
2103 log_debug_errno(r, "Failed to add filter for mknod: %m");
2104 else
2105 any = true;
2106
2107 r = seccomp_rule_add_exact(
2108 seccomp,
2109 SCMP_ACT_ERRNO(EPERM),
2110 SCMP_SYS(mknodat),
2111 1,
2112 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2113 if (r < 0)
2114 log_debug_errno(r, "Failed to add filter for mknodat: %m");
2115 else
2116 any = true;
2117
2118 r = seccomp_rule_add_exact(
2119 seccomp,
2120 SCMP_ACT_ERRNO(EPERM),
2121 SCMP_SYS(open),
2122 2,
2123 SCMP_A1(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2124 SCMP_A2(SCMP_CMP_MASKED_EQ, m, m));
2125 if (r < 0)
2126 log_debug_errno(r, "Failed to add filter for open: %m");
2127 else
2128 any = true;
2129
2130 r = seccomp_rule_add_exact(
2131 seccomp,
2132 SCMP_ACT_ERRNO(EPERM),
2133 SCMP_SYS(openat),
2134 2,
2135 SCMP_A2(SCMP_CMP_MASKED_EQ, O_CREAT, O_CREAT),
2136 SCMP_A3(SCMP_CMP_MASKED_EQ, m, m));
2137 if (r < 0)
2138 log_debug_errno(r, "Failed to add filter for openat: %m");
2139 else
2140 any = true;
2141
2142 #if defined(__SNR_openat2)
2143 /* The new openat2() system call can't be filtered sensibly, since it moves the flags parameter into
2144 * an indirect structure. Let's block it entirely for now. That should be a reasonably OK thing to do
2145 * for now, since openat2() is very new and code generally needs fallback logic anyway to be
2146 * compatible with kernels that are not absolutely recent. We would normally return EPERM for a
2147 * policy check, but this isn't strictly a policy check. Instead, we return ENOSYS to force programs
2148 * to call open() or openat() instead. We can properly enforce policy for those functions. */
2149 r = seccomp_rule_add_exact(
2150 seccomp,
2151 SCMP_ACT_ERRNO(ENOSYS),
2152 SCMP_SYS(openat2),
2153 0);
2154 if (r < 0)
2155 log_debug_errno(r, "Failed to add filter for openat2: %m");
2156 else
2157 any = true;
2158 #endif
2159
2160 r = seccomp_rule_add_exact(
2161 seccomp,
2162 SCMP_ACT_ERRNO(EPERM),
2163 SCMP_SYS(creat),
2164 1,
2165 SCMP_A1(SCMP_CMP_MASKED_EQ, m, m));
2166 if (r < 0)
2167 log_debug_errno(r, "Failed to add filter for creat: %m");
2168 else
2169 any = true;
2170
2171 return any ? 0 : r;
2172 }
2173
2174 int seccomp_restrict_suid_sgid(void) {
2175 uint32_t arch;
2176 int r, k;
2177
2178 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2179 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2180
2181 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2182 if (r < 0)
2183 return r;
2184
2185 r = seccomp_restrict_sxid(seccomp, S_ISUID);
2186 if (r < 0)
2187 log_debug_errno(r, "Failed to add suid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2188
2189 k = seccomp_restrict_sxid(seccomp, S_ISGID);
2190 if (k < 0)
2191 log_debug_errno(r, "Failed to add sgid rule for architecture %s, ignoring: %m", seccomp_arch_to_string(arch));
2192
2193 if (r < 0 && k < 0)
2194 continue;
2195
2196 r = seccomp_load(seccomp);
2197 if (ERRNO_IS_SECCOMP_FATAL(r))
2198 return r;
2199 if (r < 0)
2200 log_debug_errno(r, "Failed to apply suid/sgid restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2201 }
2202
2203 return 0;
2204 }
2205
2206 uint32_t scmp_act_kill_process(void) {
2207
2208 /* Returns SCMP_ACT_KILL_PROCESS if it's supported, and SCMP_ACT_KILL_THREAD otherwise. We never
2209 * actually want to use SCMP_ACT_KILL_THREAD as its semantics are nuts (killing arbitrary threads of
2210 * a program is just a bad idea), but on old kernels/old libseccomp it is all we have, and at least
2211 * for single-threaded apps does the right thing. */
2212
2213 #ifdef SCMP_ACT_KILL_PROCESS
2214 if (seccomp_api_get() >= 3)
2215 return SCMP_ACT_KILL_PROCESS;
2216 #endif
2217
2218 return SCMP_ACT_KILL; /* same as SCMP_ACT_KILL_THREAD */
2219 }
2220
2221 int parse_syscall_and_errno(const char *in, char **name, int *error) {
2222 _cleanup_free_ char *n = NULL;
2223 char *p;
2224 int e = -1;
2225
2226 assert(in);
2227 assert(name);
2228 assert(error);
2229
2230 /*
2231 * This parse "syscall:errno" like "uname:EILSEQ", "@sync:255".
2232 * If errno is omitted, then error is set to -1.
2233 * Empty syscall name is not allowed.
2234 * Here, we do not check that the syscall name is valid or not.
2235 */
2236
2237 p = strchr(in, ':');
2238 if (p) {
2239 e = seccomp_parse_errno_or_action(p + 1);
2240 if (e < 0)
2241 return e;
2242
2243 n = strndup(in, p - in);
2244 } else
2245 n = strdup(in);
2246
2247 if (!n)
2248 return -ENOMEM;
2249
2250 if (isempty(n))
2251 return -EINVAL;
2252
2253 *error = e;
2254 *name = TAKE_PTR(n);
2255
2256 return 0;
2257 }
2258
2259 static int block_open_flag(scmp_filter_ctx seccomp, int flag) {
2260 bool any = false;
2261 int r;
2262
2263 /* Blocks open() with the specified flag, where flag is O_SYNC or so. This makes these calls return
2264 * EINVAL, in the hope the client code will retry without O_SYNC then. */
2265
2266 r = seccomp_rule_add_exact(
2267 seccomp,
2268 SCMP_ACT_ERRNO(EINVAL),
2269 SCMP_SYS(open),
2270 1,
2271 SCMP_A1(SCMP_CMP_MASKED_EQ, flag, flag));
2272 if (r < 0)
2273 log_debug_errno(r, "Failed to add filter for open: %m");
2274 else
2275 any = true;
2276
2277 r = seccomp_rule_add_exact(
2278 seccomp,
2279 SCMP_ACT_ERRNO(EINVAL),
2280 SCMP_SYS(openat),
2281 1,
2282 SCMP_A2(SCMP_CMP_MASKED_EQ, flag, flag));
2283 if (r < 0)
2284 log_debug_errno(r, "Failed to add filter for openat: %m");
2285 else
2286 any = true;
2287
2288 #if defined(__SNR_openat2)
2289 /* The new openat2() system call can't be filtered sensibly, see above. */
2290 r = seccomp_rule_add_exact(
2291 seccomp,
2292 SCMP_ACT_ERRNO(ENOSYS),
2293 SCMP_SYS(openat2),
2294 0);
2295 if (r < 0)
2296 log_debug_errno(r, "Failed to add filter for openat2: %m");
2297 else
2298 any = true;
2299 #endif
2300
2301 return any ? 0 : r;
2302 }
2303
2304 int seccomp_suppress_sync(void) {
2305 uint32_t arch;
2306 int r;
2307
2308 /* This is mostly identical to SystemCallFilter=~@sync:0, but simpler to use, and separately
2309 * manageable, and also masks O_SYNC/O_DSYNC */
2310
2311 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
2312 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
2313 const char *c;
2314
2315 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
2316 if (r < 0)
2317 return r;
2318
2319 NULSTR_FOREACH(c, syscall_filter_sets[SYSCALL_FILTER_SET_SYNC].value) {
2320 int id;
2321
2322 id = seccomp_syscall_resolve_name(c);
2323 if (id == __NR_SCMP_ERROR) {
2324 log_debug("System call %s is not known, ignoring.", c);
2325 continue;
2326 }
2327
2328 r = seccomp_rule_add_exact(
2329 seccomp,
2330 SCMP_ACT_ERRNO(0), /* success → we want this to be a NOP after all */
2331 id,
2332 0);
2333 if (r < 0)
2334 log_debug_errno(r, "Failed to add filter for system call %s, ignoring: %m", c);
2335 }
2336
2337 (void) block_open_flag(seccomp, O_SYNC);
2338 #if O_DSYNC != O_SYNC
2339 (void) block_open_flag(seccomp, O_DSYNC);
2340 #endif
2341
2342 r = seccomp_load(seccomp);
2343 if (ERRNO_IS_SECCOMP_FATAL(r))
2344 return r;
2345 if (r < 0)
2346 log_debug_errno(r, "Failed to apply sync() suppression for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
2347 }
2348
2349 return 0;
2350 }