]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
Merge pull request #11827 from keszybz/pkgconfig-variables
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <linux/seccomp.h>
5 #include <seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10
11 #include "af-list.h"
12 #include "alloc-util.h"
13 #include "macro.h"
14 #include "nsflags.h"
15 #include "process-util.h"
16 #include "seccomp-util.h"
17 #include "set.h"
18 #include "string-util.h"
19 #include "strv.h"
20 #include "util.h"
21 #include "errno-list.h"
22
23 const uint32_t seccomp_local_archs[] = {
24
25 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
26
27 #if defined(__x86_64__) && defined(__ILP32__)
28 SCMP_ARCH_X86,
29 SCMP_ARCH_X86_64,
30 SCMP_ARCH_X32, /* native */
31 #elif defined(__x86_64__) && !defined(__ILP32__)
32 SCMP_ARCH_X86,
33 SCMP_ARCH_X32,
34 SCMP_ARCH_X86_64, /* native */
35 #elif defined(__i386__)
36 SCMP_ARCH_X86,
37 #elif defined(__aarch64__)
38 SCMP_ARCH_ARM,
39 SCMP_ARCH_AARCH64, /* native */
40 #elif defined(__arm__)
41 SCMP_ARCH_ARM,
42 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
43 SCMP_ARCH_MIPSEL,
44 SCMP_ARCH_MIPS, /* native */
45 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPS,
47 SCMP_ARCH_MIPSEL, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
49 SCMP_ARCH_MIPSEL,
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL64N32,
52 SCMP_ARCH_MIPS64N32,
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPS64, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPS,
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS64N32,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64,
61 SCMP_ARCH_MIPSEL64, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL64,
66 SCMP_ARCH_MIPS64,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64N32, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32, /* native */
76 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
77 SCMP_ARCH_PPC,
78 SCMP_ARCH_PPC64LE,
79 SCMP_ARCH_PPC64, /* native */
80 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64,
83 SCMP_ARCH_PPC64LE, /* native */
84 #elif defined(__powerpc__)
85 SCMP_ARCH_PPC,
86 #elif defined(__s390x__)
87 SCMP_ARCH_S390,
88 SCMP_ARCH_S390X, /* native */
89 #elif defined(__s390__)
90 SCMP_ARCH_S390,
91 #endif
92 (uint32_t) -1
93 };
94
95 const char* seccomp_arch_to_string(uint32_t c) {
96 /* Maintain order used in <seccomp.h>.
97 *
98 * Names used here should be the same as those used for ConditionArchitecture=,
99 * except for "subarchitectures" like x32. */
100
101 switch(c) {
102 case SCMP_ARCH_NATIVE:
103 return "native";
104 case SCMP_ARCH_X86:
105 return "x86";
106 case SCMP_ARCH_X86_64:
107 return "x86-64";
108 case SCMP_ARCH_X32:
109 return "x32";
110 case SCMP_ARCH_ARM:
111 return "arm";
112 case SCMP_ARCH_AARCH64:
113 return "arm64";
114 case SCMP_ARCH_MIPS:
115 return "mips";
116 case SCMP_ARCH_MIPS64:
117 return "mips64";
118 case SCMP_ARCH_MIPS64N32:
119 return "mips64-n32";
120 case SCMP_ARCH_MIPSEL:
121 return "mips-le";
122 case SCMP_ARCH_MIPSEL64:
123 return "mips64-le";
124 case SCMP_ARCH_MIPSEL64N32:
125 return "mips64-le-n32";
126 case SCMP_ARCH_PPC:
127 return "ppc";
128 case SCMP_ARCH_PPC64:
129 return "ppc64";
130 case SCMP_ARCH_PPC64LE:
131 return "ppc64-le";
132 case SCMP_ARCH_S390:
133 return "s390";
134 case SCMP_ARCH_S390X:
135 return "s390x";
136 default:
137 return NULL;
138 }
139 }
140
141 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
142 if (!n)
143 return -EINVAL;
144
145 assert(ret);
146
147 if (streq(n, "native"))
148 *ret = SCMP_ARCH_NATIVE;
149 else if (streq(n, "x86"))
150 *ret = SCMP_ARCH_X86;
151 else if (streq(n, "x86-64"))
152 *ret = SCMP_ARCH_X86_64;
153 else if (streq(n, "x32"))
154 *ret = SCMP_ARCH_X32;
155 else if (streq(n, "arm"))
156 *ret = SCMP_ARCH_ARM;
157 else if (streq(n, "arm64"))
158 *ret = SCMP_ARCH_AARCH64;
159 else if (streq(n, "mips"))
160 *ret = SCMP_ARCH_MIPS;
161 else if (streq(n, "mips64"))
162 *ret = SCMP_ARCH_MIPS64;
163 else if (streq(n, "mips64-n32"))
164 *ret = SCMP_ARCH_MIPS64N32;
165 else if (streq(n, "mips-le"))
166 *ret = SCMP_ARCH_MIPSEL;
167 else if (streq(n, "mips64-le"))
168 *ret = SCMP_ARCH_MIPSEL64;
169 else if (streq(n, "mips64-le-n32"))
170 *ret = SCMP_ARCH_MIPSEL64N32;
171 else if (streq(n, "ppc"))
172 *ret = SCMP_ARCH_PPC;
173 else if (streq(n, "ppc64"))
174 *ret = SCMP_ARCH_PPC64;
175 else if (streq(n, "ppc64-le"))
176 *ret = SCMP_ARCH_PPC64LE;
177 else if (streq(n, "s390"))
178 *ret = SCMP_ARCH_S390;
179 else if (streq(n, "s390x"))
180 *ret = SCMP_ARCH_S390X;
181 else
182 return -EINVAL;
183
184 return 0;
185 }
186
187 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
188 scmp_filter_ctx seccomp;
189 int r;
190
191 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
192 * any others. Also, turns off the NNP fiddling. */
193
194 seccomp = seccomp_init(default_action);
195 if (!seccomp)
196 return -ENOMEM;
197
198 if (arch != SCMP_ARCH_NATIVE &&
199 arch != seccomp_arch_native()) {
200
201 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
202 if (r < 0)
203 goto finish;
204
205 r = seccomp_arch_add(seccomp, arch);
206 if (r < 0)
207 goto finish;
208
209 assert(seccomp_arch_exist(seccomp, arch) >= 0);
210 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
211 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
212 } else {
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
215 }
216
217 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
222 if (r < 0)
223 goto finish;
224
225 *ret = seccomp;
226 return 0;
227
228 finish:
229 seccomp_release(seccomp);
230 return r;
231 }
232
/* Reports whether the PR_GET_SECCOMP prctl() succeeds, i.e. returns a non-negative value. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
236
/* Probes for seccomp filter mode by asking prctl() to install a NULL filter program. The call is expected
 * to fail; a failure with EFAULT specifically is taken as the sign that filter mode is available. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
241
/* Returns true if both the basic seccomp probe and the seccomp filter probe succeed. The result is
 * computed on the first call and cached in a function-local static for subsequent calls. */
bool is_seccomp_available(void) {
        static int available = -1; /* -1: not probed yet, otherwise 0 or 1 */

        if (available >= 0)
                return available;

        available = is_basic_seccomp_available() && is_seccomp_filter_available();
        return available;
}
252
253 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
254 [SYSCALL_FILTER_SET_DEFAULT] = {
255 .name = "@default",
256 .help = "System calls that are always permitted",
257 .value =
258 "clock_getres\0"
259 "clock_gettime\0"
260 "clock_nanosleep\0"
261 "execve\0"
262 "exit\0"
263 "exit_group\0"
264 "futex\0"
265 "get_robust_list\0"
266 "get_thread_area\0"
267 "getegid\0"
268 "getegid32\0"
269 "geteuid\0"
270 "geteuid32\0"
271 "getgid\0"
272 "getgid32\0"
273 "getgroups\0"
274 "getgroups32\0"
275 "getpgid\0"
276 "getpgrp\0"
277 "getpid\0"
278 "getppid\0"
279 "getresgid\0"
280 "getresgid32\0"
281 "getresuid\0"
282 "getresuid32\0"
283 "getrlimit\0" /* make sure processes can query stack size and such */
284 "getsid\0"
285 "gettid\0"
286 "gettimeofday\0"
287 "getuid\0"
288 "getuid32\0"
289 "membarrier\0"
290 "nanosleep\0"
291 "pause\0"
292 "prlimit64\0"
293 "restart_syscall\0"
294 "rt_sigreturn\0"
295 "sched_yield\0"
296 "set_robust_list\0"
297 "set_thread_area\0"
298 "set_tid_address\0"
299 "set_tls\0"
300 "sigreturn\0"
301 "time\0"
302 "ugetrlimit\0"
303 },
304 [SYSCALL_FILTER_SET_AIO] = {
305 .name = "@aio",
306 .help = "Asynchronous IO",
307 .value =
308 "io_cancel\0"
309 "io_destroy\0"
310 "io_getevents\0"
311 "io_pgetevents\0"
312 "io_setup\0"
313 "io_submit\0"
314 },
315 [SYSCALL_FILTER_SET_BASIC_IO] = {
316 .name = "@basic-io",
317 .help = "Basic IO",
318 .value =
319 "_llseek\0"
320 "close\0"
321 "dup\0"
322 "dup2\0"
323 "dup3\0"
324 "lseek\0"
325 "pread64\0"
326 "preadv\0"
327 "preadv2\0"
328 "pwrite64\0"
329 "pwritev\0"
330 "pwritev2\0"
331 "read\0"
332 "readv\0"
333 "write\0"
334 "writev\0"
335 },
336 [SYSCALL_FILTER_SET_CHOWN] = {
337 .name = "@chown",
338 .help = "Change ownership of files and directories",
339 .value =
340 "chown\0"
341 "chown32\0"
342 "fchown\0"
343 "fchown32\0"
344 "fchownat\0"
345 "lchown\0"
346 "lchown32\0"
347 },
348 [SYSCALL_FILTER_SET_CLOCK] = {
349 .name = "@clock",
350 .help = "Change the system time",
351 .value =
352 "adjtimex\0"
353 "clock_adjtime\0"
354 "clock_settime\0"
355 "settimeofday\0"
356 "stime\0"
357 },
358 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
359 .name = "@cpu-emulation",
360 .help = "System calls for CPU emulation functionality",
361 .value =
362 "modify_ldt\0"
363 "subpage_prot\0"
364 "switch_endian\0"
365 "vm86\0"
366 "vm86old\0"
367 },
368 [SYSCALL_FILTER_SET_DEBUG] = {
369 .name = "@debug",
370 .help = "Debugging, performance monitoring and tracing functionality",
371 .value =
372 "lookup_dcookie\0"
373 "perf_event_open\0"
374 "ptrace\0"
375 "rtas\0"
376 #ifdef __NR_s390_runtime_instr
377 "s390_runtime_instr\0"
378 #endif
379 "sys_debug_setcontext\0"
380 },
381 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
382 .name = "@file-system",
383 .help = "File system operations",
384 .value =
385 "access\0"
386 "chdir\0"
387 "chmod\0"
388 "close\0"
389 "creat\0"
390 "faccessat\0"
391 "fallocate\0"
392 "fchdir\0"
393 "fchmod\0"
394 "fchmodat\0"
395 "fcntl\0"
396 "fcntl64\0"
397 "fgetxattr\0"
398 "flistxattr\0"
399 "fremovexattr\0"
400 "fsetxattr\0"
401 "fstat\0"
402 "fstat64\0"
403 "fstatat64\0"
404 "fstatfs\0"
405 "fstatfs64\0"
406 "ftruncate\0"
407 "ftruncate64\0"
408 "futimesat\0"
409 "getcwd\0"
410 "getdents\0"
411 "getdents64\0"
412 "getxattr\0"
413 "inotify_add_watch\0"
414 "inotify_init\0"
415 "inotify_init1\0"
416 "inotify_rm_watch\0"
417 "lgetxattr\0"
418 "link\0"
419 "linkat\0"
420 "listxattr\0"
421 "llistxattr\0"
422 "lremovexattr\0"
423 "lsetxattr\0"
424 "lstat\0"
425 "lstat64\0"
426 "mkdir\0"
427 "mkdirat\0"
428 "mknod\0"
429 "mknodat\0"
430 "mmap\0"
431 "mmap2\0"
432 "munmap\0"
433 "newfstatat\0"
434 "oldfstat\0"
435 "oldlstat\0"
436 "oldstat\0"
437 "open\0"
438 "openat\0"
439 "readlink\0"
440 "readlinkat\0"
441 "removexattr\0"
442 "rename\0"
443 "renameat\0"
444 "renameat2\0"
445 "rmdir\0"
446 "setxattr\0"
447 "stat\0"
448 "stat64\0"
449 "statfs\0"
450 "statfs64\0"
451 #ifdef __NR_statx
452 "statx\0"
453 #endif
454 "symlink\0"
455 "symlinkat\0"
456 "truncate\0"
457 "truncate64\0"
458 "unlink\0"
459 "unlinkat\0"
460 "utime\0"
461 "utimensat\0"
462 "utimes\0"
463 },
464 [SYSCALL_FILTER_SET_IO_EVENT] = {
465 .name = "@io-event",
466 .help = "Event loop system calls",
467 .value =
468 "_newselect\0"
469 "epoll_create\0"
470 "epoll_create1\0"
471 "epoll_ctl\0"
472 "epoll_ctl_old\0"
473 "epoll_pwait\0"
474 "epoll_wait\0"
475 "epoll_wait_old\0"
476 "eventfd\0"
477 "eventfd2\0"
478 "poll\0"
479 "ppoll\0"
480 "pselect6\0"
481 "select\0"
482 },
483 [SYSCALL_FILTER_SET_IPC] = {
484 .name = "@ipc",
485 .help = "SysV IPC, POSIX Message Queues or other IPC",
486 .value =
487 "ipc\0"
488 "memfd_create\0"
489 "mq_getsetattr\0"
490 "mq_notify\0"
491 "mq_open\0"
492 "mq_timedreceive\0"
493 "mq_timedsend\0"
494 "mq_unlink\0"
495 "msgctl\0"
496 "msgget\0"
497 "msgrcv\0"
498 "msgsnd\0"
499 "pipe\0"
500 "pipe2\0"
501 "process_vm_readv\0"
502 "process_vm_writev\0"
503 "semctl\0"
504 "semget\0"
505 "semop\0"
506 "semtimedop\0"
507 "shmat\0"
508 "shmctl\0"
509 "shmdt\0"
510 "shmget\0"
511 },
512 [SYSCALL_FILTER_SET_KEYRING] = {
513 .name = "@keyring",
514 .help = "Kernel keyring access",
515 .value =
516 "add_key\0"
517 "keyctl\0"
518 "request_key\0"
519 },
520 [SYSCALL_FILTER_SET_MEMLOCK] = {
521 .name = "@memlock",
522 .help = "Memory locking control",
523 .value =
524 "mlock\0"
525 "mlock2\0"
526 "mlockall\0"
527 "munlock\0"
528 "munlockall\0"
529 },
530 [SYSCALL_FILTER_SET_MODULE] = {
531 .name = "@module",
532 .help = "Loading and unloading of kernel modules",
533 .value =
534 "delete_module\0"
535 "finit_module\0"
536 "init_module\0"
537 },
538 [SYSCALL_FILTER_SET_MOUNT] = {
539 .name = "@mount",
540 .help = "Mounting and unmounting of file systems",
541 .value =
542 "chroot\0"
543 "mount\0"
544 "pivot_root\0"
545 "umount\0"
546 "umount2\0"
547 },
548 [SYSCALL_FILTER_SET_NETWORK_IO] = {
549 .name = "@network-io",
550 .help = "Network or Unix socket IO, should not be needed if not network facing",
551 .value =
552 "accept\0"
553 "accept4\0"
554 "bind\0"
555 "connect\0"
556 "getpeername\0"
557 "getsockname\0"
558 "getsockopt\0"
559 "listen\0"
560 "recv\0"
561 "recvfrom\0"
562 "recvmmsg\0"
563 "recvmsg\0"
564 "send\0"
565 "sendmmsg\0"
566 "sendmsg\0"
567 "sendto\0"
568 "setsockopt\0"
569 "shutdown\0"
570 "socket\0"
571 "socketcall\0"
572 "socketpair\0"
573 },
574 [SYSCALL_FILTER_SET_OBSOLETE] = {
575 /* some unknown even to libseccomp */
576 .name = "@obsolete",
577 .help = "Unusual, obsolete or unimplemented system calls",
578 .value =
579 "_sysctl\0"
580 "afs_syscall\0"
581 "bdflush\0"
582 "break\0"
583 "create_module\0"
584 "ftime\0"
585 "get_kernel_syms\0"
586 "getpmsg\0"
587 "gtty\0"
588 "idle\0"
589 "lock\0"
590 "mpx\0"
591 "prof\0"
592 "profil\0"
593 "putpmsg\0"
594 "query_module\0"
595 "security\0"
596 "sgetmask\0"
597 "ssetmask\0"
598 "stty\0"
599 "sysfs\0"
600 "tuxcall\0"
601 "ulimit\0"
602 "uselib\0"
603 "ustat\0"
604 "vserver\0"
605 },
606 [SYSCALL_FILTER_SET_PRIVILEGED] = {
607 .name = "@privileged",
608 .help = "All system calls which need super-user capabilities",
609 .value =
610 "@chown\0"
611 "@clock\0"
612 "@module\0"
613 "@raw-io\0"
614 "@reboot\0"
615 "@swap\0"
616 "_sysctl\0"
617 "acct\0"
618 "bpf\0"
619 "capset\0"
620 "chroot\0"
621 "fanotify_init\0"
622 "nfsservctl\0"
623 "open_by_handle_at\0"
624 "pivot_root\0"
625 "quotactl\0"
626 "setdomainname\0"
627 "setfsuid\0"
628 "setfsuid32\0"
629 "setgroups\0"
630 "setgroups32\0"
631 "sethostname\0"
632 "setresuid\0"
633 "setresuid32\0"
634 "setreuid\0"
635 "setreuid32\0"
636 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
637 "setuid32\0"
638 "vhangup\0"
639 },
640 [SYSCALL_FILTER_SET_PROCESS] = {
641 .name = "@process",
642 .help = "Process control, execution, namespaceing operations",
643 .value =
644 "arch_prctl\0"
645 "capget\0" /* Able to query arbitrary processes */
646 "clone\0"
647 "execveat\0"
648 "fork\0"
649 "getrusage\0"
650 "kill\0"
651 "prctl\0"
652 "rt_sigqueueinfo\0"
653 "rt_tgsigqueueinfo\0"
654 "setns\0"
655 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
656 "tgkill\0"
657 "times\0"
658 "tkill\0"
659 "unshare\0"
660 "vfork\0"
661 "wait4\0"
662 "waitid\0"
663 "waitpid\0"
664 },
665 [SYSCALL_FILTER_SET_RAW_IO] = {
666 .name = "@raw-io",
667 .help = "Raw I/O port access",
668 .value =
669 "ioperm\0"
670 "iopl\0"
671 "pciconfig_iobase\0"
672 "pciconfig_read\0"
673 "pciconfig_write\0"
674 #ifdef __NR_s390_pci_mmio_read
675 "s390_pci_mmio_read\0"
676 #endif
677 #ifdef __NR_s390_pci_mmio_write
678 "s390_pci_mmio_write\0"
679 #endif
680 },
681 [SYSCALL_FILTER_SET_REBOOT] = {
682 .name = "@reboot",
683 .help = "Reboot and reboot preparation/kexec",
684 .value =
685 "kexec_file_load\0"
686 "kexec_load\0"
687 "reboot\0"
688 },
689 [SYSCALL_FILTER_SET_RESOURCES] = {
690 .name = "@resources",
691 .help = "Alter resource settings",
692 .value =
693 "ioprio_set\0"
694 "mbind\0"
695 "migrate_pages\0"
696 "move_pages\0"
697 "nice\0"
698 "sched_setaffinity\0"
699 "sched_setattr\0"
700 "sched_setparam\0"
701 "sched_setscheduler\0"
702 "set_mempolicy\0"
703 "setpriority\0"
704 "setrlimit\0"
705 },
706 [SYSCALL_FILTER_SET_SETUID] = {
707 .name = "@setuid",
708 .help = "Operations for changing user/group credentials",
709 .value =
710 "setgid\0"
711 "setgid32\0"
712 "setgroups\0"
713 "setgroups32\0"
714 "setregid\0"
715 "setregid32\0"
716 "setresgid\0"
717 "setresgid32\0"
718 "setresuid\0"
719 "setresuid32\0"
720 "setreuid\0"
721 "setreuid32\0"
722 "setuid\0"
723 "setuid32\0"
724 },
725 [SYSCALL_FILTER_SET_SIGNAL] = {
726 .name = "@signal",
727 .help = "Process signal handling",
728 .value =
729 "rt_sigaction\0"
730 "rt_sigpending\0"
731 "rt_sigprocmask\0"
732 "rt_sigsuspend\0"
733 "rt_sigtimedwait\0"
734 "sigaction\0"
735 "sigaltstack\0"
736 "signal\0"
737 "signalfd\0"
738 "signalfd4\0"
739 "sigpending\0"
740 "sigprocmask\0"
741 "sigsuspend\0"
742 },
743 [SYSCALL_FILTER_SET_SWAP] = {
744 .name = "@swap",
745 .help = "Enable/disable swap devices",
746 .value =
747 "swapoff\0"
748 "swapon\0"
749 },
750 [SYSCALL_FILTER_SET_SYNC] = {
751 .name = "@sync",
752 .help = "Synchronize files and memory to storage",
753 .value =
754 "fdatasync\0"
755 "fsync\0"
756 "msync\0"
757 "sync\0"
758 "sync_file_range\0"
759 "syncfs\0"
760 },
761 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
762 .name = "@system-service",
763 .help = "General system service operations",
764 .value =
765 "@aio\0"
766 "@basic-io\0"
767 "@chown\0"
768 "@default\0"
769 "@file-system\0"
770 "@io-event\0"
771 "@ipc\0"
772 "@keyring\0"
773 "@memlock\0"
774 "@network-io\0"
775 "@process\0"
776 "@resources\0"
777 "@setuid\0"
778 "@signal\0"
779 "@sync\0"
780 "@timer\0"
781 "brk\0"
782 "capget\0"
783 "capset\0"
784 "copy_file_range\0"
785 "fadvise64\0"
786 "fadvise64_64\0"
787 "flock\0"
788 "get_mempolicy\0"
789 "getcpu\0"
790 "getpriority\0"
791 "getrandom\0"
792 "ioctl\0"
793 "ioprio_get\0"
794 "kcmp\0"
795 "madvise\0"
796 "mprotect\0"
797 "mremap\0"
798 "name_to_handle_at\0"
799 "oldolduname\0"
800 "olduname\0"
801 "personality\0"
802 "readahead\0"
803 "readdir\0"
804 "remap_file_pages\0"
805 "sched_get_priority_max\0"
806 "sched_get_priority_min\0"
807 "sched_getaffinity\0"
808 "sched_getattr\0"
809 "sched_getparam\0"
810 "sched_getscheduler\0"
811 "sched_rr_get_interval\0"
812 "sched_yield\0"
813 "sendfile\0"
814 "sendfile64\0"
815 "setfsgid\0"
816 "setfsgid32\0"
817 "setfsuid\0"
818 "setfsuid32\0"
819 "setpgid\0"
820 "setsid\0"
821 "splice\0"
822 "sysinfo\0"
823 "tee\0"
824 "umask\0"
825 "uname\0"
826 "userfaultfd\0"
827 "vmsplice\0"
828 },
829 [SYSCALL_FILTER_SET_TIMER] = {
830 .name = "@timer",
831 .help = "Schedule operations by time",
832 .value =
833 "alarm\0"
834 "getitimer\0"
835 "setitimer\0"
836 "timer_create\0"
837 "timer_delete\0"
838 "timer_getoverrun\0"
839 "timer_gettime\0"
840 "timer_settime\0"
841 "timerfd_create\0"
842 "timerfd_gettime\0"
843 "timerfd_settime\0"
844 "times\0"
845 },
846 };
847
848 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
849 unsigned i;
850
851 if (isempty(name) || name[0] != '@')
852 return NULL;
853
854 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
855 if (streq(syscall_filter_sets[i].name, name))
856 return syscall_filter_sets + i;
857
858 return NULL;
859 }
860
861 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
862
863 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
864 assert(seccomp);
865 assert(name);
866
867 if (strv_contains(exclude, name))
868 return 0;
869
870 if (name[0] == '@') {
871 const SyscallFilterSet *other;
872
873 other = syscall_filter_set_find(name);
874 if (!other)
875 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
876 "Filter set %s is not known!",
877 name);
878
879 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
880
881 } else {
882 int id, r;
883
884 id = seccomp_syscall_resolve_name(name);
885 if (id == __NR_SCMP_ERROR) {
886 if (log_missing)
887 log_debug("System call %s is not known, ignoring.", name);
888 return 0;
889 }
890
891 r = seccomp_rule_add_exact(seccomp, action, id, 0);
892 if (r < 0) {
893 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
894 bool ignore = r == -EDOM;
895
896 if (!ignore || log_missing)
897 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
898 name, id, ignore ? ", ignoring" : "");
899 if (!ignore)
900 return r;
901 }
902
903 return 0;
904 }
905 }
906
907 static int seccomp_add_syscall_filter_set(
908 scmp_filter_ctx seccomp,
909 const SyscallFilterSet *set,
910 uint32_t action,
911 char **exclude,
912 bool log_missing) {
913
914 const char *sys;
915 int r;
916
917 assert(seccomp);
918 assert(set);
919
920 NULSTR_FOREACH(sys, set->value) {
921 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
922 if (r < 0)
923 return r;
924 }
925
926 return 0;
927 }
928
929 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
930 uint32_t arch;
931 int r;
932
933 assert(set);
934
935 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
936 * each local arch. */
937
938 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
939 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
940
941 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
942
943 r = seccomp_init_for_arch(&seccomp, arch, default_action);
944 if (r < 0)
945 return r;
946
947 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
948 if (r < 0)
949 return log_debug_errno(r, "Failed to add filter set: %m");
950
951 r = seccomp_load(seccomp);
952 if (IN_SET(r, -EPERM, -EACCES))
953 return r;
954 if (r < 0)
955 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
956 }
957
958 return 0;
959 }
960
961 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
962 uint32_t arch;
963 int r;
964
965 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
966 * SyscallFilterSet* table. */
967
968 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
969 return 0;
970
971 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
972 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
973 Iterator i;
974 void *syscall_id, *val;
975
976 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
977
978 r = seccomp_init_for_arch(&seccomp, arch, default_action);
979 if (r < 0)
980 return r;
981
982 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
983 uint32_t a = action;
984 int id = PTR_TO_INT(syscall_id) - 1;
985 int error = PTR_TO_INT(val);
986
987 if (action != SCMP_ACT_ALLOW && error >= 0)
988 a = SCMP_ACT_ERRNO(error);
989
990 r = seccomp_rule_add_exact(seccomp, a, id, 0);
991 if (r < 0) {
992 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
993 _cleanup_free_ char *n = NULL;
994 bool ignore;
995
996 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
997 ignore = r == -EDOM;
998 if (!ignore || log_missing)
999 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1000 strna(n), id, ignore ? ", ignoring" : "");
1001 if (!ignore)
1002 return r;
1003 }
1004 }
1005
1006 r = seccomp_load(seccomp);
1007 if (IN_SET(r, -EPERM, -EACCES))
1008 return r;
1009 if (r < 0)
1010 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1011 }
1012
1013 return 0;
1014 }
1015
1016 int seccomp_parse_syscall_filter_full(
1017 const char *name,
1018 int errno_num,
1019 Hashmap *filter,
1020 SeccompParseFlags flags,
1021 const char *unit,
1022 const char *filename,
1023 unsigned line) {
1024
1025 int r;
1026
1027 assert(name);
1028 assert(filter);
1029
1030 if (name[0] == '@') {
1031 const SyscallFilterSet *set;
1032 const char *i;
1033
1034 set = syscall_filter_set_find(name);
1035 if (!set) {
1036 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1037 return -EINVAL;
1038
1039 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1040 "Unknown system call group, ignoring: %s", name);
1041 return 0;
1042 }
1043
1044 NULSTR_FOREACH(i, set->value) {
1045 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1046 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1047 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1048 * about them. */
1049 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1050 if (r < 0)
1051 return r;
1052 }
1053 } else {
1054 int id;
1055
1056 id = seccomp_syscall_resolve_name(name);
1057 if (id == __NR_SCMP_ERROR) {
1058 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1059 return -EINVAL;
1060
1061 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1062 "Failed to parse system call, ignoring: %s", name);
1063 return 0;
1064 }
1065
1066 /* If we previously wanted to forbid a syscall and now
1067 * we want to allow it, then remove it from the list. */
1068 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1069 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1070 if (r < 0)
1071 switch (r) {
1072 case -ENOMEM:
1073 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1074 case -EEXIST:
1075 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1076 break;
1077 default:
1078 return r;
1079 }
1080 } else
1081 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1082 }
1083
1084 return 0;
1085 }
1086
1087 int seccomp_restrict_namespaces(unsigned long retain) {
1088 uint32_t arch;
1089 int r;
1090
1091 if (DEBUG_LOGGING) {
1092 _cleanup_free_ char *s = NULL;
1093
1094 (void) namespace_flags_to_string(retain, &s);
1095 log_debug("Restricting namespace to: %s.", strna(s));
1096 }
1097
1098 /* NOOP? */
1099 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1100 return 0;
1101
1102 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1103 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1104 unsigned i;
1105
1106 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1107
1108 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1109 if (r < 0)
1110 return r;
1111
1112 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1113 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1114 * altogether. */
1115 r = seccomp_rule_add_exact(
1116 seccomp,
1117 SCMP_ACT_ERRNO(EPERM),
1118 SCMP_SYS(setns),
1119 0);
1120 else
1121 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1122 * special invocation with a zero flags argument, right here. */
1123 r = seccomp_rule_add_exact(
1124 seccomp,
1125 SCMP_ACT_ERRNO(EPERM),
1126 SCMP_SYS(setns),
1127 1,
1128 SCMP_A1(SCMP_CMP_EQ, 0));
1129 if (r < 0) {
1130 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1131 continue;
1132 }
1133
1134 for (i = 0; namespace_flag_map[i].name; i++) {
1135 unsigned long f;
1136
1137 f = namespace_flag_map[i].flag;
1138 if ((retain & f) == f) {
1139 log_debug("Permitting %s.", namespace_flag_map[i].name);
1140 continue;
1141 }
1142
1143 log_debug("Blocking %s.", namespace_flag_map[i].name);
1144
1145 r = seccomp_rule_add_exact(
1146 seccomp,
1147 SCMP_ACT_ERRNO(EPERM),
1148 SCMP_SYS(unshare),
1149 1,
1150 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1151 if (r < 0) {
1152 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1153 break;
1154 }
1155
1156 /* On s390/s390x the first two parameters to clone are switched */
1157 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1158 r = seccomp_rule_add_exact(
1159 seccomp,
1160 SCMP_ACT_ERRNO(EPERM),
1161 SCMP_SYS(clone),
1162 1,
1163 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1164 else
1165 r = seccomp_rule_add_exact(
1166 seccomp,
1167 SCMP_ACT_ERRNO(EPERM),
1168 SCMP_SYS(clone),
1169 1,
1170 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1171 if (r < 0) {
1172 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1173 break;
1174 }
1175
1176 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1177 r = seccomp_rule_add_exact(
1178 seccomp,
1179 SCMP_ACT_ERRNO(EPERM),
1180 SCMP_SYS(setns),
1181 1,
1182 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1183 if (r < 0) {
1184 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1185 break;
1186 }
1187 }
1188 }
1189 if (r < 0)
1190 continue;
1191
1192 r = seccomp_load(seccomp);
1193 if (IN_SET(r, -EPERM, -EACCES))
1194 return r;
1195 if (r < 0)
1196 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1197 }
1198
1199 return 0;
1200 }
1201
1202 int seccomp_protect_sysctl(void) {
1203 uint32_t arch;
1204 int r;
1205
1206 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1207 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1208
1209 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1210
1211 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1212 /* No _sysctl syscall */
1213 continue;
1214
1215 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1216 if (r < 0)
1217 return r;
1218
1219 r = seccomp_rule_add_exact(
1220 seccomp,
1221 SCMP_ACT_ERRNO(EPERM),
1222 SCMP_SYS(_sysctl),
1223 0);
1224 if (r < 0) {
1225 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1226 continue;
1227 }
1228
1229 r = seccomp_load(seccomp);
1230 if (IN_SET(r, -EPERM, -EACCES))
1231 return r;
1232 if (r < 0)
1233 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1234 }
1235
1236 return 0;
1237 }
1238
1239 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1240 uint32_t arch;
1241 int r;
1242
1243 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1244 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1245 bool supported;
1246 Iterator i;
1247
1248 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1249
1250 switch (arch) {
1251
1252 case SCMP_ARCH_X86_64:
1253 case SCMP_ARCH_X32:
1254 case SCMP_ARCH_ARM:
1255 case SCMP_ARCH_AARCH64:
1256 case SCMP_ARCH_PPC:
1257 case SCMP_ARCH_PPC64:
1258 case SCMP_ARCH_PPC64LE:
1259 case SCMP_ARCH_MIPSEL64N32:
1260 case SCMP_ARCH_MIPS64N32:
1261 case SCMP_ARCH_MIPSEL64:
1262 case SCMP_ARCH_MIPS64:
1263 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1264 supported = true;
1265 break;
1266
1267 case SCMP_ARCH_S390:
1268 case SCMP_ARCH_S390X:
1269 case SCMP_ARCH_X86:
1270 case SCMP_ARCH_MIPSEL:
1271 case SCMP_ARCH_MIPS:
1272 default:
1273 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1274 * don't know */
1275 supported = false;
1276 break;
1277 }
1278
1279 if (!supported)
1280 continue;
1281
1282 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1283 if (r < 0)
1284 return r;
1285
1286 if (whitelist) {
1287 int af, first = 0, last = 0;
1288 void *afp;
1289
1290 /* If this is a whitelist, we first block the address families that are out of range and then
1291 * everything that is not in the set. First, we find the lowest and highest address family in
1292 * the set. */
1293
1294 SET_FOREACH(afp, address_families, i) {
1295 af = PTR_TO_INT(afp);
1296
1297 if (af <= 0 || af >= af_max())
1298 continue;
1299
1300 if (first == 0 || af < first)
1301 first = af;
1302
1303 if (last == 0 || af > last)
1304 last = af;
1305 }
1306
1307 assert((first == 0) == (last == 0));
1308
1309 if (first == 0) {
1310
1311 /* No entries in the valid range, block everything */
1312 r = seccomp_rule_add_exact(
1313 seccomp,
1314 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1315 SCMP_SYS(socket),
1316 0);
1317 if (r < 0) {
1318 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1319 continue;
1320 }
1321
1322 } else {
1323
1324 /* Block everything below the first entry */
1325 r = seccomp_rule_add_exact(
1326 seccomp,
1327 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1328 SCMP_SYS(socket),
1329 1,
1330 SCMP_A0(SCMP_CMP_LT, first));
1331 if (r < 0) {
1332 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1333 continue;
1334 }
1335
1336 /* Block everything above the last entry */
1337 r = seccomp_rule_add_exact(
1338 seccomp,
1339 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1340 SCMP_SYS(socket),
1341 1,
1342 SCMP_A0(SCMP_CMP_GT, last));
1343 if (r < 0) {
1344 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1345 continue;
1346 }
1347
1348 /* Block everything between the first and last entry */
1349 for (af = 1; af < af_max(); af++) {
1350
1351 if (set_contains(address_families, INT_TO_PTR(af)))
1352 continue;
1353
1354 r = seccomp_rule_add_exact(
1355 seccomp,
1356 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1357 SCMP_SYS(socket),
1358 1,
1359 SCMP_A0(SCMP_CMP_EQ, af));
1360 if (r < 0)
1361 break;
1362 }
1363 if (r < 0) {
1364 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1365 continue;
1366 }
1367 }
1368
1369 } else {
1370 void *af;
1371
1372 /* If this is a blacklist, then generate one rule for
1373 * each address family that are then combined in OR
1374 * checks. */
1375
1376 SET_FOREACH(af, address_families, i) {
1377
1378 r = seccomp_rule_add_exact(
1379 seccomp,
1380 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1381 SCMP_SYS(socket),
1382 1,
1383 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1384 if (r < 0)
1385 break;
1386 }
1387 if (r < 0) {
1388 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1389 continue;
1390 }
1391 }
1392
1393 r = seccomp_load(seccomp);
1394 if (IN_SET(r, -EPERM, -EACCES))
1395 return r;
1396 if (r < 0)
1397 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1398 }
1399
1400 return 0;
1401 }
1402
1403 int seccomp_restrict_realtime(void) {
1404 static const int permitted_policies[] = {
1405 SCHED_OTHER,
1406 SCHED_BATCH,
1407 SCHED_IDLE,
1408 };
1409
1410 int r, max_policy = 0;
1411 uint32_t arch;
1412 unsigned i;
1413
1414 /* Determine the highest policy constant we want to allow */
1415 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1416 if (permitted_policies[i] > max_policy)
1417 max_policy = permitted_policies[i];
1418
1419 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1420 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1421 int p;
1422
1423 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1424
1425 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1426 if (r < 0)
1427 return r;
1428
1429 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1430 * whitelist. */
1431 for (p = 0; p < max_policy; p++) {
1432 bool good = false;
1433
1434 /* Check if this is in the whitelist. */
1435 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1436 if (permitted_policies[i] == p) {
1437 good = true;
1438 break;
1439 }
1440
1441 if (good)
1442 continue;
1443
1444 /* Deny this policy */
1445 r = seccomp_rule_add_exact(
1446 seccomp,
1447 SCMP_ACT_ERRNO(EPERM),
1448 SCMP_SYS(sched_setscheduler),
1449 1,
1450 SCMP_A1(SCMP_CMP_EQ, p));
1451 if (r < 0) {
1452 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1453 continue;
1454 }
1455 }
1456
1457 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1458 * unsigned here, hence no need no check for < 0 values. */
1459 r = seccomp_rule_add_exact(
1460 seccomp,
1461 SCMP_ACT_ERRNO(EPERM),
1462 SCMP_SYS(sched_setscheduler),
1463 1,
1464 SCMP_A1(SCMP_CMP_GT, max_policy));
1465 if (r < 0) {
1466 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1467 continue;
1468 }
1469
1470 r = seccomp_load(seccomp);
1471 if (IN_SET(r, -EPERM, -EACCES))
1472 return r;
1473 if (r < 0)
1474 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1475 }
1476
1477 return 0;
1478 }
1479
1480 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1481 uint32_t arch,
1482 int nr,
1483 unsigned arg_cnt,
1484 const struct scmp_arg_cmp arg) {
1485 int r;
1486
1487 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1488 if (r < 0) {
1489 _cleanup_free_ char *n = NULL;
1490
1491 n = seccomp_syscall_resolve_num_arch(arch, nr);
1492 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1493 strna(n),
1494 seccomp_arch_to_string(arch));
1495 }
1496
1497 return r;
1498 }
1499
1500 /* For known architectures, check that syscalls are indeed defined or not. */
1501 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1502 assert_cc(SCMP_SYS(shmget) > 0);
1503 assert_cc(SCMP_SYS(shmat) > 0);
1504 assert_cc(SCMP_SYS(shmdt) > 0);
1505 #elif defined(__i386__) || defined(__powerpc64__)
1506 assert_cc(SCMP_SYS(shmget) < 0);
1507 assert_cc(SCMP_SYS(shmat) < 0);
1508 assert_cc(SCMP_SYS(shmdt) < 0);
1509 #endif
1510
1511 int seccomp_memory_deny_write_execute(void) {
1512
1513 uint32_t arch;
1514 int r;
1515
1516 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1517 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1518 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1519
1520 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1521
1522 switch (arch) {
1523
1524 case SCMP_ARCH_X86:
1525 filter_syscall = SCMP_SYS(mmap2);
1526 block_syscall = SCMP_SYS(mmap);
1527 break;
1528
1529 case SCMP_ARCH_PPC:
1530 case SCMP_ARCH_PPC64:
1531 case SCMP_ARCH_PPC64LE:
1532 filter_syscall = SCMP_SYS(mmap);
1533
1534 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1535 * We ignore that here, which means there's still a way to get writable/executable
1536 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1537
1538 break;
1539
1540 case SCMP_ARCH_ARM:
1541 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1542 shmat_syscall = SCMP_SYS(shmat);
1543 break;
1544
1545 case SCMP_ARCH_X86_64:
1546 case SCMP_ARCH_X32:
1547 case SCMP_ARCH_AARCH64:
1548 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1549 shmat_syscall = SCMP_SYS(shmat);
1550 break;
1551
1552 /* Please add more definitions here, if you port systemd to other architectures! */
1553
1554 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1555 #warning "Consider adding the right mmap() syscall definitions here!"
1556 #endif
1557 }
1558
1559 /* Can't filter mmap() on this arch, then skip it */
1560 if (filter_syscall == 0)
1561 continue;
1562
1563 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1564 if (r < 0)
1565 return r;
1566
1567 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1568 1,
1569 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1570 if (r < 0)
1571 continue;
1572
1573 if (block_syscall != 0) {
1574 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1575 if (r < 0)
1576 continue;
1577 }
1578
1579 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1580 1,
1581 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1582 if (r < 0)
1583 continue;
1584
1585 #ifdef __NR_pkey_mprotect
1586 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1587 1,
1588 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1589 if (r < 0)
1590 continue;
1591 #endif
1592
1593 if (shmat_syscall != 0) {
1594 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1595 1,
1596 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1597 if (r < 0)
1598 continue;
1599 }
1600
1601 r = seccomp_load(seccomp);
1602 if (IN_SET(r, -EPERM, -EACCES))
1603 return r;
1604 if (r < 0)
1605 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1606 }
1607
1608 return 0;
1609 }
1610
1611 int seccomp_restrict_archs(Set *archs) {
1612 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1613 Iterator i;
1614 void *id;
1615 int r;
1616
1617 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1618 * list.
1619 *
1620 * There are some qualifications. However the most important use is to stop processes from bypassing
1621 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1622 * in a non-native architecture. There are no holes in this use case, at least so far. */
1623
1624 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1625 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1626 * to run a program with the restrictions applied. */
1627 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1628 if (!seccomp)
1629 return -ENOMEM;
1630
1631 SET_FOREACH(id, archs, i) {
1632 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1633 if (r < 0 && r != -EEXIST)
1634 return r;
1635 }
1636
1637 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1638 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1639 * The important thing is that you can block the old 32-bit x86 syscalls.
1640 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1641
1642 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1643 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1644
1645 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1646 if (r < 0 && r != -EEXIST)
1647 return r;
1648 }
1649
1650 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1651 if (r < 0)
1652 return r;
1653
1654 r = seccomp_load(seccomp);
1655 if (IN_SET(r, -EPERM, -EACCES))
1656 return r;
1657 if (r < 0)
1658 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1659
1660 return 0;
1661 }
1662
1663 int parse_syscall_archs(char **l, Set **archs) {
1664 _cleanup_set_free_ Set *_archs;
1665 char **s;
1666 int r;
1667
1668 assert(l);
1669 assert(archs);
1670
1671 r = set_ensure_allocated(&_archs, NULL);
1672 if (r < 0)
1673 return r;
1674
1675 STRV_FOREACH(s, l) {
1676 uint32_t a;
1677
1678 r = seccomp_arch_from_string(*s, &a);
1679 if (r < 0)
1680 return -EINVAL;
1681
1682 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1683 if (r < 0)
1684 return -ENOMEM;
1685 }
1686
1687 *archs = TAKE_PTR(_archs);
1688
1689 return 0;
1690 }
1691
1692 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1693 const char *i;
1694 int r;
1695
1696 assert(set);
1697
1698 NULSTR_FOREACH(i, set->value) {
1699
1700 if (i[0] == '@') {
1701 const SyscallFilterSet *more;
1702
1703 more = syscall_filter_set_find(i);
1704 if (!more)
1705 return -ENXIO;
1706
1707 r = seccomp_filter_set_add(filter, add, more);
1708 if (r < 0)
1709 return r;
1710 } else {
1711 int id;
1712
1713 id = seccomp_syscall_resolve_name(i);
1714 if (id == __NR_SCMP_ERROR) {
1715 log_debug("Couldn't resolve system call, ignoring: %s", i);
1716 continue;
1717 }
1718
1719 if (add) {
1720 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1721 if (r < 0)
1722 return r;
1723 } else
1724 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1725 }
1726 }
1727
1728 return 0;
1729 }
1730
1731 int seccomp_lock_personality(unsigned long personality) {
1732 uint32_t arch;
1733 int r;
1734
1735 if (personality >= PERSONALITY_INVALID)
1736 return -EINVAL;
1737
1738 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1739 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1740
1741 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1742 if (r < 0)
1743 return r;
1744
1745 r = seccomp_rule_add_exact(
1746 seccomp,
1747 SCMP_ACT_ERRNO(EPERM),
1748 SCMP_SYS(personality),
1749 1,
1750 SCMP_A0(SCMP_CMP_NE, personality));
1751 if (r < 0) {
1752 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1753 continue;
1754 }
1755
1756 r = seccomp_load(seccomp);
1757 if (IN_SET(r, -EPERM, -EACCES))
1758 return r;
1759 if (r < 0)
1760 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1761 }
1762
1763 return 0;
1764 }
1765
1766 int seccomp_protect_hostname(void) {
1767 uint32_t arch;
1768 int r;
1769
1770 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1771 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1772
1773 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1774 if (r < 0)
1775 return r;
1776
1777 r = seccomp_rule_add_exact(
1778 seccomp,
1779 SCMP_ACT_ERRNO(EPERM),
1780 SCMP_SYS(sethostname),
1781 0);
1782 if (r < 0)
1783 continue;
1784
1785 r = seccomp_rule_add_exact(
1786 seccomp,
1787 SCMP_ACT_ERRNO(EPERM),
1788 SCMP_SYS(setdomainname),
1789 0);
1790 if (r < 0)
1791 continue;
1792
1793 r = seccomp_load(seccomp);
1794 if (IN_SET(r, -EPERM, -EACCES))
1795 return r;
1796 if (r < 0)
1797 log_debug_errno(r, "Failed to apply hostname restrictions for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1798 }
1799
1800 return 0;
1801 }