]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
42d6dd2a94342ba4d7f7cb17e568cea2bda39f85
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <linux/seccomp.h>
5 #include <seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10
11 #include "af-list.h"
12 #include "alloc-util.h"
13 #include "macro.h"
14 #include "nsflags.h"
15 #include "process-util.h"
16 #include "seccomp-util.h"
17 #include "set.h"
18 #include "string-util.h"
19 #include "strv.h"
20 #include "util.h"
21 #include "errno-list.h"
22
23 const uint32_t seccomp_local_archs[] = {
24
25 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
26
27 #if defined(__x86_64__) && defined(__ILP32__)
28 SCMP_ARCH_X86,
29 SCMP_ARCH_X86_64,
30 SCMP_ARCH_X32, /* native */
31 #elif defined(__x86_64__) && !defined(__ILP32__)
32 SCMP_ARCH_X86,
33 SCMP_ARCH_X32,
34 SCMP_ARCH_X86_64, /* native */
35 #elif defined(__i386__)
36 SCMP_ARCH_X86,
37 #elif defined(__aarch64__)
38 SCMP_ARCH_ARM,
39 SCMP_ARCH_AARCH64, /* native */
40 #elif defined(__arm__)
41 SCMP_ARCH_ARM,
42 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
43 SCMP_ARCH_MIPSEL,
44 SCMP_ARCH_MIPS, /* native */
45 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPS,
47 SCMP_ARCH_MIPSEL, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
49 SCMP_ARCH_MIPSEL,
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL64N32,
52 SCMP_ARCH_MIPS64N32,
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPS64, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPS,
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS64N32,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64,
61 SCMP_ARCH_MIPSEL64, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL64,
66 SCMP_ARCH_MIPS64,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64N32, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32, /* native */
76 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
77 SCMP_ARCH_PPC,
78 SCMP_ARCH_PPC64LE,
79 SCMP_ARCH_PPC64, /* native */
80 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64,
83 SCMP_ARCH_PPC64LE, /* native */
84 #elif defined(__powerpc__)
85 SCMP_ARCH_PPC,
86 #elif defined(__s390x__)
87 SCMP_ARCH_S390,
88 SCMP_ARCH_S390X, /* native */
89 #elif defined(__s390__)
90 SCMP_ARCH_S390,
91 #endif
92 (uint32_t) -1
93 };
94
95 const char* seccomp_arch_to_string(uint32_t c) {
96 /* Maintain order used in <seccomp.h>.
97 *
98 * Names used here should be the same as those used for ConditionArchitecture=,
99 * except for "subarchitectures" like x32. */
100
101 switch(c) {
102 case SCMP_ARCH_NATIVE:
103 return "native";
104 case SCMP_ARCH_X86:
105 return "x86";
106 case SCMP_ARCH_X86_64:
107 return "x86-64";
108 case SCMP_ARCH_X32:
109 return "x32";
110 case SCMP_ARCH_ARM:
111 return "arm";
112 case SCMP_ARCH_AARCH64:
113 return "arm64";
114 case SCMP_ARCH_MIPS:
115 return "mips";
116 case SCMP_ARCH_MIPS64:
117 return "mips64";
118 case SCMP_ARCH_MIPS64N32:
119 return "mips64-n32";
120 case SCMP_ARCH_MIPSEL:
121 return "mips-le";
122 case SCMP_ARCH_MIPSEL64:
123 return "mips64-le";
124 case SCMP_ARCH_MIPSEL64N32:
125 return "mips64-le-n32";
126 case SCMP_ARCH_PPC:
127 return "ppc";
128 case SCMP_ARCH_PPC64:
129 return "ppc64";
130 case SCMP_ARCH_PPC64LE:
131 return "ppc64-le";
132 case SCMP_ARCH_S390:
133 return "s390";
134 case SCMP_ARCH_S390X:
135 return "s390x";
136 default:
137 return NULL;
138 }
139 }
140
141 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
142 if (!n)
143 return -EINVAL;
144
145 assert(ret);
146
147 if (streq(n, "native"))
148 *ret = SCMP_ARCH_NATIVE;
149 else if (streq(n, "x86"))
150 *ret = SCMP_ARCH_X86;
151 else if (streq(n, "x86-64"))
152 *ret = SCMP_ARCH_X86_64;
153 else if (streq(n, "x32"))
154 *ret = SCMP_ARCH_X32;
155 else if (streq(n, "arm"))
156 *ret = SCMP_ARCH_ARM;
157 else if (streq(n, "arm64"))
158 *ret = SCMP_ARCH_AARCH64;
159 else if (streq(n, "mips"))
160 *ret = SCMP_ARCH_MIPS;
161 else if (streq(n, "mips64"))
162 *ret = SCMP_ARCH_MIPS64;
163 else if (streq(n, "mips64-n32"))
164 *ret = SCMP_ARCH_MIPS64N32;
165 else if (streq(n, "mips-le"))
166 *ret = SCMP_ARCH_MIPSEL;
167 else if (streq(n, "mips64-le"))
168 *ret = SCMP_ARCH_MIPSEL64;
169 else if (streq(n, "mips64-le-n32"))
170 *ret = SCMP_ARCH_MIPSEL64N32;
171 else if (streq(n, "ppc"))
172 *ret = SCMP_ARCH_PPC;
173 else if (streq(n, "ppc64"))
174 *ret = SCMP_ARCH_PPC64;
175 else if (streq(n, "ppc64-le"))
176 *ret = SCMP_ARCH_PPC64LE;
177 else if (streq(n, "s390"))
178 *ret = SCMP_ARCH_S390;
179 else if (streq(n, "s390x"))
180 *ret = SCMP_ARCH_S390X;
181 else
182 return -EINVAL;
183
184 return 0;
185 }
186
187 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
188 scmp_filter_ctx seccomp;
189 int r;
190
191 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
192 * any others. Also, turns off the NNP fiddling. */
193
194 seccomp = seccomp_init(default_action);
195 if (!seccomp)
196 return -ENOMEM;
197
198 if (arch != SCMP_ARCH_NATIVE &&
199 arch != seccomp_arch_native()) {
200
201 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
202 if (r < 0)
203 goto finish;
204
205 r = seccomp_arch_add(seccomp, arch);
206 if (r < 0)
207 goto finish;
208
209 assert(seccomp_arch_exist(seccomp, arch) >= 0);
210 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
211 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
212 } else {
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
215 }
216
217 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
222 if (r < 0)
223 goto finish;
224
225 *ret = seccomp;
226 return 0;
227
228 finish:
229 seccomp_release(seccomp);
230 return r;
231 }
232
/* Returns true if the PR_GET_SECCOMP prctl() query succeeds, false if it fails. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
236
/* Probes for seccomp filter mode by trying to install a NULL filter program. The probe counts as
 * positive only if the call fails and the failure is specifically EFAULT. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
241
/* Returns whether both the basic seccomp probe and the seccomp filter probe succeed. The probes run on
 * the first call only; the outcome is kept in a function-local static and reused afterwards. */
bool is_seccomp_available(void) {
        static int cache = -1; /* negative: not probed yet */

        if (cache >= 0)
                return cache;

        cache = is_basic_seccomp_available() &&
                is_seccomp_filter_available();

        return cache;
}
252
253 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
254 [SYSCALL_FILTER_SET_DEFAULT] = {
255 .name = "@default",
256 .help = "System calls that are always permitted",
257 .value =
258 "clock_getres\0"
259 "clock_gettime\0"
260 "clock_nanosleep\0"
261 "execve\0"
262 "exit\0"
263 "exit_group\0"
264 "futex\0"
265 "get_robust_list\0"
266 "get_thread_area\0"
267 "getegid\0"
268 "getegid32\0"
269 "geteuid\0"
270 "geteuid32\0"
271 "getgid\0"
272 "getgid32\0"
273 "getgroups\0"
274 "getgroups32\0"
275 "getpgid\0"
276 "getpgrp\0"
277 "getpid\0"
278 "getppid\0"
279 "getresgid\0"
280 "getresgid32\0"
281 "getresuid\0"
282 "getresuid32\0"
283 "getrlimit\0" /* make sure processes can query stack size and such */
284 "getsid\0"
285 "gettid\0"
286 "gettimeofday\0"
287 "getuid\0"
288 "getuid32\0"
289 "membarrier\0"
290 "nanosleep\0"
291 "pause\0"
292 "prlimit64\0"
293 "restart_syscall\0"
294 "rt_sigreturn\0"
295 "sched_yield\0"
296 "set_robust_list\0"
297 "set_thread_area\0"
298 "set_tid_address\0"
299 "set_tls\0"
300 "sigreturn\0"
301 "time\0"
302 "ugetrlimit\0"
303 },
304 [SYSCALL_FILTER_SET_AIO] = {
305 .name = "@aio",
306 .help = "Asynchronous IO",
307 .value =
308 "io_cancel\0"
309 "io_destroy\0"
310 "io_getevents\0"
311 "io_pgetevents\0"
312 "io_setup\0"
313 "io_submit\0"
314 },
315 [SYSCALL_FILTER_SET_BASIC_IO] = {
316 .name = "@basic-io",
317 .help = "Basic IO",
318 .value =
319 "_llseek\0"
320 "close\0"
321 "dup\0"
322 "dup2\0"
323 "dup3\0"
324 "lseek\0"
325 "pread64\0"
326 "preadv\0"
327 "preadv2\0"
328 "pwrite64\0"
329 "pwritev\0"
330 "pwritev2\0"
331 "read\0"
332 "readv\0"
333 "write\0"
334 "writev\0"
335 },
336 [SYSCALL_FILTER_SET_CHOWN] = {
337 .name = "@chown",
338 .help = "Change ownership of files and directories",
339 .value =
340 "chown\0"
341 "chown32\0"
342 "fchown\0"
343 "fchown32\0"
344 "fchownat\0"
345 "lchown\0"
346 "lchown32\0"
347 },
348 [SYSCALL_FILTER_SET_CLOCK] = {
349 .name = "@clock",
350 .help = "Change the system time",
351 .value =
352 "adjtimex\0"
353 "clock_adjtime\0"
354 "clock_settime\0"
355 "settimeofday\0"
356 "stime\0"
357 },
358 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
359 .name = "@cpu-emulation",
360 .help = "System calls for CPU emulation functionality",
361 .value =
362 "modify_ldt\0"
363 "subpage_prot\0"
364 "switch_endian\0"
365 "vm86\0"
366 "vm86old\0"
367 },
368 [SYSCALL_FILTER_SET_DEBUG] = {
369 .name = "@debug",
370 .help = "Debugging, performance monitoring and tracing functionality",
371 .value =
372 "lookup_dcookie\0"
373 "perf_event_open\0"
374 "ptrace\0"
375 "rtas\0"
376 #ifdef __NR_s390_runtime_instr
377 "s390_runtime_instr\0"
378 #endif
379 "sys_debug_setcontext\0"
380 },
381 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
382 .name = "@file-system",
383 .help = "File system operations",
384 .value =
385 "access\0"
386 "chdir\0"
387 "chmod\0"
388 "close\0"
389 "creat\0"
390 "faccessat\0"
391 "fallocate\0"
392 "fchdir\0"
393 "fchmod\0"
394 "fchmodat\0"
395 "fcntl\0"
396 "fcntl64\0"
397 "fgetxattr\0"
398 "flistxattr\0"
399 "fremovexattr\0"
400 "fsetxattr\0"
401 "fstat\0"
402 "fstat64\0"
403 "fstatat64\0"
404 "fstatfs\0"
405 "fstatfs64\0"
406 "ftruncate\0"
407 "ftruncate64\0"
408 "futimesat\0"
409 "getcwd\0"
410 "getdents\0"
411 "getdents64\0"
412 "getxattr\0"
413 "inotify_add_watch\0"
414 "inotify_init\0"
415 "inotify_init1\0"
416 "inotify_rm_watch\0"
417 "lgetxattr\0"
418 "link\0"
419 "linkat\0"
420 "listxattr\0"
421 "llistxattr\0"
422 "lremovexattr\0"
423 "lsetxattr\0"
424 "lstat\0"
425 "lstat64\0"
426 "mkdir\0"
427 "mkdirat\0"
428 "mknod\0"
429 "mknodat\0"
430 "mmap\0"
431 "mmap2\0"
432 "munmap\0"
433 "newfstatat\0"
434 "oldfstat\0"
435 "oldlstat\0"
436 "oldstat\0"
437 "open\0"
438 "openat\0"
439 "readlink\0"
440 "readlinkat\0"
441 "removexattr\0"
442 "rename\0"
443 "renameat\0"
444 "renameat2\0"
445 "rmdir\0"
446 "setxattr\0"
447 "stat\0"
448 "stat64\0"
449 "statfs\0"
450 "statfs64\0"
451 #ifdef __NR_statx
452 "statx\0"
453 #endif
454 "symlink\0"
455 "symlinkat\0"
456 "truncate\0"
457 "truncate64\0"
458 "unlink\0"
459 "unlinkat\0"
460 "utime\0"
461 "utimensat\0"
462 "utimes\0"
463 },
464 [SYSCALL_FILTER_SET_IO_EVENT] = {
465 .name = "@io-event",
466 .help = "Event loop system calls",
467 .value =
468 "_newselect\0"
469 "epoll_create\0"
470 "epoll_create1\0"
471 "epoll_ctl\0"
472 "epoll_ctl_old\0"
473 "epoll_pwait\0"
474 "epoll_wait\0"
475 "epoll_wait_old\0"
476 "eventfd\0"
477 "eventfd2\0"
478 "poll\0"
479 "ppoll\0"
480 "pselect6\0"
481 "select\0"
482 },
483 [SYSCALL_FILTER_SET_IPC] = {
484 .name = "@ipc",
485 .help = "SysV IPC, POSIX Message Queues or other IPC",
486 .value =
487 "ipc\0"
488 "memfd_create\0"
489 "mq_getsetattr\0"
490 "mq_notify\0"
491 "mq_open\0"
492 "mq_timedreceive\0"
493 "mq_timedsend\0"
494 "mq_unlink\0"
495 "msgctl\0"
496 "msgget\0"
497 "msgrcv\0"
498 "msgsnd\0"
499 "pipe\0"
500 "pipe2\0"
501 "process_vm_readv\0"
502 "process_vm_writev\0"
503 "semctl\0"
504 "semget\0"
505 "semop\0"
506 "semtimedop\0"
507 "shmat\0"
508 "shmctl\0"
509 "shmdt\0"
510 "shmget\0"
511 },
512 [SYSCALL_FILTER_SET_KEYRING] = {
513 .name = "@keyring",
514 .help = "Kernel keyring access",
515 .value =
516 "add_key\0"
517 "keyctl\0"
518 "request_key\0"
519 },
520 [SYSCALL_FILTER_SET_MEMLOCK] = {
521 .name = "@memlock",
522 .help = "Memory locking control",
523 .value =
524 "mlock\0"
525 "mlock2\0"
526 "mlockall\0"
527 "munlock\0"
528 "munlockall\0"
529 },
530 [SYSCALL_FILTER_SET_MODULE] = {
531 .name = "@module",
532 .help = "Loading and unloading of kernel modules",
533 .value =
534 "delete_module\0"
535 "finit_module\0"
536 "init_module\0"
537 },
538 [SYSCALL_FILTER_SET_MOUNT] = {
539 .name = "@mount",
540 .help = "Mounting and unmounting of file systems",
541 .value =
542 "chroot\0"
543 "mount\0"
544 "pivot_root\0"
545 "umount\0"
546 "umount2\0"
547 },
548 [SYSCALL_FILTER_SET_NETWORK_IO] = {
549 .name = "@network-io",
550 .help = "Network or Unix socket IO, should not be needed if not network facing",
551 .value =
552 "accept\0"
553 "accept4\0"
554 "bind\0"
555 "connect\0"
556 "getpeername\0"
557 "getsockname\0"
558 "getsockopt\0"
559 "listen\0"
560 "recv\0"
561 "recvfrom\0"
562 "recvmmsg\0"
563 "recvmsg\0"
564 "send\0"
565 "sendmmsg\0"
566 "sendmsg\0"
567 "sendto\0"
568 "setsockopt\0"
569 "shutdown\0"
570 "socket\0"
571 "socketcall\0"
572 "socketpair\0"
573 },
574 [SYSCALL_FILTER_SET_OBSOLETE] = {
575 /* some unknown even to libseccomp */
576 .name = "@obsolete",
577 .help = "Unusual, obsolete or unimplemented system calls",
578 .value =
579 "_sysctl\0"
580 "afs_syscall\0"
581 "bdflush\0"
582 "break\0"
583 "create_module\0"
584 "ftime\0"
585 "get_kernel_syms\0"
586 "getpmsg\0"
587 "gtty\0"
588 "idle\0"
589 "lock\0"
590 "mpx\0"
591 "prof\0"
592 "profil\0"
593 "putpmsg\0"
594 "query_module\0"
595 "security\0"
596 "sgetmask\0"
597 "ssetmask\0"
598 "stty\0"
599 "sysfs\0"
600 "tuxcall\0"
601 "ulimit\0"
602 "uselib\0"
603 "ustat\0"
604 "vserver\0"
605 },
606 [SYSCALL_FILTER_SET_PRIVILEGED] = {
607 .name = "@privileged",
608 .help = "All system calls which need super-user capabilities",
609 .value =
610 "@chown\0"
611 "@clock\0"
612 "@module\0"
613 "@raw-io\0"
614 "@reboot\0"
615 "@swap\0"
616 "_sysctl\0"
617 "acct\0"
618 "bpf\0"
619 "capset\0"
620 "chroot\0"
621 "fanotify_init\0"
622 "nfsservctl\0"
623 "open_by_handle_at\0"
624 "pivot_root\0"
625 "quotactl\0"
626 "setdomainname\0"
627 "setfsuid\0"
628 "setfsuid32\0"
629 "setgroups\0"
630 "setgroups32\0"
631 "sethostname\0"
632 "setresuid\0"
633 "setresuid32\0"
634 "setreuid\0"
635 "setreuid32\0"
636 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
637 "setuid32\0"
638 "vhangup\0"
639 },
640 [SYSCALL_FILTER_SET_PROCESS] = {
641 .name = "@process",
642 .help = "Process control, execution, namespaceing operations",
643 .value =
644 "arch_prctl\0"
645 "capget\0" /* Able to query arbitrary processes */
646 "clone\0"
647 "execveat\0"
648 "fork\0"
649 "getrusage\0"
650 "kill\0"
651 "prctl\0"
652 "rt_sigqueueinfo\0"
653 "rt_tgsigqueueinfo\0"
654 "setns\0"
655 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
656 "tgkill\0"
657 "times\0"
658 "tkill\0"
659 "unshare\0"
660 "vfork\0"
661 "wait4\0"
662 "waitid\0"
663 "waitpid\0"
664 },
665 [SYSCALL_FILTER_SET_RAW_IO] = {
666 .name = "@raw-io",
667 .help = "Raw I/O port access",
668 .value =
669 "ioperm\0"
670 "iopl\0"
671 "pciconfig_iobase\0"
672 "pciconfig_read\0"
673 "pciconfig_write\0"
674 #ifdef __NR_s390_pci_mmio_read
675 "s390_pci_mmio_read\0"
676 #endif
677 #ifdef __NR_s390_pci_mmio_write
678 "s390_pci_mmio_write\0"
679 #endif
680 },
681 [SYSCALL_FILTER_SET_REBOOT] = {
682 .name = "@reboot",
683 .help = "Reboot and reboot preparation/kexec",
684 .value =
685 "kexec_file_load\0"
686 "kexec_load\0"
687 "reboot\0"
688 },
689 [SYSCALL_FILTER_SET_RESOURCES] = {
690 .name = "@resources",
691 .help = "Alter resource settings",
692 .value =
693 "ioprio_set\0"
694 "mbind\0"
695 "migrate_pages\0"
696 "move_pages\0"
697 "nice\0"
698 "sched_setaffinity\0"
699 "sched_setattr\0"
700 "sched_setparam\0"
701 "sched_setscheduler\0"
702 "set_mempolicy\0"
703 "setpriority\0"
704 "setrlimit\0"
705 },
706 [SYSCALL_FILTER_SET_SETUID] = {
707 .name = "@setuid",
708 .help = "Operations for changing user/group credentials",
709 .value =
710 "setgid\0"
711 "setgid32\0"
712 "setgroups\0"
713 "setgroups32\0"
714 "setregid\0"
715 "setregid32\0"
716 "setresgid\0"
717 "setresgid32\0"
718 "setresuid\0"
719 "setresuid32\0"
720 "setreuid\0"
721 "setreuid32\0"
722 "setuid\0"
723 "setuid32\0"
724 },
725 [SYSCALL_FILTER_SET_SIGNAL] = {
726 .name = "@signal",
727 .help = "Process signal handling",
728 .value =
729 "rt_sigaction\0"
730 "rt_sigpending\0"
731 "rt_sigprocmask\0"
732 "rt_sigsuspend\0"
733 "rt_sigtimedwait\0"
734 "sigaction\0"
735 "sigaltstack\0"
736 "signal\0"
737 "signalfd\0"
738 "signalfd4\0"
739 "sigpending\0"
740 "sigprocmask\0"
741 "sigsuspend\0"
742 },
743 [SYSCALL_FILTER_SET_SWAP] = {
744 .name = "@swap",
745 .help = "Enable/disable swap devices",
746 .value =
747 "swapoff\0"
748 "swapon\0"
749 },
750 [SYSCALL_FILTER_SET_SYNC] = {
751 .name = "@sync",
752 .help = "Synchronize files and memory to storage",
753 .value =
754 "fdatasync\0"
755 "fsync\0"
756 "msync\0"
757 "sync\0"
758 "sync_file_range\0"
759 "syncfs\0"
760 },
761 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
762 .name = "@system-service",
763 .help = "General system service operations",
764 .value =
765 "@aio\0"
766 "@basic-io\0"
767 "@chown\0"
768 "@default\0"
769 "@file-system\0"
770 "@io-event\0"
771 "@ipc\0"
772 "@keyring\0"
773 "@memlock\0"
774 "@network-io\0"
775 "@process\0"
776 "@resources\0"
777 "@setuid\0"
778 "@signal\0"
779 "@sync\0"
780 "@timer\0"
781 "brk\0"
782 "capget\0"
783 "capset\0"
784 "copy_file_range\0"
785 "fadvise64\0"
786 "fadvise64_64\0"
787 "flock\0"
788 "get_mempolicy\0"
789 "getcpu\0"
790 "getpriority\0"
791 "getrandom\0"
792 "ioctl\0"
793 "ioprio_get\0"
794 "kcmp\0"
795 "madvise\0"
796 "mincore\0"
797 "mprotect\0"
798 "mremap\0"
799 "name_to_handle_at\0"
800 "oldolduname\0"
801 "olduname\0"
802 "personality\0"
803 "readahead\0"
804 "readdir\0"
805 "remap_file_pages\0"
806 "sched_get_priority_max\0"
807 "sched_get_priority_min\0"
808 "sched_getaffinity\0"
809 "sched_getattr\0"
810 "sched_getparam\0"
811 "sched_getscheduler\0"
812 "sched_rr_get_interval\0"
813 "sched_yield\0"
814 "sendfile\0"
815 "sendfile64\0"
816 "setfsgid\0"
817 "setfsgid32\0"
818 "setfsuid\0"
819 "setfsuid32\0"
820 "setpgid\0"
821 "setsid\0"
822 "splice\0"
823 "sysinfo\0"
824 "tee\0"
825 "umask\0"
826 "uname\0"
827 "userfaultfd\0"
828 "vmsplice\0"
829 },
830 [SYSCALL_FILTER_SET_TIMER] = {
831 .name = "@timer",
832 .help = "Schedule operations by time",
833 .value =
834 "alarm\0"
835 "getitimer\0"
836 "setitimer\0"
837 "timer_create\0"
838 "timer_delete\0"
839 "timer_getoverrun\0"
840 "timer_gettime\0"
841 "timer_settime\0"
842 "timerfd_create\0"
843 "timerfd_gettime\0"
844 "timerfd_settime\0"
845 "times\0"
846 },
847 };
848
849 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
850 unsigned i;
851
852 if (isempty(name) || name[0] != '@')
853 return NULL;
854
855 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
856 if (streq(syscall_filter_sets[i].name, name))
857 return syscall_filter_sets + i;
858
859 return NULL;
860 }
861
862 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
863
864 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
865 assert(seccomp);
866 assert(name);
867
868 if (strv_contains(exclude, name))
869 return 0;
870
871 if (name[0] == '@') {
872 const SyscallFilterSet *other;
873
874 other = syscall_filter_set_find(name);
875 if (!other)
876 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL),
877 "Filter set %s is not known!",
878 name);
879
880 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
881
882 } else {
883 int id, r;
884
885 id = seccomp_syscall_resolve_name(name);
886 if (id == __NR_SCMP_ERROR) {
887 if (log_missing)
888 log_debug("System call %s is not known, ignoring.", name);
889 return 0;
890 }
891
892 r = seccomp_rule_add_exact(seccomp, action, id, 0);
893 if (r < 0) {
894 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
895 bool ignore = r == -EDOM;
896
897 if (!ignore || log_missing)
898 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
899 name, id, ignore ? ", ignoring" : "");
900 if (!ignore)
901 return r;
902 }
903
904 return 0;
905 }
906 }
907
908 static int seccomp_add_syscall_filter_set(
909 scmp_filter_ctx seccomp,
910 const SyscallFilterSet *set,
911 uint32_t action,
912 char **exclude,
913 bool log_missing) {
914
915 const char *sys;
916 int r;
917
918 assert(seccomp);
919 assert(set);
920
921 NULSTR_FOREACH(sys, set->value) {
922 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
923 if (r < 0)
924 return r;
925 }
926
927 return 0;
928 }
929
930 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
931 uint32_t arch;
932 int r;
933
934 assert(set);
935
936 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
937 * each local arch. */
938
939 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
940 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
941
942 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
943
944 r = seccomp_init_for_arch(&seccomp, arch, default_action);
945 if (r < 0)
946 return r;
947
948 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
949 if (r < 0)
950 return log_debug_errno(r, "Failed to add filter set: %m");
951
952 r = seccomp_load(seccomp);
953 if (IN_SET(r, -EPERM, -EACCES))
954 return r;
955 if (r < 0)
956 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
957 }
958
959 return 0;
960 }
961
962 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
963 uint32_t arch;
964 int r;
965
966 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
967 * SyscallFilterSet* table. */
968
969 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
970 return 0;
971
972 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
973 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
974 Iterator i;
975 void *syscall_id, *val;
976
977 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
978
979 r = seccomp_init_for_arch(&seccomp, arch, default_action);
980 if (r < 0)
981 return r;
982
983 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
984 uint32_t a = action;
985 int id = PTR_TO_INT(syscall_id) - 1;
986 int error = PTR_TO_INT(val);
987
988 if (action != SCMP_ACT_ALLOW && error >= 0)
989 a = SCMP_ACT_ERRNO(error);
990
991 r = seccomp_rule_add_exact(seccomp, a, id, 0);
992 if (r < 0) {
993 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
994 _cleanup_free_ char *n = NULL;
995 bool ignore;
996
997 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
998 ignore = r == -EDOM;
999 if (!ignore || log_missing)
1000 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1001 strna(n), id, ignore ? ", ignoring" : "");
1002 if (!ignore)
1003 return r;
1004 }
1005 }
1006
1007 r = seccomp_load(seccomp);
1008 if (IN_SET(r, -EPERM, -EACCES))
1009 return r;
1010 if (r < 0)
1011 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1012 }
1013
1014 return 0;
1015 }
1016
1017 int seccomp_parse_syscall_filter_full(
1018 const char *name,
1019 int errno_num,
1020 Hashmap *filter,
1021 SeccompParseFlags flags,
1022 const char *unit,
1023 const char *filename,
1024 unsigned line) {
1025
1026 int r;
1027
1028 assert(name);
1029 assert(filter);
1030
1031 if (name[0] == '@') {
1032 const SyscallFilterSet *set;
1033 const char *i;
1034
1035 set = syscall_filter_set_find(name);
1036 if (!set) {
1037 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1038 return -EINVAL;
1039
1040 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1041 "Unknown system call group, ignoring: %s", name);
1042 return 0;
1043 }
1044
1045 NULSTR_FOREACH(i, set->value) {
1046 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1047 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1048 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1049 * about them. */
1050 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1051 if (r < 0)
1052 return r;
1053 }
1054 } else {
1055 int id;
1056
1057 id = seccomp_syscall_resolve_name(name);
1058 if (id == __NR_SCMP_ERROR) {
1059 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1060 return -EINVAL;
1061
1062 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1063 "Failed to parse system call, ignoring: %s", name);
1064 return 0;
1065 }
1066
1067 /* If we previously wanted to forbid a syscall and now
1068 * we want to allow it, then remove it from the list. */
1069 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1070 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1071 if (r < 0)
1072 switch (r) {
1073 case -ENOMEM:
1074 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1075 case -EEXIST:
1076 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1077 break;
1078 default:
1079 return r;
1080 }
1081 } else
1082 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1083 }
1084
1085 return 0;
1086 }
1087
1088 int seccomp_restrict_namespaces(unsigned long retain) {
1089 uint32_t arch;
1090 int r;
1091
1092 if (DEBUG_LOGGING) {
1093 _cleanup_free_ char *s = NULL;
1094
1095 (void) namespace_flags_to_string(retain, &s);
1096 log_debug("Restricting namespace to: %s.", strna(s));
1097 }
1098
1099 /* NOOP? */
1100 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1101 return 0;
1102
1103 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1104 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1105 unsigned i;
1106
1107 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1108
1109 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1110 if (r < 0)
1111 return r;
1112
1113 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1114 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1115 * altogether. */
1116 r = seccomp_rule_add_exact(
1117 seccomp,
1118 SCMP_ACT_ERRNO(EPERM),
1119 SCMP_SYS(setns),
1120 0);
1121 else
1122 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1123 * special invocation with a zero flags argument, right here. */
1124 r = seccomp_rule_add_exact(
1125 seccomp,
1126 SCMP_ACT_ERRNO(EPERM),
1127 SCMP_SYS(setns),
1128 1,
1129 SCMP_A1(SCMP_CMP_EQ, 0));
1130 if (r < 0) {
1131 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1132 continue;
1133 }
1134
1135 for (i = 0; namespace_flag_map[i].name; i++) {
1136 unsigned long f;
1137
1138 f = namespace_flag_map[i].flag;
1139 if ((retain & f) == f) {
1140 log_debug("Permitting %s.", namespace_flag_map[i].name);
1141 continue;
1142 }
1143
1144 log_debug("Blocking %s.", namespace_flag_map[i].name);
1145
1146 r = seccomp_rule_add_exact(
1147 seccomp,
1148 SCMP_ACT_ERRNO(EPERM),
1149 SCMP_SYS(unshare),
1150 1,
1151 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1152 if (r < 0) {
1153 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1154 break;
1155 }
1156
1157 /* On s390/s390x the first two parameters to clone are switched */
1158 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1159 r = seccomp_rule_add_exact(
1160 seccomp,
1161 SCMP_ACT_ERRNO(EPERM),
1162 SCMP_SYS(clone),
1163 1,
1164 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1165 else
1166 r = seccomp_rule_add_exact(
1167 seccomp,
1168 SCMP_ACT_ERRNO(EPERM),
1169 SCMP_SYS(clone),
1170 1,
1171 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1172 if (r < 0) {
1173 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1174 break;
1175 }
1176
1177 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1178 r = seccomp_rule_add_exact(
1179 seccomp,
1180 SCMP_ACT_ERRNO(EPERM),
1181 SCMP_SYS(setns),
1182 1,
1183 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1184 if (r < 0) {
1185 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1186 break;
1187 }
1188 }
1189 }
1190 if (r < 0)
1191 continue;
1192
1193 r = seccomp_load(seccomp);
1194 if (IN_SET(r, -EPERM, -EACCES))
1195 return r;
1196 if (r < 0)
1197 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1198 }
1199
1200 return 0;
1201 }
1202
1203 int seccomp_protect_sysctl(void) {
1204 uint32_t arch;
1205 int r;
1206
1207 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1208 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1209
1210 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1211
1212 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1213 /* No _sysctl syscall */
1214 continue;
1215
1216 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1217 if (r < 0)
1218 return r;
1219
1220 r = seccomp_rule_add_exact(
1221 seccomp,
1222 SCMP_ACT_ERRNO(EPERM),
1223 SCMP_SYS(_sysctl),
1224 0);
1225 if (r < 0) {
1226 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1227 continue;
1228 }
1229
1230 r = seccomp_load(seccomp);
1231 if (IN_SET(r, -EPERM, -EACCES))
1232 return r;
1233 if (r < 0)
1234 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1235 }
1236
1237 return 0;
1238 }
1239
1240 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1241 uint32_t arch;
1242 int r;
1243
1244 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1245 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1246 bool supported;
1247 Iterator i;
1248
1249 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1250
1251 switch (arch) {
1252
1253 case SCMP_ARCH_X86_64:
1254 case SCMP_ARCH_X32:
1255 case SCMP_ARCH_ARM:
1256 case SCMP_ARCH_AARCH64:
1257 case SCMP_ARCH_PPC:
1258 case SCMP_ARCH_PPC64:
1259 case SCMP_ARCH_PPC64LE:
1260 case SCMP_ARCH_MIPSEL64N32:
1261 case SCMP_ARCH_MIPS64N32:
1262 case SCMP_ARCH_MIPSEL64:
1263 case SCMP_ARCH_MIPS64:
1264 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1265 supported = true;
1266 break;
1267
1268 case SCMP_ARCH_S390:
1269 case SCMP_ARCH_S390X:
1270 case SCMP_ARCH_X86:
1271 case SCMP_ARCH_MIPSEL:
1272 case SCMP_ARCH_MIPS:
1273 default:
1274 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1275 * don't know */
1276 supported = false;
1277 break;
1278 }
1279
1280 if (!supported)
1281 continue;
1282
1283 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1284 if (r < 0)
1285 return r;
1286
1287 if (whitelist) {
1288 int af, first = 0, last = 0;
1289 void *afp;
1290
1291 /* If this is a whitelist, we first block the address families that are out of range and then
1292 * everything that is not in the set. First, we find the lowest and highest address family in
1293 * the set. */
1294
1295 SET_FOREACH(afp, address_families, i) {
1296 af = PTR_TO_INT(afp);
1297
1298 if (af <= 0 || af >= af_max())
1299 continue;
1300
1301 if (first == 0 || af < first)
1302 first = af;
1303
1304 if (last == 0 || af > last)
1305 last = af;
1306 }
1307
1308 assert((first == 0) == (last == 0));
1309
1310 if (first == 0) {
1311
1312 /* No entries in the valid range, block everything */
1313 r = seccomp_rule_add_exact(
1314 seccomp,
1315 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1316 SCMP_SYS(socket),
1317 0);
1318 if (r < 0) {
1319 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1320 continue;
1321 }
1322
1323 } else {
1324
1325 /* Block everything below the first entry */
1326 r = seccomp_rule_add_exact(
1327 seccomp,
1328 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1329 SCMP_SYS(socket),
1330 1,
1331 SCMP_A0(SCMP_CMP_LT, first));
1332 if (r < 0) {
1333 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1334 continue;
1335 }
1336
1337 /* Block everything above the last entry */
1338 r = seccomp_rule_add_exact(
1339 seccomp,
1340 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1341 SCMP_SYS(socket),
1342 1,
1343 SCMP_A0(SCMP_CMP_GT, last));
1344 if (r < 0) {
1345 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1346 continue;
1347 }
1348
1349 /* Block everything between the first and last entry */
1350 for (af = 1; af < af_max(); af++) {
1351
1352 if (set_contains(address_families, INT_TO_PTR(af)))
1353 continue;
1354
1355 r = seccomp_rule_add_exact(
1356 seccomp,
1357 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1358 SCMP_SYS(socket),
1359 1,
1360 SCMP_A0(SCMP_CMP_EQ, af));
1361 if (r < 0)
1362 break;
1363 }
1364 if (r < 0) {
1365 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1366 continue;
1367 }
1368 }
1369
1370 } else {
1371 void *af;
1372
1373 /* If this is a blacklist, then generate one rule for
1374 * each address family that are then combined in OR
1375 * checks. */
1376
1377 SET_FOREACH(af, address_families, i) {
1378
1379 r = seccomp_rule_add_exact(
1380 seccomp,
1381 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1382 SCMP_SYS(socket),
1383 1,
1384 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1385 if (r < 0)
1386 break;
1387 }
1388 if (r < 0) {
1389 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1390 continue;
1391 }
1392 }
1393
1394 r = seccomp_load(seccomp);
1395 if (IN_SET(r, -EPERM, -EACCES))
1396 return r;
1397 if (r < 0)
1398 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1399 }
1400
1401 return 0;
1402 }
1403
1404 int seccomp_restrict_realtime(void) {
1405 static const int permitted_policies[] = {
1406 SCHED_OTHER,
1407 SCHED_BATCH,
1408 SCHED_IDLE,
1409 };
1410
1411 int r, max_policy = 0;
1412 uint32_t arch;
1413 unsigned i;
1414
1415 /* Determine the highest policy constant we want to allow */
1416 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1417 if (permitted_policies[i] > max_policy)
1418 max_policy = permitted_policies[i];
1419
1420 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1421 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1422 int p;
1423
1424 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1425
1426 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1427 if (r < 0)
1428 return r;
1429
1430 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1431 * whitelist. */
1432 for (p = 0; p < max_policy; p++) {
1433 bool good = false;
1434
1435 /* Check if this is in the whitelist. */
1436 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1437 if (permitted_policies[i] == p) {
1438 good = true;
1439 break;
1440 }
1441
1442 if (good)
1443 continue;
1444
1445 /* Deny this policy */
1446 r = seccomp_rule_add_exact(
1447 seccomp,
1448 SCMP_ACT_ERRNO(EPERM),
1449 SCMP_SYS(sched_setscheduler),
1450 1,
1451 SCMP_A1(SCMP_CMP_EQ, p));
1452 if (r < 0) {
1453 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1454 continue;
1455 }
1456 }
1457
1458 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1459 * unsigned here, hence no need no check for < 0 values. */
1460 r = seccomp_rule_add_exact(
1461 seccomp,
1462 SCMP_ACT_ERRNO(EPERM),
1463 SCMP_SYS(sched_setscheduler),
1464 1,
1465 SCMP_A1(SCMP_CMP_GT, max_policy));
1466 if (r < 0) {
1467 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1468 continue;
1469 }
1470
1471 r = seccomp_load(seccomp);
1472 if (IN_SET(r, -EPERM, -EACCES))
1473 return r;
1474 if (r < 0)
1475 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1476 }
1477
1478 return 0;
1479 }
1480
1481 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1482 uint32_t arch,
1483 int nr,
1484 unsigned arg_cnt,
1485 const struct scmp_arg_cmp arg) {
1486 int r;
1487
1488 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1489 if (r < 0) {
1490 _cleanup_free_ char *n = NULL;
1491
1492 n = seccomp_syscall_resolve_num_arch(arch, nr);
1493 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1494 strna(n),
1495 seccomp_arch_to_string(arch));
1496 }
1497
1498 return r;
1499 }
1500
1501 /* For known architectures, check that syscalls are indeed defined or not. */
1502 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1503 assert_cc(SCMP_SYS(shmget) > 0);
1504 assert_cc(SCMP_SYS(shmat) > 0);
1505 assert_cc(SCMP_SYS(shmdt) > 0);
1506 #elif defined(__i386__) || defined(__powerpc64__)
1507 assert_cc(SCMP_SYS(shmget) < 0);
1508 assert_cc(SCMP_SYS(shmat) < 0);
1509 assert_cc(SCMP_SYS(shmdt) < 0);
1510 #endif
1511
1512 int seccomp_memory_deny_write_execute(void) {
1513
1514 uint32_t arch;
1515 int r;
1516
1517 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1518 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1519 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1520
1521 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1522
1523 switch (arch) {
1524
1525 case SCMP_ARCH_X86:
1526 filter_syscall = SCMP_SYS(mmap2);
1527 block_syscall = SCMP_SYS(mmap);
1528 break;
1529
1530 case SCMP_ARCH_PPC:
1531 case SCMP_ARCH_PPC64:
1532 case SCMP_ARCH_PPC64LE:
1533 filter_syscall = SCMP_SYS(mmap);
1534
1535 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1536 * We ignore that here, which means there's still a way to get writable/executable
1537 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1538
1539 break;
1540
1541 case SCMP_ARCH_ARM:
1542 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1543 shmat_syscall = SCMP_SYS(shmat);
1544 break;
1545
1546 case SCMP_ARCH_X86_64:
1547 case SCMP_ARCH_X32:
1548 case SCMP_ARCH_AARCH64:
1549 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1550 shmat_syscall = SCMP_SYS(shmat);
1551 break;
1552
1553 /* Please add more definitions here, if you port systemd to other architectures! */
1554
1555 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1556 #warning "Consider adding the right mmap() syscall definitions here!"
1557 #endif
1558 }
1559
1560 /* Can't filter mmap() on this arch, then skip it */
1561 if (filter_syscall == 0)
1562 continue;
1563
1564 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1565 if (r < 0)
1566 return r;
1567
1568 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1569 1,
1570 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1571 if (r < 0)
1572 continue;
1573
1574 if (block_syscall != 0) {
1575 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1576 if (r < 0)
1577 continue;
1578 }
1579
1580 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1581 1,
1582 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1583 if (r < 0)
1584 continue;
1585
1586 #ifdef __NR_pkey_mprotect
1587 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1588 1,
1589 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1590 if (r < 0)
1591 continue;
1592 #endif
1593
1594 if (shmat_syscall != 0) {
1595 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1596 1,
1597 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1598 if (r < 0)
1599 continue;
1600 }
1601
1602 r = seccomp_load(seccomp);
1603 if (IN_SET(r, -EPERM, -EACCES))
1604 return r;
1605 if (r < 0)
1606 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1607 }
1608
1609 return 0;
1610 }
1611
1612 int seccomp_restrict_archs(Set *archs) {
1613 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1614 Iterator i;
1615 void *id;
1616 int r;
1617
1618 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1619 * list.
1620 *
1621 * There are some qualifications. However the most important use is to stop processes from bypassing
1622 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1623 * in a non-native architecture. There are no holes in this use case, at least so far. */
1624
1625 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1626 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1627 * to run a program with the restrictions applied. */
1628 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1629 if (!seccomp)
1630 return -ENOMEM;
1631
1632 SET_FOREACH(id, archs, i) {
1633 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1634 if (r < 0 && r != -EEXIST)
1635 return r;
1636 }
1637
1638 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1639 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1640 * The important thing is that you can block the old 32-bit x86 syscalls.
1641 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1642
1643 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1644 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1645
1646 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1647 if (r < 0 && r != -EEXIST)
1648 return r;
1649 }
1650
1651 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1652 if (r < 0)
1653 return r;
1654
1655 r = seccomp_load(seccomp);
1656 if (IN_SET(r, -EPERM, -EACCES))
1657 return r;
1658 if (r < 0)
1659 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1660
1661 return 0;
1662 }
1663
1664 int parse_syscall_archs(char **l, Set **archs) {
1665 _cleanup_set_free_ Set *_archs;
1666 char **s;
1667 int r;
1668
1669 assert(l);
1670 assert(archs);
1671
1672 r = set_ensure_allocated(&_archs, NULL);
1673 if (r < 0)
1674 return r;
1675
1676 STRV_FOREACH(s, l) {
1677 uint32_t a;
1678
1679 r = seccomp_arch_from_string(*s, &a);
1680 if (r < 0)
1681 return -EINVAL;
1682
1683 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1684 if (r < 0)
1685 return -ENOMEM;
1686 }
1687
1688 *archs = TAKE_PTR(_archs);
1689
1690 return 0;
1691 }
1692
1693 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1694 const char *i;
1695 int r;
1696
1697 assert(set);
1698
1699 NULSTR_FOREACH(i, set->value) {
1700
1701 if (i[0] == '@') {
1702 const SyscallFilterSet *more;
1703
1704 more = syscall_filter_set_find(i);
1705 if (!more)
1706 return -ENXIO;
1707
1708 r = seccomp_filter_set_add(filter, add, more);
1709 if (r < 0)
1710 return r;
1711 } else {
1712 int id;
1713
1714 id = seccomp_syscall_resolve_name(i);
1715 if (id == __NR_SCMP_ERROR) {
1716 log_debug("Couldn't resolve system call, ignoring: %s", i);
1717 continue;
1718 }
1719
1720 if (add) {
1721 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1722 if (r < 0)
1723 return r;
1724 } else
1725 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1726 }
1727 }
1728
1729 return 0;
1730 }
1731
1732 int seccomp_lock_personality(unsigned long personality) {
1733 uint32_t arch;
1734 int r;
1735
1736 if (personality >= PERSONALITY_INVALID)
1737 return -EINVAL;
1738
1739 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1740 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1741
1742 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1743 if (r < 0)
1744 return r;
1745
1746 r = seccomp_rule_add_exact(
1747 seccomp,
1748 SCMP_ACT_ERRNO(EPERM),
1749 SCMP_SYS(personality),
1750 1,
1751 SCMP_A0(SCMP_CMP_NE, personality));
1752 if (r < 0) {
1753 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1754 continue;
1755 }
1756
1757 r = seccomp_load(seccomp);
1758 if (IN_SET(r, -EPERM, -EACCES))
1759 return r;
1760 if (r < 0)
1761 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1762 }
1763
1764 return 0;
1765 }