]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
pkgconfig: define variables relative to ${prefix}/${rootprefix}/${sysconfdir}
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <errno.h>
4 #include <linux/seccomp.h>
5 #include <seccomp.h>
6 #include <stddef.h>
7 #include <sys/mman.h>
8 #include <sys/prctl.h>
9 #include <sys/shm.h>
10
11 #include "af-list.h"
12 #include "alloc-util.h"
13 #include "macro.h"
14 #include "nsflags.h"
15 #include "process-util.h"
16 #include "seccomp-util.h"
17 #include "set.h"
18 #include "string-util.h"
19 #include "strv.h"
20 #include "util.h"
21 #include "errno-list.h"
22
23 const uint32_t seccomp_local_archs[] = {
24
25 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
26
27 #if defined(__x86_64__) && defined(__ILP32__)
28 SCMP_ARCH_X86,
29 SCMP_ARCH_X86_64,
30 SCMP_ARCH_X32, /* native */
31 #elif defined(__x86_64__) && !defined(__ILP32__)
32 SCMP_ARCH_X86,
33 SCMP_ARCH_X32,
34 SCMP_ARCH_X86_64, /* native */
35 #elif defined(__i386__)
36 SCMP_ARCH_X86,
37 #elif defined(__aarch64__)
38 SCMP_ARCH_ARM,
39 SCMP_ARCH_AARCH64, /* native */
40 #elif defined(__arm__)
41 SCMP_ARCH_ARM,
42 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
43 SCMP_ARCH_MIPSEL,
44 SCMP_ARCH_MIPS, /* native */
45 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPS,
47 SCMP_ARCH_MIPSEL, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
49 SCMP_ARCH_MIPSEL,
50 SCMP_ARCH_MIPS,
51 SCMP_ARCH_MIPSEL64N32,
52 SCMP_ARCH_MIPS64N32,
53 SCMP_ARCH_MIPSEL64,
54 SCMP_ARCH_MIPS64, /* native */
55 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
56 SCMP_ARCH_MIPS,
57 SCMP_ARCH_MIPSEL,
58 SCMP_ARCH_MIPS64N32,
59 SCMP_ARCH_MIPSEL64N32,
60 SCMP_ARCH_MIPS64,
61 SCMP_ARCH_MIPSEL64, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
63 SCMP_ARCH_MIPSEL,
64 SCMP_ARCH_MIPS,
65 SCMP_ARCH_MIPSEL64,
66 SCMP_ARCH_MIPS64,
67 SCMP_ARCH_MIPSEL64N32,
68 SCMP_ARCH_MIPS64N32, /* native */
69 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
70 SCMP_ARCH_MIPS,
71 SCMP_ARCH_MIPSEL,
72 SCMP_ARCH_MIPS64,
73 SCMP_ARCH_MIPSEL64,
74 SCMP_ARCH_MIPS64N32,
75 SCMP_ARCH_MIPSEL64N32, /* native */
76 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
77 SCMP_ARCH_PPC,
78 SCMP_ARCH_PPC64LE,
79 SCMP_ARCH_PPC64, /* native */
80 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
81 SCMP_ARCH_PPC,
82 SCMP_ARCH_PPC64,
83 SCMP_ARCH_PPC64LE, /* native */
84 #elif defined(__powerpc__)
85 SCMP_ARCH_PPC,
86 #elif defined(__s390x__)
87 SCMP_ARCH_S390,
88 SCMP_ARCH_S390X, /* native */
89 #elif defined(__s390__)
90 SCMP_ARCH_S390,
91 #endif
92 (uint32_t) -1
93 };
94
95 const char* seccomp_arch_to_string(uint32_t c) {
96 /* Maintain order used in <seccomp.h>.
97 *
98 * Names used here should be the same as those used for ConditionArchitecture=,
99 * except for "subarchitectures" like x32. */
100
101 switch(c) {
102 case SCMP_ARCH_NATIVE:
103 return "native";
104 case SCMP_ARCH_X86:
105 return "x86";
106 case SCMP_ARCH_X86_64:
107 return "x86-64";
108 case SCMP_ARCH_X32:
109 return "x32";
110 case SCMP_ARCH_ARM:
111 return "arm";
112 case SCMP_ARCH_AARCH64:
113 return "arm64";
114 case SCMP_ARCH_MIPS:
115 return "mips";
116 case SCMP_ARCH_MIPS64:
117 return "mips64";
118 case SCMP_ARCH_MIPS64N32:
119 return "mips64-n32";
120 case SCMP_ARCH_MIPSEL:
121 return "mips-le";
122 case SCMP_ARCH_MIPSEL64:
123 return "mips64-le";
124 case SCMP_ARCH_MIPSEL64N32:
125 return "mips64-le-n32";
126 case SCMP_ARCH_PPC:
127 return "ppc";
128 case SCMP_ARCH_PPC64:
129 return "ppc64";
130 case SCMP_ARCH_PPC64LE:
131 return "ppc64-le";
132 case SCMP_ARCH_S390:
133 return "s390";
134 case SCMP_ARCH_S390X:
135 return "s390x";
136 default:
137 return NULL;
138 }
139 }
140
141 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
142 if (!n)
143 return -EINVAL;
144
145 assert(ret);
146
147 if (streq(n, "native"))
148 *ret = SCMP_ARCH_NATIVE;
149 else if (streq(n, "x86"))
150 *ret = SCMP_ARCH_X86;
151 else if (streq(n, "x86-64"))
152 *ret = SCMP_ARCH_X86_64;
153 else if (streq(n, "x32"))
154 *ret = SCMP_ARCH_X32;
155 else if (streq(n, "arm"))
156 *ret = SCMP_ARCH_ARM;
157 else if (streq(n, "arm64"))
158 *ret = SCMP_ARCH_AARCH64;
159 else if (streq(n, "mips"))
160 *ret = SCMP_ARCH_MIPS;
161 else if (streq(n, "mips64"))
162 *ret = SCMP_ARCH_MIPS64;
163 else if (streq(n, "mips64-n32"))
164 *ret = SCMP_ARCH_MIPS64N32;
165 else if (streq(n, "mips-le"))
166 *ret = SCMP_ARCH_MIPSEL;
167 else if (streq(n, "mips64-le"))
168 *ret = SCMP_ARCH_MIPSEL64;
169 else if (streq(n, "mips64-le-n32"))
170 *ret = SCMP_ARCH_MIPSEL64N32;
171 else if (streq(n, "ppc"))
172 *ret = SCMP_ARCH_PPC;
173 else if (streq(n, "ppc64"))
174 *ret = SCMP_ARCH_PPC64;
175 else if (streq(n, "ppc64-le"))
176 *ret = SCMP_ARCH_PPC64LE;
177 else if (streq(n, "s390"))
178 *ret = SCMP_ARCH_S390;
179 else if (streq(n, "s390x"))
180 *ret = SCMP_ARCH_S390X;
181 else
182 return -EINVAL;
183
184 return 0;
185 }
186
187 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
188 scmp_filter_ctx seccomp;
189 int r;
190
191 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
192 * any others. Also, turns off the NNP fiddling. */
193
194 seccomp = seccomp_init(default_action);
195 if (!seccomp)
196 return -ENOMEM;
197
198 if (arch != SCMP_ARCH_NATIVE &&
199 arch != seccomp_arch_native()) {
200
201 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
202 if (r < 0)
203 goto finish;
204
205 r = seccomp_arch_add(seccomp, arch);
206 if (r < 0)
207 goto finish;
208
209 assert(seccomp_arch_exist(seccomp, arch) >= 0);
210 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
211 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
212 } else {
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
215 }
216
217 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
218 if (r < 0)
219 goto finish;
220
221 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
222 if (r < 0)
223 goto finish;
224
225 *ret = seccomp;
226 return 0;
227
228 finish:
229 seccomp_release(seccomp);
230 return r;
231 }
232
static bool is_basic_seccomp_available(void) {
        int mode;

        /* Query the seccomp mode of the calling process; any non-failing answer counts as "available" */
        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
236
static bool is_seccomp_filter_available(void) {
        /* Probe for filter mode by passing a NULL filter pointer. The call is not supposed to succeed; only
         * a failure with EFAULT is taken as a sign that filter mode is supported. */

        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
241
bool is_seccomp_available(void) {
        /* Negative means "not probed yet"; afterwards this holds the result of the two checks below */
        static int cached = -1;

        if (cached >= 0)
                return cached;

        cached = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached;
}
252
253 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
254 [SYSCALL_FILTER_SET_DEFAULT] = {
255 .name = "@default",
256 .help = "System calls that are always permitted",
257 .value =
258 "clock_getres\0"
259 "clock_gettime\0"
260 "clock_nanosleep\0"
261 "execve\0"
262 "exit\0"
263 "exit_group\0"
264 "futex\0"
265 "get_robust_list\0"
266 "get_thread_area\0"
267 "getegid\0"
268 "getegid32\0"
269 "geteuid\0"
270 "geteuid32\0"
271 "getgid\0"
272 "getgid32\0"
273 "getgroups\0"
274 "getgroups32\0"
275 "getpgid\0"
276 "getpgrp\0"
277 "getpid\0"
278 "getppid\0"
279 "getresgid\0"
280 "getresgid32\0"
281 "getresuid\0"
282 "getresuid32\0"
283 "getrlimit\0" /* make sure processes can query stack size and such */
284 "getsid\0"
285 "gettid\0"
286 "gettimeofday\0"
287 "getuid\0"
288 "getuid32\0"
289 "membarrier\0"
290 "nanosleep\0"
291 "pause\0"
292 "prlimit64\0"
293 "restart_syscall\0"
294 "rt_sigreturn\0"
295 "sched_yield\0"
296 "set_robust_list\0"
297 "set_thread_area\0"
298 "set_tid_address\0"
299 "set_tls\0"
300 "sigreturn\0"
301 "time\0"
302 "ugetrlimit\0"
303 },
304 [SYSCALL_FILTER_SET_AIO] = {
305 .name = "@aio",
306 .help = "Asynchronous IO",
307 .value =
308 "io_cancel\0"
309 "io_destroy\0"
310 "io_getevents\0"
311 "io_pgetevents\0"
312 "io_setup\0"
313 "io_submit\0"
314 },
315 [SYSCALL_FILTER_SET_BASIC_IO] = {
316 .name = "@basic-io",
317 .help = "Basic IO",
318 .value =
319 "_llseek\0"
320 "close\0"
321 "dup\0"
322 "dup2\0"
323 "dup3\0"
324 "lseek\0"
325 "pread64\0"
326 "preadv\0"
327 "preadv2\0"
328 "pwrite64\0"
329 "pwritev\0"
330 "pwritev2\0"
331 "read\0"
332 "readv\0"
333 "write\0"
334 "writev\0"
335 },
336 [SYSCALL_FILTER_SET_CHOWN] = {
337 .name = "@chown",
338 .help = "Change ownership of files and directories",
339 .value =
340 "chown\0"
341 "chown32\0"
342 "fchown\0"
343 "fchown32\0"
344 "fchownat\0"
345 "lchown\0"
346 "lchown32\0"
347 },
348 [SYSCALL_FILTER_SET_CLOCK] = {
349 .name = "@clock",
350 .help = "Change the system time",
351 .value =
352 "adjtimex\0"
353 "clock_adjtime\0"
354 "clock_settime\0"
355 "settimeofday\0"
356 "stime\0"
357 },
358 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
359 .name = "@cpu-emulation",
360 .help = "System calls for CPU emulation functionality",
361 .value =
362 "modify_ldt\0"
363 "subpage_prot\0"
364 "switch_endian\0"
365 "vm86\0"
366 "vm86old\0"
367 },
368 [SYSCALL_FILTER_SET_DEBUG] = {
369 .name = "@debug",
370 .help = "Debugging, performance monitoring and tracing functionality",
371 .value =
372 "lookup_dcookie\0"
373 "perf_event_open\0"
374 "process_vm_readv\0"
375 "process_vm_writev\0"
376 "ptrace\0"
377 "rtas\0"
378 #ifdef __NR_s390_runtime_instr
379 "s390_runtime_instr\0"
380 #endif
381 "sys_debug_setcontext\0"
382 },
383 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
384 .name = "@file-system",
385 .help = "File system operations",
386 .value =
387 "access\0"
388 "chdir\0"
389 "chmod\0"
390 "close\0"
391 "creat\0"
392 "faccessat\0"
393 "fallocate\0"
394 "fchdir\0"
395 "fchmod\0"
396 "fchmodat\0"
397 "fcntl\0"
398 "fcntl64\0"
399 "fgetxattr\0"
400 "flistxattr\0"
401 "fremovexattr\0"
402 "fsetxattr\0"
403 "fstat\0"
404 "fstat64\0"
405 "fstatat64\0"
406 "fstatfs\0"
407 "fstatfs64\0"
408 "ftruncate\0"
409 "ftruncate64\0"
410 "futimesat\0"
411 "getcwd\0"
412 "getdents\0"
413 "getdents64\0"
414 "getxattr\0"
415 "inotify_add_watch\0"
416 "inotify_init\0"
417 "inotify_init1\0"
418 "inotify_rm_watch\0"
419 "lgetxattr\0"
420 "link\0"
421 "linkat\0"
422 "listxattr\0"
423 "llistxattr\0"
424 "lremovexattr\0"
425 "lsetxattr\0"
426 "lstat\0"
427 "lstat64\0"
428 "mkdir\0"
429 "mkdirat\0"
430 "mknod\0"
431 "mknodat\0"
432 "mmap\0"
433 "mmap2\0"
434 "munmap\0"
435 "newfstatat\0"
436 "oldfstat\0"
437 "oldlstat\0"
438 "oldstat\0"
439 "open\0"
440 "openat\0"
441 "readlink\0"
442 "readlinkat\0"
443 "removexattr\0"
444 "rename\0"
445 "renameat\0"
446 "renameat2\0"
447 "rmdir\0"
448 "setxattr\0"
449 "stat\0"
450 "stat64\0"
451 "statfs\0"
452 "statfs64\0"
453 #ifdef __NR_statx
454 "statx\0"
455 #endif
456 "symlink\0"
457 "symlinkat\0"
458 "truncate\0"
459 "truncate64\0"
460 "unlink\0"
461 "unlinkat\0"
462 "utime\0"
463 "utimensat\0"
464 "utimes\0"
465 },
466 [SYSCALL_FILTER_SET_IO_EVENT] = {
467 .name = "@io-event",
468 .help = "Event loop system calls",
469 .value =
470 "_newselect\0"
471 "epoll_create\0"
472 "epoll_create1\0"
473 "epoll_ctl\0"
474 "epoll_ctl_old\0"
475 "epoll_pwait\0"
476 "epoll_wait\0"
477 "epoll_wait_old\0"
478 "eventfd\0"
479 "eventfd2\0"
480 "poll\0"
481 "ppoll\0"
482 "pselect6\0"
483 "select\0"
484 },
485 [SYSCALL_FILTER_SET_IPC] = {
486 .name = "@ipc",
487 .help = "SysV IPC, POSIX Message Queues or other IPC",
488 .value =
489 "ipc\0"
490 "memfd_create\0"
491 "mq_getsetattr\0"
492 "mq_notify\0"
493 "mq_open\0"
494 "mq_timedreceive\0"
495 "mq_timedsend\0"
496 "mq_unlink\0"
497 "msgctl\0"
498 "msgget\0"
499 "msgrcv\0"
500 "msgsnd\0"
501 "pipe\0"
502 "pipe2\0"
503 "process_vm_readv\0"
504 "process_vm_writev\0"
505 "semctl\0"
506 "semget\0"
507 "semop\0"
508 "semtimedop\0"
509 "shmat\0"
510 "shmctl\0"
511 "shmdt\0"
512 "shmget\0"
513 },
514 [SYSCALL_FILTER_SET_KEYRING] = {
515 .name = "@keyring",
516 .help = "Kernel keyring access",
517 .value =
518 "add_key\0"
519 "keyctl\0"
520 "request_key\0"
521 },
522 [SYSCALL_FILTER_SET_MEMLOCK] = {
523 .name = "@memlock",
524 .help = "Memory locking control",
525 .value =
526 "mlock\0"
527 "mlock2\0"
528 "mlockall\0"
529 "munlock\0"
530 "munlockall\0"
531 },
532 [SYSCALL_FILTER_SET_MODULE] = {
533 .name = "@module",
534 .help = "Loading and unloading of kernel modules",
535 .value =
536 "delete_module\0"
537 "finit_module\0"
538 "init_module\0"
539 },
540 [SYSCALL_FILTER_SET_MOUNT] = {
541 .name = "@mount",
542 .help = "Mounting and unmounting of file systems",
543 .value =
544 "chroot\0"
545 "mount\0"
546 "pivot_root\0"
547 "umount\0"
548 "umount2\0"
549 },
550 [SYSCALL_FILTER_SET_NETWORK_IO] = {
551 .name = "@network-io",
552 .help = "Network or Unix socket IO, should not be needed if not network facing",
553 .value =
554 "accept\0"
555 "accept4\0"
556 "bind\0"
557 "connect\0"
558 "getpeername\0"
559 "getsockname\0"
560 "getsockopt\0"
561 "listen\0"
562 "recv\0"
563 "recvfrom\0"
564 "recvmmsg\0"
565 "recvmsg\0"
566 "send\0"
567 "sendmmsg\0"
568 "sendmsg\0"
569 "sendto\0"
570 "setsockopt\0"
571 "shutdown\0"
572 "socket\0"
573 "socketcall\0"
574 "socketpair\0"
575 },
576 [SYSCALL_FILTER_SET_OBSOLETE] = {
577 /* some unknown even to libseccomp */
578 .name = "@obsolete",
579 .help = "Unusual, obsolete or unimplemented system calls",
580 .value =
581 "_sysctl\0"
582 "afs_syscall\0"
583 "bdflush\0"
584 "break\0"
585 "create_module\0"
586 "ftime\0"
587 "get_kernel_syms\0"
588 "getpmsg\0"
589 "gtty\0"
590 "idle\0"
591 "lock\0"
592 "mpx\0"
593 "prof\0"
594 "profil\0"
595 "putpmsg\0"
596 "query_module\0"
597 "security\0"
598 "sgetmask\0"
599 "ssetmask\0"
600 "stty\0"
601 "sysfs\0"
602 "tuxcall\0"
603 "ulimit\0"
604 "uselib\0"
605 "ustat\0"
606 "vserver\0"
607 },
608 [SYSCALL_FILTER_SET_PRIVILEGED] = {
609 .name = "@privileged",
610 .help = "All system calls which need super-user capabilities",
611 .value =
612 "@chown\0"
613 "@clock\0"
614 "@module\0"
615 "@raw-io\0"
616 "@reboot\0"
617 "@swap\0"
618 "_sysctl\0"
619 "acct\0"
620 "bpf\0"
621 "capset\0"
622 "chroot\0"
623 "fanotify_init\0"
624 "nfsservctl\0"
625 "open_by_handle_at\0"
626 "pivot_root\0"
627 "quotactl\0"
628 "setdomainname\0"
629 "setfsuid\0"
630 "setfsuid32\0"
631 "setgroups\0"
632 "setgroups32\0"
633 "sethostname\0"
634 "setresuid\0"
635 "setresuid32\0"
636 "setreuid\0"
637 "setreuid32\0"
638 "setuid\0" /* We list the explicit system calls here, as @setuid also includes setgid() which is not necessarily privileged */
639 "setuid32\0"
640 "vhangup\0"
641 },
642 [SYSCALL_FILTER_SET_PROCESS] = {
643 .name = "@process",
644 .help = "Process control, execution, namespaceing operations",
645 .value =
646 "arch_prctl\0"
647 "capget\0" /* Able to query arbitrary processes */
648 "clone\0"
649 "execveat\0"
650 "fork\0"
651 "getrusage\0"
652 "kill\0"
653 "prctl\0"
654 "rt_sigqueueinfo\0"
655 "rt_tgsigqueueinfo\0"
656 "setns\0"
657 "swapcontext\0" /* Some archs e.g. powerpc32 are using it to do userspace context switches */
658 "tgkill\0"
659 "times\0"
660 "tkill\0"
661 "unshare\0"
662 "vfork\0"
663 "wait4\0"
664 "waitid\0"
665 "waitpid\0"
666 },
667 [SYSCALL_FILTER_SET_RAW_IO] = {
668 .name = "@raw-io",
669 .help = "Raw I/O port access",
670 .value =
671 "ioperm\0"
672 "iopl\0"
673 "pciconfig_iobase\0"
674 "pciconfig_read\0"
675 "pciconfig_write\0"
676 #ifdef __NR_s390_pci_mmio_read
677 "s390_pci_mmio_read\0"
678 #endif
679 #ifdef __NR_s390_pci_mmio_write
680 "s390_pci_mmio_write\0"
681 #endif
682 },
683 [SYSCALL_FILTER_SET_REBOOT] = {
684 .name = "@reboot",
685 .help = "Reboot and reboot preparation/kexec",
686 .value =
687 "kexec_file_load\0"
688 "kexec_load\0"
689 "reboot\0"
690 },
691 [SYSCALL_FILTER_SET_RESOURCES] = {
692 .name = "@resources",
693 .help = "Alter resource settings",
694 .value =
695 "ioprio_set\0"
696 "mbind\0"
697 "migrate_pages\0"
698 "move_pages\0"
699 "nice\0"
700 "sched_setaffinity\0"
701 "sched_setattr\0"
702 "sched_setparam\0"
703 "sched_setscheduler\0"
704 "set_mempolicy\0"
705 "setpriority\0"
706 "setrlimit\0"
707 },
708 [SYSCALL_FILTER_SET_SETUID] = {
709 .name = "@setuid",
710 .help = "Operations for changing user/group credentials",
711 .value =
712 "setgid\0"
713 "setgid32\0"
714 "setgroups\0"
715 "setgroups32\0"
716 "setregid\0"
717 "setregid32\0"
718 "setresgid\0"
719 "setresgid32\0"
720 "setresuid\0"
721 "setresuid32\0"
722 "setreuid\0"
723 "setreuid32\0"
724 "setuid\0"
725 "setuid32\0"
726 },
727 [SYSCALL_FILTER_SET_SIGNAL] = {
728 .name = "@signal",
729 .help = "Process signal handling",
730 .value =
731 "rt_sigaction\0"
732 "rt_sigpending\0"
733 "rt_sigprocmask\0"
734 "rt_sigsuspend\0"
735 "rt_sigtimedwait\0"
736 "sigaction\0"
737 "sigaltstack\0"
738 "signal\0"
739 "signalfd\0"
740 "signalfd4\0"
741 "sigpending\0"
742 "sigprocmask\0"
743 "sigsuspend\0"
744 },
745 [SYSCALL_FILTER_SET_SWAP] = {
746 .name = "@swap",
747 .help = "Enable/disable swap devices",
748 .value =
749 "swapoff\0"
750 "swapon\0"
751 },
752 [SYSCALL_FILTER_SET_SYNC] = {
753 .name = "@sync",
754 .help = "Synchronize files and memory to storage",
755 .value =
756 "fdatasync\0"
757 "fsync\0"
758 "msync\0"
759 "sync\0"
760 "sync_file_range\0"
761 "syncfs\0"
762 },
763 [SYSCALL_FILTER_SET_SYSTEM_SERVICE] = {
764 .name = "@system-service",
765 .help = "General system service operations",
766 .value =
767 "@aio\0"
768 "@basic-io\0"
769 "@chown\0"
770 "@default\0"
771 "@file-system\0"
772 "@io-event\0"
773 "@ipc\0"
774 "@keyring\0"
775 "@memlock\0"
776 "@network-io\0"
777 "@process\0"
778 "@resources\0"
779 "@setuid\0"
780 "@signal\0"
781 "@sync\0"
782 "@timer\0"
783 "brk\0"
784 "capget\0"
785 "capset\0"
786 "copy_file_range\0"
787 "fadvise64\0"
788 "fadvise64_64\0"
789 "flock\0"
790 "get_mempolicy\0"
791 "getcpu\0"
792 "getpriority\0"
793 "getrandom\0"
794 "ioctl\0"
795 "ioprio_get\0"
796 "kcmp\0"
797 "madvise\0"
798 "mincore\0"
799 "mprotect\0"
800 "mremap\0"
801 "name_to_handle_at\0"
802 "oldolduname\0"
803 "olduname\0"
804 "personality\0"
805 "readahead\0"
806 "readdir\0"
807 "remap_file_pages\0"
808 "sched_get_priority_max\0"
809 "sched_get_priority_min\0"
810 "sched_getaffinity\0"
811 "sched_getattr\0"
812 "sched_getparam\0"
813 "sched_getscheduler\0"
814 "sched_rr_get_interval\0"
815 "sched_yield\0"
816 "sendfile\0"
817 "sendfile64\0"
818 "setfsgid\0"
819 "setfsgid32\0"
820 "setfsuid\0"
821 "setfsuid32\0"
822 "setpgid\0"
823 "setsid\0"
824 "splice\0"
825 "sysinfo\0"
826 "tee\0"
827 "umask\0"
828 "uname\0"
829 "userfaultfd\0"
830 "vmsplice\0"
831 },
832 [SYSCALL_FILTER_SET_TIMER] = {
833 .name = "@timer",
834 .help = "Schedule operations by time",
835 .value =
836 "alarm\0"
837 "getitimer\0"
838 "setitimer\0"
839 "timer_create\0"
840 "timer_delete\0"
841 "timer_getoverrun\0"
842 "timer_gettime\0"
843 "timer_settime\0"
844 "timerfd_create\0"
845 "timerfd_gettime\0"
846 "timerfd_settime\0"
847 "times\0"
848 },
849 };
850
851 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
852 unsigned i;
853
854 if (isempty(name) || name[0] != '@')
855 return NULL;
856
857 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
858 if (streq(syscall_filter_sets[i].name, name))
859 return syscall_filter_sets + i;
860
861 return NULL;
862 }
863
864 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude, bool log_missing);
865
866 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude, bool log_missing) {
867 assert(seccomp);
868 assert(name);
869
870 if (strv_contains(exclude, name))
871 return 0;
872
873 if (name[0] == '@') {
874 const SyscallFilterSet *other;
875
876 other = syscall_filter_set_find(name);
877 if (!other) {
878 log_debug("Filter set %s is not known!", name);
879 return -EINVAL;
880 }
881
882 return seccomp_add_syscall_filter_set(seccomp, other, action, exclude, log_missing);
883
884 } else {
885 int id, r;
886
887 id = seccomp_syscall_resolve_name(name);
888 if (id == __NR_SCMP_ERROR) {
889 if (log_missing)
890 log_debug("System call %s is not known, ignoring.", name);
891 return 0;
892 }
893
894 r = seccomp_rule_add_exact(seccomp, action, id, 0);
895 if (r < 0) {
896 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
897 bool ignore = r == -EDOM;
898
899 if (!ignore || log_missing)
900 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
901 name, id, ignore ? ", ignoring" : "");
902 if (!ignore)
903 return r;
904 }
905
906 return 0;
907 }
908 }
909
910 static int seccomp_add_syscall_filter_set(
911 scmp_filter_ctx seccomp,
912 const SyscallFilterSet *set,
913 uint32_t action,
914 char **exclude,
915 bool log_missing) {
916
917 const char *sys;
918 int r;
919
920 assert(seccomp);
921 assert(set);
922
923 NULSTR_FOREACH(sys, set->value) {
924 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude, log_missing);
925 if (r < 0)
926 return r;
927 }
928
929 return 0;
930 }
931
932 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action, bool log_missing) {
933 uint32_t arch;
934 int r;
935
936 assert(set);
937
938 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
939 * each local arch. */
940
941 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
942 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
943
944 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
945
946 r = seccomp_init_for_arch(&seccomp, arch, default_action);
947 if (r < 0)
948 return r;
949
950 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL, log_missing);
951 if (r < 0)
952 return log_debug_errno(r, "Failed to add filter set: %m");
953
954 r = seccomp_load(seccomp);
955 if (IN_SET(r, -EPERM, -EACCES))
956 return r;
957 if (r < 0)
958 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
959 }
960
961 return 0;
962 }
963
964 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action, bool log_missing) {
965 uint32_t arch;
966 int r;
967
968 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
969 * SyscallFilterSet* table. */
970
971 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
972 return 0;
973
974 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
975 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
976 Iterator i;
977 void *syscall_id, *val;
978
979 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
980
981 r = seccomp_init_for_arch(&seccomp, arch, default_action);
982 if (r < 0)
983 return r;
984
985 HASHMAP_FOREACH_KEY(val, syscall_id, set, i) {
986 uint32_t a = action;
987 int id = PTR_TO_INT(syscall_id) - 1;
988 int error = PTR_TO_INT(val);
989
990 if (action != SCMP_ACT_ALLOW && error >= 0)
991 a = SCMP_ACT_ERRNO(error);
992
993 r = seccomp_rule_add_exact(seccomp, a, id, 0);
994 if (r < 0) {
995 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
996 _cleanup_free_ char *n = NULL;
997 bool ignore;
998
999 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, id);
1000 ignore = r == -EDOM;
1001 if (!ignore || log_missing)
1002 log_debug_errno(r, "Failed to add rule for system call %s() / %d%s: %m",
1003 strna(n), id, ignore ? ", ignoring" : "");
1004 if (!ignore)
1005 return r;
1006 }
1007 }
1008
1009 r = seccomp_load(seccomp);
1010 if (IN_SET(r, -EPERM, -EACCES))
1011 return r;
1012 if (r < 0)
1013 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1014 }
1015
1016 return 0;
1017 }
1018
1019 int seccomp_parse_syscall_filter_full(
1020 const char *name,
1021 int errno_num,
1022 Hashmap *filter,
1023 SeccompParseFlags flags,
1024 const char *unit,
1025 const char *filename,
1026 unsigned line) {
1027
1028 int r;
1029
1030 assert(name);
1031 assert(filter);
1032
1033 if (name[0] == '@') {
1034 const SyscallFilterSet *set;
1035 const char *i;
1036
1037 set = syscall_filter_set_find(name);
1038 if (!set) {
1039 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1040 return -EINVAL;
1041
1042 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1043 "Unknown system call group, ignoring: %s", name);
1044 return 0;
1045 }
1046
1047 NULSTR_FOREACH(i, set->value) {
1048 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
1049 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
1050 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
1051 * about them. */
1052 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
1053 if (r < 0)
1054 return r;
1055 }
1056 } else {
1057 int id;
1058
1059 id = seccomp_syscall_resolve_name(name);
1060 if (id == __NR_SCMP_ERROR) {
1061 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
1062 return -EINVAL;
1063
1064 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
1065 "Failed to parse system call, ignoring: %s", name);
1066 return 0;
1067 }
1068
1069 /* If we previously wanted to forbid a syscall and now
1070 * we want to allow it, then remove it from the list. */
1071 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
1072 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
1073 if (r < 0)
1074 switch (r) {
1075 case -ENOMEM:
1076 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
1077 case -EEXIST:
1078 assert_se(hashmap_update(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num)) == 0);
1079 break;
1080 default:
1081 return r;
1082 }
1083 } else
1084 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1085 }
1086
1087 return 0;
1088 }
1089
1090 int seccomp_restrict_namespaces(unsigned long retain) {
1091 uint32_t arch;
1092 int r;
1093
1094 if (DEBUG_LOGGING) {
1095 _cleanup_free_ char *s = NULL;
1096
1097 (void) namespace_flags_to_string(retain, &s);
1098 log_debug("Restricting namespace to: %s.", strna(s));
1099 }
1100
1101 /* NOOP? */
1102 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1103 return 0;
1104
1105 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1106 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1107 unsigned i;
1108
1109 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1110
1111 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1112 if (r < 0)
1113 return r;
1114
1115 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1116 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1117 * altogether. */
1118 r = seccomp_rule_add_exact(
1119 seccomp,
1120 SCMP_ACT_ERRNO(EPERM),
1121 SCMP_SYS(setns),
1122 0);
1123 else
1124 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1125 * special invocation with a zero flags argument, right here. */
1126 r = seccomp_rule_add_exact(
1127 seccomp,
1128 SCMP_ACT_ERRNO(EPERM),
1129 SCMP_SYS(setns),
1130 1,
1131 SCMP_A1(SCMP_CMP_EQ, 0));
1132 if (r < 0) {
1133 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1134 continue;
1135 }
1136
1137 for (i = 0; namespace_flag_map[i].name; i++) {
1138 unsigned long f;
1139
1140 f = namespace_flag_map[i].flag;
1141 if ((retain & f) == f) {
1142 log_debug("Permitting %s.", namespace_flag_map[i].name);
1143 continue;
1144 }
1145
1146 log_debug("Blocking %s.", namespace_flag_map[i].name);
1147
1148 r = seccomp_rule_add_exact(
1149 seccomp,
1150 SCMP_ACT_ERRNO(EPERM),
1151 SCMP_SYS(unshare),
1152 1,
1153 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1154 if (r < 0) {
1155 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1156 break;
1157 }
1158
1159 /* On s390/s390x the first two parameters to clone are switched */
1160 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1161 r = seccomp_rule_add_exact(
1162 seccomp,
1163 SCMP_ACT_ERRNO(EPERM),
1164 SCMP_SYS(clone),
1165 1,
1166 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1167 else
1168 r = seccomp_rule_add_exact(
1169 seccomp,
1170 SCMP_ACT_ERRNO(EPERM),
1171 SCMP_SYS(clone),
1172 1,
1173 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1174 if (r < 0) {
1175 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1176 break;
1177 }
1178
1179 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1180 r = seccomp_rule_add_exact(
1181 seccomp,
1182 SCMP_ACT_ERRNO(EPERM),
1183 SCMP_SYS(setns),
1184 1,
1185 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1186 if (r < 0) {
1187 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1188 break;
1189 }
1190 }
1191 }
1192 if (r < 0)
1193 continue;
1194
1195 r = seccomp_load(seccomp);
1196 if (IN_SET(r, -EPERM, -EACCES))
1197 return r;
1198 if (r < 0)
1199 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 }
1201
1202 return 0;
1203 }
1204
1205 int seccomp_protect_sysctl(void) {
1206 uint32_t arch;
1207 int r;
1208
1209 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1210 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1211
1212 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1213
1214 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1215 /* No _sysctl syscall */
1216 continue;
1217
1218 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1219 if (r < 0)
1220 return r;
1221
1222 r = seccomp_rule_add_exact(
1223 seccomp,
1224 SCMP_ACT_ERRNO(EPERM),
1225 SCMP_SYS(_sysctl),
1226 0);
1227 if (r < 0) {
1228 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1229 continue;
1230 }
1231
1232 r = seccomp_load(seccomp);
1233 if (IN_SET(r, -EPERM, -EACCES))
1234 return r;
1235 if (r < 0)
1236 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1237 }
1238
1239 return 0;
1240 }
1241
1242 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1243 uint32_t arch;
1244 int r;
1245
1246 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1247 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1248 bool supported;
1249 Iterator i;
1250
1251 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1252
1253 switch (arch) {
1254
1255 case SCMP_ARCH_X86_64:
1256 case SCMP_ARCH_X32:
1257 case SCMP_ARCH_ARM:
1258 case SCMP_ARCH_AARCH64:
1259 case SCMP_ARCH_PPC:
1260 case SCMP_ARCH_PPC64:
1261 case SCMP_ARCH_PPC64LE:
1262 case SCMP_ARCH_MIPSEL64N32:
1263 case SCMP_ARCH_MIPS64N32:
1264 case SCMP_ARCH_MIPSEL64:
1265 case SCMP_ARCH_MIPS64:
1266 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1267 supported = true;
1268 break;
1269
1270 case SCMP_ARCH_S390:
1271 case SCMP_ARCH_S390X:
1272 case SCMP_ARCH_X86:
1273 case SCMP_ARCH_MIPSEL:
1274 case SCMP_ARCH_MIPS:
1275 default:
1276 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1277 * don't know */
1278 supported = false;
1279 break;
1280 }
1281
1282 if (!supported)
1283 continue;
1284
1285 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1286 if (r < 0)
1287 return r;
1288
1289 if (whitelist) {
1290 int af, first = 0, last = 0;
1291 void *afp;
1292
1293 /* If this is a whitelist, we first block the address families that are out of range and then
1294 * everything that is not in the set. First, we find the lowest and highest address family in
1295 * the set. */
1296
1297 SET_FOREACH(afp, address_families, i) {
1298 af = PTR_TO_INT(afp);
1299
1300 if (af <= 0 || af >= af_max())
1301 continue;
1302
1303 if (first == 0 || af < first)
1304 first = af;
1305
1306 if (last == 0 || af > last)
1307 last = af;
1308 }
1309
1310 assert((first == 0) == (last == 0));
1311
1312 if (first == 0) {
1313
1314 /* No entries in the valid range, block everything */
1315 r = seccomp_rule_add_exact(
1316 seccomp,
1317 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1318 SCMP_SYS(socket),
1319 0);
1320 if (r < 0) {
1321 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1322 continue;
1323 }
1324
1325 } else {
1326
1327 /* Block everything below the first entry */
1328 r = seccomp_rule_add_exact(
1329 seccomp,
1330 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1331 SCMP_SYS(socket),
1332 1,
1333 SCMP_A0(SCMP_CMP_LT, first));
1334 if (r < 0) {
1335 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1336 continue;
1337 }
1338
1339 /* Block everything above the last entry */
1340 r = seccomp_rule_add_exact(
1341 seccomp,
1342 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1343 SCMP_SYS(socket),
1344 1,
1345 SCMP_A0(SCMP_CMP_GT, last));
1346 if (r < 0) {
1347 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1348 continue;
1349 }
1350
1351 /* Block everything between the first and last entry */
1352 for (af = 1; af < af_max(); af++) {
1353
1354 if (set_contains(address_families, INT_TO_PTR(af)))
1355 continue;
1356
1357 r = seccomp_rule_add_exact(
1358 seccomp,
1359 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1360 SCMP_SYS(socket),
1361 1,
1362 SCMP_A0(SCMP_CMP_EQ, af));
1363 if (r < 0)
1364 break;
1365 }
1366 if (r < 0) {
1367 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1368 continue;
1369 }
1370 }
1371
1372 } else {
1373 void *af;
1374
1375 /* If this is a blacklist, then generate one rule for
1376 * each address family that are then combined in OR
1377 * checks. */
1378
1379 SET_FOREACH(af, address_families, i) {
1380
1381 r = seccomp_rule_add_exact(
1382 seccomp,
1383 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1384 SCMP_SYS(socket),
1385 1,
1386 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1387 if (r < 0)
1388 break;
1389 }
1390 if (r < 0) {
1391 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1392 continue;
1393 }
1394 }
1395
1396 r = seccomp_load(seccomp);
1397 if (IN_SET(r, -EPERM, -EACCES))
1398 return r;
1399 if (r < 0)
1400 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1401 }
1402
1403 return 0;
1404 }
1405
1406 int seccomp_restrict_realtime(void) {
1407 static const int permitted_policies[] = {
1408 SCHED_OTHER,
1409 SCHED_BATCH,
1410 SCHED_IDLE,
1411 };
1412
1413 int r, max_policy = 0;
1414 uint32_t arch;
1415 unsigned i;
1416
1417 /* Determine the highest policy constant we want to allow */
1418 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1419 if (permitted_policies[i] > max_policy)
1420 max_policy = permitted_policies[i];
1421
1422 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1423 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1424 int p;
1425
1426 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1427
1428 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1429 if (r < 0)
1430 return r;
1431
1432 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1433 * whitelist. */
1434 for (p = 0; p < max_policy; p++) {
1435 bool good = false;
1436
1437 /* Check if this is in the whitelist. */
1438 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1439 if (permitted_policies[i] == p) {
1440 good = true;
1441 break;
1442 }
1443
1444 if (good)
1445 continue;
1446
1447 /* Deny this policy */
1448 r = seccomp_rule_add_exact(
1449 seccomp,
1450 SCMP_ACT_ERRNO(EPERM),
1451 SCMP_SYS(sched_setscheduler),
1452 1,
1453 SCMP_A1(SCMP_CMP_EQ, p));
1454 if (r < 0) {
1455 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1456 continue;
1457 }
1458 }
1459
1460 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1461 * unsigned here, hence no need no check for < 0 values. */
1462 r = seccomp_rule_add_exact(
1463 seccomp,
1464 SCMP_ACT_ERRNO(EPERM),
1465 SCMP_SYS(sched_setscheduler),
1466 1,
1467 SCMP_A1(SCMP_CMP_GT, max_policy));
1468 if (r < 0) {
1469 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1470 continue;
1471 }
1472
1473 r = seccomp_load(seccomp);
1474 if (IN_SET(r, -EPERM, -EACCES))
1475 return r;
1476 if (r < 0)
1477 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1478 }
1479
1480 return 0;
1481 }
1482
1483 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1484 uint32_t arch,
1485 int nr,
1486 unsigned arg_cnt,
1487 const struct scmp_arg_cmp arg) {
1488 int r;
1489
1490 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1491 if (r < 0) {
1492 _cleanup_free_ char *n = NULL;
1493
1494 n = seccomp_syscall_resolve_num_arch(arch, nr);
1495 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1496 strna(n),
1497 seccomp_arch_to_string(arch));
1498 }
1499
1500 return r;
1501 }
1502
1503 /* For known architectures, check that syscalls are indeed defined or not. */
1504 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1505 assert_cc(SCMP_SYS(shmget) > 0);
1506 assert_cc(SCMP_SYS(shmat) > 0);
1507 assert_cc(SCMP_SYS(shmdt) > 0);
1508 #elif defined(__i386__) || defined(__powerpc64__)
1509 assert_cc(SCMP_SYS(shmget) < 0);
1510 assert_cc(SCMP_SYS(shmat) < 0);
1511 assert_cc(SCMP_SYS(shmdt) < 0);
1512 #endif
1513
1514 int seccomp_memory_deny_write_execute(void) {
1515
1516 uint32_t arch;
1517 int r;
1518
1519 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1520 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1521 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1522
1523 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1524
1525 switch (arch) {
1526
1527 case SCMP_ARCH_X86:
1528 filter_syscall = SCMP_SYS(mmap2);
1529 block_syscall = SCMP_SYS(mmap);
1530 break;
1531
1532 case SCMP_ARCH_PPC:
1533 case SCMP_ARCH_PPC64:
1534 case SCMP_ARCH_PPC64LE:
1535 filter_syscall = SCMP_SYS(mmap);
1536
1537 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1538 * We ignore that here, which means there's still a way to get writable/executable
1539 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1540
1541 break;
1542
1543 case SCMP_ARCH_ARM:
1544 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1545 shmat_syscall = SCMP_SYS(shmat);
1546 break;
1547
1548 case SCMP_ARCH_X86_64:
1549 case SCMP_ARCH_X32:
1550 case SCMP_ARCH_AARCH64:
1551 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1552 shmat_syscall = SCMP_SYS(shmat);
1553 break;
1554
1555 /* Please add more definitions here, if you port systemd to other architectures! */
1556
1557 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1558 #warning "Consider adding the right mmap() syscall definitions here!"
1559 #endif
1560 }
1561
1562 /* Can't filter mmap() on this arch, then skip it */
1563 if (filter_syscall == 0)
1564 continue;
1565
1566 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1567 if (r < 0)
1568 return r;
1569
1570 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1571 1,
1572 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1573 if (r < 0)
1574 continue;
1575
1576 if (block_syscall != 0) {
1577 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1578 if (r < 0)
1579 continue;
1580 }
1581
1582 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1583 1,
1584 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1585 if (r < 0)
1586 continue;
1587
1588 #ifdef __NR_pkey_mprotect
1589 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1590 1,
1591 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1592 if (r < 0)
1593 continue;
1594 #endif
1595
1596 if (shmat_syscall != 0) {
1597 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1598 1,
1599 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1600 if (r < 0)
1601 continue;
1602 }
1603
1604 r = seccomp_load(seccomp);
1605 if (IN_SET(r, -EPERM, -EACCES))
1606 return r;
1607 if (r < 0)
1608 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1609 }
1610
1611 return 0;
1612 }
1613
1614 int seccomp_restrict_archs(Set *archs) {
1615 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1616 Iterator i;
1617 void *id;
1618 int r;
1619
1620 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1621 * list.
1622 *
1623 * There are some qualifications. However the most important use is to stop processes from bypassing
1624 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1625 * in a non-native architecture. There are no holes in this use case, at least so far. */
1626
1627 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1628 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1629 * to run a program with the restrictions applied. */
1630 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1631 if (!seccomp)
1632 return -ENOMEM;
1633
1634 SET_FOREACH(id, archs, i) {
1635 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1636 if (r < 0 && r != -EEXIST)
1637 return r;
1638 }
1639
1640 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1641 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1642 * The important thing is that you can block the old 32-bit x86 syscalls.
1643 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1644
1645 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1646 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1647
1648 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1649 if (r < 0 && r != -EEXIST)
1650 return r;
1651 }
1652
1653 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1654 if (r < 0)
1655 return r;
1656
1657 r = seccomp_load(seccomp);
1658 if (IN_SET(r, -EPERM, -EACCES))
1659 return r;
1660 if (r < 0)
1661 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1662
1663 return 0;
1664 }
1665
1666 int parse_syscall_archs(char **l, Set **archs) {
1667 _cleanup_set_free_ Set *_archs;
1668 char **s;
1669 int r;
1670
1671 assert(l);
1672 assert(archs);
1673
1674 r = set_ensure_allocated(&_archs, NULL);
1675 if (r < 0)
1676 return r;
1677
1678 STRV_FOREACH(s, l) {
1679 uint32_t a;
1680
1681 r = seccomp_arch_from_string(*s, &a);
1682 if (r < 0)
1683 return -EINVAL;
1684
1685 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1686 if (r < 0)
1687 return -ENOMEM;
1688 }
1689
1690 *archs = TAKE_PTR(_archs);
1691
1692 return 0;
1693 }
1694
1695 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1696 const char *i;
1697 int r;
1698
1699 assert(set);
1700
1701 NULSTR_FOREACH(i, set->value) {
1702
1703 if (i[0] == '@') {
1704 const SyscallFilterSet *more;
1705
1706 more = syscall_filter_set_find(i);
1707 if (!more)
1708 return -ENXIO;
1709
1710 r = seccomp_filter_set_add(filter, add, more);
1711 if (r < 0)
1712 return r;
1713 } else {
1714 int id;
1715
1716 id = seccomp_syscall_resolve_name(i);
1717 if (id == __NR_SCMP_ERROR) {
1718 log_debug("Couldn't resolve system call, ignoring: %s", i);
1719 continue;
1720 }
1721
1722 if (add) {
1723 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1724 if (r < 0)
1725 return r;
1726 } else
1727 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1728 }
1729 }
1730
1731 return 0;
1732 }
1733
1734 int seccomp_lock_personality(unsigned long personality) {
1735 uint32_t arch;
1736 int r;
1737
1738 if (personality >= PERSONALITY_INVALID)
1739 return -EINVAL;
1740
1741 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1742 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1743
1744 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1745 if (r < 0)
1746 return r;
1747
1748 r = seccomp_rule_add_exact(
1749 seccomp,
1750 SCMP_ACT_ERRNO(EPERM),
1751 SCMP_SYS(personality),
1752 1,
1753 SCMP_A0(SCMP_CMP_NE, personality));
1754 if (r < 0) {
1755 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1756 continue;
1757 }
1758
1759 r = seccomp_load(seccomp);
1760 if (IN_SET(r, -EPERM, -EACCES))
1761 return r;
1762 if (r < 0)
1763 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1764 }
1765
1766 return 0;
1767 }