]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
tree-wide: drop 'This file is part of systemd' blurb
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2 /***
3 Copyright 2014 Lennart Poettering
4 ***/
5
6 #include <errno.h>
7 #include <linux/seccomp.h>
8 #include <seccomp.h>
9 #include <stddef.h>
10 #include <sys/mman.h>
11 #include <sys/prctl.h>
12 #include <sys/shm.h>
13
14 #include "af-list.h"
15 #include "alloc-util.h"
16 #include "macro.h"
17 #include "nsflags.h"
18 #include "process-util.h"
19 #include "seccomp-util.h"
20 #include "set.h"
21 #include "string-util.h"
22 #include "strv.h"
23 #include "util.h"
24 #include "errno-list.h"
25
26 const uint32_t seccomp_local_archs[] = {
27
28 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
29
30 #if defined(__x86_64__) && defined(__ILP32__)
31 SCMP_ARCH_X86,
32 SCMP_ARCH_X86_64,
33 SCMP_ARCH_X32, /* native */
34 #elif defined(__x86_64__) && !defined(__ILP32__)
35 SCMP_ARCH_X86,
36 SCMP_ARCH_X32,
37 SCMP_ARCH_X86_64, /* native */
38 #elif defined(__i386__)
39 SCMP_ARCH_X86,
40 #elif defined(__aarch64__)
41 SCMP_ARCH_ARM,
42 SCMP_ARCH_AARCH64, /* native */
43 #elif defined(__arm__)
44 SCMP_ARCH_ARM,
45 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
46 SCMP_ARCH_MIPSEL,
47 SCMP_ARCH_MIPS, /* native */
48 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
49 SCMP_ARCH_MIPS,
50 SCMP_ARCH_MIPSEL, /* native */
51 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
52 SCMP_ARCH_MIPSEL,
53 SCMP_ARCH_MIPS,
54 SCMP_ARCH_MIPSEL64N32,
55 SCMP_ARCH_MIPS64N32,
56 SCMP_ARCH_MIPSEL64,
57 SCMP_ARCH_MIPS64, /* native */
58 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
59 SCMP_ARCH_MIPS,
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS64N32,
62 SCMP_ARCH_MIPSEL64N32,
63 SCMP_ARCH_MIPS64,
64 SCMP_ARCH_MIPSEL64, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64,
69 SCMP_ARCH_MIPS64,
70 SCMP_ARCH_MIPSEL64N32,
71 SCMP_ARCH_MIPS64N32, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64,
76 SCMP_ARCH_MIPSEL64,
77 SCMP_ARCH_MIPS64N32,
78 SCMP_ARCH_MIPSEL64N32, /* native */
79 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
80 SCMP_ARCH_PPC,
81 SCMP_ARCH_PPC64LE,
82 SCMP_ARCH_PPC64, /* native */
83 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
84 SCMP_ARCH_PPC,
85 SCMP_ARCH_PPC64,
86 SCMP_ARCH_PPC64LE, /* native */
87 #elif defined(__powerpc__)
88 SCMP_ARCH_PPC,
89 #elif defined(__s390x__)
90 SCMP_ARCH_S390,
91 SCMP_ARCH_S390X, /* native */
92 #elif defined(__s390__)
93 SCMP_ARCH_S390,
94 #endif
95 (uint32_t) -1
96 };
97
98 const char* seccomp_arch_to_string(uint32_t c) {
99 /* Maintain order used in <seccomp.h>.
100 *
101 * Names used here should be the same as those used for ConditionArchitecture=,
102 * except for "subarchitectures" like x32. */
103
104 switch(c) {
105 case SCMP_ARCH_NATIVE:
106 return "native";
107 case SCMP_ARCH_X86:
108 return "x86";
109 case SCMP_ARCH_X86_64:
110 return "x86-64";
111 case SCMP_ARCH_X32:
112 return "x32";
113 case SCMP_ARCH_ARM:
114 return "arm";
115 case SCMP_ARCH_AARCH64:
116 return "arm64";
117 case SCMP_ARCH_MIPS:
118 return "mips";
119 case SCMP_ARCH_MIPS64:
120 return "mips64";
121 case SCMP_ARCH_MIPS64N32:
122 return "mips64-n32";
123 case SCMP_ARCH_MIPSEL:
124 return "mips-le";
125 case SCMP_ARCH_MIPSEL64:
126 return "mips64-le";
127 case SCMP_ARCH_MIPSEL64N32:
128 return "mips64-le-n32";
129 case SCMP_ARCH_PPC:
130 return "ppc";
131 case SCMP_ARCH_PPC64:
132 return "ppc64";
133 case SCMP_ARCH_PPC64LE:
134 return "ppc64-le";
135 case SCMP_ARCH_S390:
136 return "s390";
137 case SCMP_ARCH_S390X:
138 return "s390x";
139 default:
140 return NULL;
141 }
142 }
143
144 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
145 if (!n)
146 return -EINVAL;
147
148 assert(ret);
149
150 if (streq(n, "native"))
151 *ret = SCMP_ARCH_NATIVE;
152 else if (streq(n, "x86"))
153 *ret = SCMP_ARCH_X86;
154 else if (streq(n, "x86-64"))
155 *ret = SCMP_ARCH_X86_64;
156 else if (streq(n, "x32"))
157 *ret = SCMP_ARCH_X32;
158 else if (streq(n, "arm"))
159 *ret = SCMP_ARCH_ARM;
160 else if (streq(n, "arm64"))
161 *ret = SCMP_ARCH_AARCH64;
162 else if (streq(n, "mips"))
163 *ret = SCMP_ARCH_MIPS;
164 else if (streq(n, "mips64"))
165 *ret = SCMP_ARCH_MIPS64;
166 else if (streq(n, "mips64-n32"))
167 *ret = SCMP_ARCH_MIPS64N32;
168 else if (streq(n, "mips-le"))
169 *ret = SCMP_ARCH_MIPSEL;
170 else if (streq(n, "mips64-le"))
171 *ret = SCMP_ARCH_MIPSEL64;
172 else if (streq(n, "mips64-le-n32"))
173 *ret = SCMP_ARCH_MIPSEL64N32;
174 else if (streq(n, "ppc"))
175 *ret = SCMP_ARCH_PPC;
176 else if (streq(n, "ppc64"))
177 *ret = SCMP_ARCH_PPC64;
178 else if (streq(n, "ppc64-le"))
179 *ret = SCMP_ARCH_PPC64LE;
180 else if (streq(n, "s390"))
181 *ret = SCMP_ARCH_S390;
182 else if (streq(n, "s390x"))
183 *ret = SCMP_ARCH_S390X;
184 else
185 return -EINVAL;
186
187 return 0;
188 }
189
190 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
191 scmp_filter_ctx seccomp;
192 int r;
193
194 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
195 * any others. Also, turns off the NNP fiddling. */
196
197 seccomp = seccomp_init(default_action);
198 if (!seccomp)
199 return -ENOMEM;
200
201 if (arch != SCMP_ARCH_NATIVE &&
202 arch != seccomp_arch_native()) {
203
204 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
205 if (r < 0)
206 goto finish;
207
208 r = seccomp_arch_add(seccomp, arch);
209 if (r < 0)
210 goto finish;
211
212 assert(seccomp_arch_exist(seccomp, arch) >= 0);
213 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
214 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
215 } else {
216 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
217 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
218 }
219
220 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
221 if (r < 0)
222 goto finish;
223
224 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
225 if (r < 0)
226 goto finish;
227
228 *ret = seccomp;
229 return 0;
230
231 finish:
232 seccomp_release(seccomp);
233 return r;
234 }
235
/* Returns true if querying the seccomp mode of the calling process via prctl(PR_GET_SECCOMP) does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);

        return mode >= 0;
}
239
/* Probes for seccomp filter mode by requesting it with a NULL filter program. Filter mode is considered
 * available only if that request fails, and fails with EFAULT specifically. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
244
/* Returns true if both basic seccomp and seccomp filter mode are available. The probes are run on the first
 * call only; the outcome is kept in a static variable and returned on all later calls. */
bool is_seccomp_available(void) {
        static int cached_enabled = -1;

        if (cached_enabled >= 0)
                return cached_enabled;

        cached_enabled = is_basic_seccomp_available() && is_seccomp_filter_available();

        return cached_enabled;
}
255
256 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
257 [SYSCALL_FILTER_SET_DEFAULT] = {
258 .name = "@default",
259 .help = "System calls that are always permitted",
260 .value =
261 "clock_getres\0"
262 "clock_gettime\0"
263 "clock_nanosleep\0"
264 "execve\0"
265 "exit\0"
266 "exit_group\0"
267 "futex\0"
268 "get_robust_list\0"
269 "get_thread_area\0"
270 "getegid\0"
271 "getegid32\0"
272 "geteuid\0"
273 "geteuid32\0"
274 "getgid\0"
275 "getgid32\0"
276 "getgroups\0"
277 "getgroups32\0"
278 "getpgid\0"
279 "getpgrp\0"
280 "getpid\0"
281 "getppid\0"
282 "getresgid\0"
283 "getresgid32\0"
284 "getresuid\0"
285 "getresuid32\0"
286 "getrlimit\0" /* make sure processes can query stack size and such */
287 "getsid\0"
288 "gettid\0"
289 "gettimeofday\0"
290 "getuid\0"
291 "getuid32\0"
292 "membarrier\0"
293 "nanosleep\0"
294 "pause\0"
295 "prlimit64\0"
296 "restart_syscall\0"
297 "rt_sigreturn\0"
298 "sched_yield\0"
299 "set_robust_list\0"
300 "set_thread_area\0"
301 "set_tid_address\0"
302 "set_tls\0"
303 "sigreturn\0"
304 "time\0"
305 "ugetrlimit\0"
306 },
307 [SYSCALL_FILTER_SET_AIO] = {
308 .name = "@aio",
309 .help = "Asynchronous IO",
310 .value =
311 "io_cancel\0"
312 "io_destroy\0"
313 "io_getevents\0"
314 "io_setup\0"
315 "io_submit\0"
316 },
317 [SYSCALL_FILTER_SET_BASIC_IO] = {
318 .name = "@basic-io",
319 .help = "Basic IO",
320 .value =
321 "_llseek\0"
322 "close\0"
323 "dup\0"
324 "dup2\0"
325 "dup3\0"
326 "lseek\0"
327 "pread64\0"
328 "preadv\0"
329 "preadv2\0"
330 "pwrite64\0"
331 "pwritev\0"
332 "pwritev2\0"
333 "read\0"
334 "readv\0"
335 "write\0"
336 "writev\0"
337 },
338 [SYSCALL_FILTER_SET_CHOWN] = {
339 .name = "@chown",
340 .help = "Change ownership of files and directories",
341 .value =
342 "chown\0"
343 "chown32\0"
344 "fchown\0"
345 "fchown32\0"
346 "fchownat\0"
347 "lchown\0"
348 "lchown32\0"
349 },
350 [SYSCALL_FILTER_SET_CLOCK] = {
351 .name = "@clock",
352 .help = "Change the system time",
353 .value =
354 "adjtimex\0"
355 "clock_adjtime\0"
356 "clock_settime\0"
357 "settimeofday\0"
358 "stime\0"
359 },
360 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
361 .name = "@cpu-emulation",
362 .help = "System calls for CPU emulation functionality",
363 .value =
364 "modify_ldt\0"
365 "subpage_prot\0"
366 "switch_endian\0"
367 "vm86\0"
368 "vm86old\0"
369 },
370 [SYSCALL_FILTER_SET_DEBUG] = {
371 .name = "@debug",
372 .help = "Debugging, performance monitoring and tracing functionality",
373 .value =
374 "lookup_dcookie\0"
375 "perf_event_open\0"
376 "process_vm_readv\0"
377 "process_vm_writev\0"
378 "ptrace\0"
379 "rtas\0"
380 #ifdef __NR_s390_runtime_instr
381 "s390_runtime_instr\0"
382 #endif
383 "sys_debug_setcontext\0"
384 },
385 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
386 .name = "@file-system",
387 .help = "File system operations",
388 .value =
389 "access\0"
390 "chdir\0"
391 "chmod\0"
392 "close\0"
393 "creat\0"
394 "faccessat\0"
395 "fallocate\0"
396 "fchdir\0"
397 "fchmod\0"
398 "fchmodat\0"
399 "fcntl\0"
400 "fcntl64\0"
401 "fgetxattr\0"
402 "flistxattr\0"
403 "fremovexattr\0"
404 "fsetxattr\0"
405 "fstat\0"
406 "fstat64\0"
407 "fstatat64\0"
408 "fstatfs\0"
409 "fstatfs64\0"
410 "ftruncate\0"
411 "ftruncate64\0"
412 "futimesat\0"
413 "getcwd\0"
414 "getdents\0"
415 "getdents64\0"
416 "getxattr\0"
417 "inotify_add_watch\0"
418 "inotify_init\0"
419 "inotify_init1\0"
420 "inotify_rm_watch\0"
421 "lgetxattr\0"
422 "link\0"
423 "linkat\0"
424 "listxattr\0"
425 "llistxattr\0"
426 "lremovexattr\0"
427 "lsetxattr\0"
428 "lstat\0"
429 "lstat64\0"
430 "mkdir\0"
431 "mkdirat\0"
432 "mknod\0"
433 "mknodat\0"
434 "mmap\0"
435 "mmap2\0"
436 "munmap\0"
437 "newfstatat\0"
438 "oldfstat\0"
439 "oldlstat\0"
440 "oldstat\0"
441 "open\0"
442 "openat\0"
443 "readlink\0"
444 "readlinkat\0"
445 "removexattr\0"
446 "rename\0"
447 "renameat\0"
448 "renameat2\0"
449 "rmdir\0"
450 "setxattr\0"
451 "stat\0"
452 "stat64\0"
453 "statfs\0"
454 "statfs64\0"
455 #ifdef __NR_statx
456 "statx\0"
457 #endif
458 "symlink\0"
459 "symlinkat\0"
460 "truncate\0"
461 "truncate64\0"
462 "unlink\0"
463 "unlinkat\0"
464 "utime\0"
465 "utimensat\0"
466 "utimes\0"
467 },
468 [SYSCALL_FILTER_SET_IO_EVENT] = {
469 .name = "@io-event",
470 .help = "Event loop system calls",
471 .value =
472 "_newselect\0"
473 "epoll_create\0"
474 "epoll_create1\0"
475 "epoll_ctl\0"
476 "epoll_ctl_old\0"
477 "epoll_pwait\0"
478 "epoll_wait\0"
479 "epoll_wait_old\0"
480 "eventfd\0"
481 "eventfd2\0"
482 "poll\0"
483 "ppoll\0"
484 "pselect6\0"
485 "select\0"
486 },
487 [SYSCALL_FILTER_SET_IPC] = {
488 .name = "@ipc",
489 .help = "SysV IPC, POSIX Message Queues or other IPC",
490 .value =
491 "ipc\0"
492 "memfd_create\0"
493 "mq_getsetattr\0"
494 "mq_notify\0"
495 "mq_open\0"
496 "mq_timedreceive\0"
497 "mq_timedsend\0"
498 "mq_unlink\0"
499 "msgctl\0"
500 "msgget\0"
501 "msgrcv\0"
502 "msgsnd\0"
503 "pipe\0"
504 "pipe2\0"
505 "process_vm_readv\0"
506 "process_vm_writev\0"
507 "semctl\0"
508 "semget\0"
509 "semop\0"
510 "semtimedop\0"
511 "shmat\0"
512 "shmctl\0"
513 "shmdt\0"
514 "shmget\0"
515 },
516 [SYSCALL_FILTER_SET_KEYRING] = {
517 .name = "@keyring",
518 .help = "Kernel keyring access",
519 .value =
520 "add_key\0"
521 "keyctl\0"
522 "request_key\0"
523 },
524 [SYSCALL_FILTER_SET_MEMLOCK] = {
525 .name = "@memlock",
526 .help = "Memory locking control",
527 .value =
528 "mlock\0"
529 "mlock2\0"
530 "mlockall\0"
531 "munlock\0"
532 "munlockall\0"
533 },
534 [SYSCALL_FILTER_SET_MODULE] = {
535 .name = "@module",
536 .help = "Loading and unloading of kernel modules",
537 .value =
538 "delete_module\0"
539 "finit_module\0"
540 "init_module\0"
541 },
542 [SYSCALL_FILTER_SET_MOUNT] = {
543 .name = "@mount",
544 .help = "Mounting and unmounting of file systems",
545 .value =
546 "chroot\0"
547 "mount\0"
548 "pivot_root\0"
549 "umount\0"
550 "umount2\0"
551 },
552 [SYSCALL_FILTER_SET_NETWORK_IO] = {
553 .name = "@network-io",
554 .help = "Network or Unix socket IO, should not be needed if not network facing",
555 .value =
556 "accept\0"
557 "accept4\0"
558 "bind\0"
559 "connect\0"
560 "getpeername\0"
561 "getsockname\0"
562 "getsockopt\0"
563 "listen\0"
564 "recv\0"
565 "recvfrom\0"
566 "recvmmsg\0"
567 "recvmsg\0"
568 "send\0"
569 "sendmmsg\0"
570 "sendmsg\0"
571 "sendto\0"
572 "setsockopt\0"
573 "shutdown\0"
574 "socket\0"
575 "socketcall\0"
576 "socketpair\0"
577 },
578 [SYSCALL_FILTER_SET_OBSOLETE] = {
579 /* some unknown even to libseccomp */
580 .name = "@obsolete",
581 .help = "Unusual, obsolete or unimplemented system calls",
582 .value =
583 "_sysctl\0"
584 "afs_syscall\0"
585 "bdflush\0"
586 "break\0"
587 "create_module\0"
588 "ftime\0"
589 "get_kernel_syms\0"
590 "getpmsg\0"
591 "gtty\0"
592 "idle\0"
593 "lock\0"
594 "mpx\0"
595 "prof\0"
596 "profil\0"
597 "putpmsg\0"
598 "query_module\0"
599 "security\0"
600 "sgetmask\0"
601 "ssetmask\0"
602 "stty\0"
603 "sysfs\0"
604 "tuxcall\0"
605 "ulimit\0"
606 "uselib\0"
607 "ustat\0"
608 "vserver\0"
609 },
610 [SYSCALL_FILTER_SET_PRIVILEGED] = {
611 .name = "@privileged",
612 .help = "All system calls which need super-user capabilities",
613 .value =
614 "@chown\0"
615 "@clock\0"
616 "@module\0"
617 "@raw-io\0"
618 "@reboot\0"
619 "@swap\0"
620 "_sysctl\0"
621 "acct\0"
622 "bpf\0"
623 "capset\0"
624 "chroot\0"
625 "nfsservctl\0"
626 "pivot_root\0"
627 "quotactl\0"
628 "setdomainname\0"
629 "setfsuid\0"
630 "setfsuid32\0"
631 "setgroups\0"
632 "setgroups32\0"
633 "sethostname\0"
634 "setresuid\0"
635 "setresuid32\0"
636 "setreuid\0"
637 "setreuid32\0"
638 "setuid\0"
639 "setuid32\0"
640 "vhangup\0"
641 },
642 [SYSCALL_FILTER_SET_PROCESS] = {
643 .name = "@process",
644 .help = "Process control, execution, namespaceing operations",
645 .value =
646 "arch_prctl\0"
647 "capget\0" /* Able to query arbitrary processes */
648 "clone\0"
649 "execveat\0"
650 "fork\0"
651 "getrusage\0"
652 "kill\0"
653 "prctl\0"
654 "rt_sigqueueinfo\0"
655 "rt_tgsigqueueinfo\0"
656 "setns\0"
657 "tgkill\0"
658 "times\0"
659 "tkill\0"
660 "unshare\0"
661 "vfork\0"
662 "wait4\0"
663 "waitid\0"
664 "waitpid\0"
665 },
666 [SYSCALL_FILTER_SET_RAW_IO] = {
667 .name = "@raw-io",
668 .help = "Raw I/O port access",
669 .value =
670 "ioperm\0"
671 "iopl\0"
672 "pciconfig_iobase\0"
673 "pciconfig_read\0"
674 "pciconfig_write\0"
675 #ifdef __NR_s390_pci_mmio_read
676 "s390_pci_mmio_read\0"
677 #endif
678 #ifdef __NR_s390_pci_mmio_write
679 "s390_pci_mmio_write\0"
680 #endif
681 },
682 [SYSCALL_FILTER_SET_REBOOT] = {
683 .name = "@reboot",
684 .help = "Reboot and reboot preparation/kexec",
685 .value =
686 "kexec_file_load\0"
687 "kexec_load\0"
688 "reboot\0"
689 },
690 [SYSCALL_FILTER_SET_RESOURCES] = {
691 .name = "@resources",
692 .help = "Alter resource settings",
693 .value =
694 "ioprio_set\0"
695 "mbind\0"
696 "migrate_pages\0"
697 "move_pages\0"
698 "nice\0"
699 "sched_setaffinity\0"
700 "sched_setattr\0"
701 "sched_setparam\0"
702 "sched_setscheduler\0"
703 "set_mempolicy\0"
704 "setpriority\0"
705 "setrlimit\0"
706 },
707 [SYSCALL_FILTER_SET_SETUID] = {
708 .name = "@setuid",
709 .help = "Operations for changing user/group credentials",
710 .value =
711 "setgid\0"
712 "setgid32\0"
713 "setgroups\0"
714 "setgroups32\0"
715 "setregid\0"
716 "setregid32\0"
717 "setresgid\0"
718 "setresgid32\0"
719 "setresuid\0"
720 "setresuid32\0"
721 "setreuid\0"
722 "setreuid32\0"
723 "setuid\0"
724 "setuid32\0"
725 },
726 [SYSCALL_FILTER_SET_SIGNAL] = {
727 .name = "@signal",
728 .help = "Process signal handling",
729 .value =
730 "rt_sigaction\0"
731 "rt_sigpending\0"
732 "rt_sigprocmask\0"
733 "rt_sigsuspend\0"
734 "rt_sigtimedwait\0"
735 "sigaction\0"
736 "sigaltstack\0"
737 "signal\0"
738 "signalfd\0"
739 "signalfd4\0"
740 "sigpending\0"
741 "sigprocmask\0"
742 "sigsuspend\0"
743 },
744 [SYSCALL_FILTER_SET_SWAP] = {
745 .name = "@swap",
746 .help = "Enable/disable swap devices",
747 .value =
748 "swapoff\0"
749 "swapon\0"
750 },
751 [SYSCALL_FILTER_SET_SYNC] = {
752 .name = "@sync",
753 .help = "Synchronize files and memory to storage",
754 .value =
755 "fdatasync\0"
756 "fsync\0"
757 "msync\0"
758 "sync\0"
759 "sync_file_range\0"
760 "syncfs\0"
761 },
762 [SYSCALL_FILTER_SET_TIMER] = {
763 .name = "@timer",
764 .help = "Schedule operations by time",
765 .value =
766 "alarm\0"
767 "getitimer\0"
768 "setitimer\0"
769 "timer_create\0"
770 "timer_delete\0"
771 "timer_getoverrun\0"
772 "timer_gettime\0"
773 "timer_settime\0"
774 "timerfd_create\0"
775 "timerfd_gettime\0"
776 "timerfd_settime\0"
777 "times\0"
778 },
779 };
780
781 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
782 unsigned i;
783
784 if (isempty(name) || name[0] != '@')
785 return NULL;
786
787 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
788 if (streq(syscall_filter_sets[i].name, name))
789 return syscall_filter_sets + i;
790
791 return NULL;
792 }
793
794 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
795
796 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
797 int r;
798
799 assert(seccomp);
800 assert(name);
801
802 if (strv_contains(exclude, name))
803 return 0;
804
805 if (name[0] == '@') {
806 const SyscallFilterSet *other;
807
808 other = syscall_filter_set_find(name);
809 if (!other) {
810 log_debug("Filter set %s is not known!", name);
811 return -EINVAL;
812 }
813
814 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
815 if (r < 0)
816 return r;
817 } else {
818 int id;
819
820 id = seccomp_syscall_resolve_name(name);
821 if (id == __NR_SCMP_ERROR) {
822 log_debug("System call %s is not known, ignoring.", name);
823 return 0;
824 }
825
826 r = seccomp_rule_add_exact(seccomp, action, id, 0);
827 if (r < 0)
828 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
829 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
830 }
831
832 return 0;
833 }
834
835 static int seccomp_add_syscall_filter_set(
836 scmp_filter_ctx seccomp,
837 const SyscallFilterSet *set,
838 uint32_t action,
839 char **exclude) {
840
841 const char *sys;
842 int r;
843
844 assert(seccomp);
845 assert(set);
846
847 NULSTR_FOREACH(sys, set->value) {
848 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
849 if (r < 0)
850 return r;
851 }
852
853 return 0;
854 }
855
856 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
857 uint32_t arch;
858 int r;
859
860 assert(set);
861
862 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
863 * earch local arch. */
864
865 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
866 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
867
868 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
869
870 r = seccomp_init_for_arch(&seccomp, arch, default_action);
871 if (r < 0)
872 return r;
873
874 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
875 if (r < 0) {
876 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
877 continue;
878 }
879
880 r = seccomp_load(seccomp);
881 if (IN_SET(r, -EPERM, -EACCES))
882 return r;
883 if (r < 0)
884 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
885 }
886
887 return 0;
888 }
889
890 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Hashmap* set, uint32_t action) {
891 uint32_t arch;
892 int r;
893
894 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
895 * SyscallFilterSet* table. */
896
897 if (hashmap_isempty(set) && default_action == SCMP_ACT_ALLOW)
898 return 0;
899
900 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
901 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
902 Iterator i;
903 void *id, *val;
904
905 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
906
907 r = seccomp_init_for_arch(&seccomp, arch, default_action);
908 if (r < 0)
909 return r;
910
911 HASHMAP_FOREACH_KEY(val, id, set, i) {
912 uint32_t a = action;
913 int e = PTR_TO_INT(val);
914
915 if (action != SCMP_ACT_ALLOW && e >= 0)
916 a = SCMP_ACT_ERRNO(e);
917
918 r = seccomp_rule_add_exact(seccomp, a, PTR_TO_INT(id) - 1, 0);
919 if (r < 0) {
920 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
921 _cleanup_free_ char *n = NULL;
922
923 n = seccomp_syscall_resolve_num_arch(SCMP_ARCH_NATIVE, PTR_TO_INT(id) - 1);
924 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
925 }
926 }
927
928 r = seccomp_load(seccomp);
929 if (IN_SET(r, -EPERM, -EACCES))
930 return r;
931 if (r < 0)
932 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
933 }
934
935 return 0;
936 }
937
938 int seccomp_parse_syscall_filter_full(
939 const char *name,
940 int errno_num,
941 Hashmap *filter,
942 SeccompParseFlags flags,
943 const char *unit,
944 const char *filename,
945 unsigned line) {
946
947 int r;
948
949 assert(name);
950 assert(filter);
951
952 if (name[0] == '@') {
953 const SyscallFilterSet *set;
954 const char *i;
955
956 set = syscall_filter_set_find(name);
957 if (!set) {
958 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
959 return -EINVAL;
960
961 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
962 "Unknown system call group, ignoring: %s", name);
963 return 0;
964 }
965
966 NULSTR_FOREACH(i, set->value) {
967 /* Call ourselves again, for the group to parse. Note that we downgrade logging here (i.e. take
968 * away the SECCOMP_PARSE_LOG flag) since any issues in the group table are our own problem,
969 * not a problem in user configuration data and we shouldn't pretend otherwise by complaining
970 * about them. */
971 r = seccomp_parse_syscall_filter_full(i, errno_num, filter, flags &~ SECCOMP_PARSE_LOG, unit, filename, line);
972 if (r < 0)
973 return r;
974 }
975 } else {
976 int id;
977
978 id = seccomp_syscall_resolve_name(name);
979 if (id == __NR_SCMP_ERROR) {
980 if (!(flags & SECCOMP_PARSE_PERMISSIVE))
981 return -EINVAL;
982
983 log_syntax(unit, flags & SECCOMP_PARSE_LOG ? LOG_WARNING : LOG_DEBUG, filename, line, 0,
984 "Failed to parse system call, ignoring: %s", name);
985 return 0;
986 }
987
988 /* If we previously wanted to forbid a syscall and now
989 * we want to allow it, then remove it from the list. */
990 if (!(flags & SECCOMP_PARSE_INVERT) == !!(flags & SECCOMP_PARSE_WHITELIST)) {
991 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(errno_num));
992 if (r < 0)
993 return flags & SECCOMP_PARSE_LOG ? log_oom() : -ENOMEM;
994 } else
995 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
996 }
997
998 return 0;
999 }
1000
1001 int seccomp_restrict_namespaces(unsigned long retain) {
1002 uint32_t arch;
1003 int r;
1004
1005 if (DEBUG_LOGGING) {
1006 _cleanup_free_ char *s = NULL;
1007
1008 (void) namespace_flags_to_string(retain, &s);
1009 log_debug("Restricting namespace to: %s.", strna(s));
1010 }
1011
1012 /* NOOP? */
1013 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
1014 return 0;
1015
1016 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1017 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1018 unsigned i;
1019
1020 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1021
1022 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1023 if (r < 0)
1024 return r;
1025
1026 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
1027 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
1028 * altogether. */
1029 r = seccomp_rule_add_exact(
1030 seccomp,
1031 SCMP_ACT_ERRNO(EPERM),
1032 SCMP_SYS(setns),
1033 0);
1034 else
1035 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
1036 * special invocation with a zero flags argument, right here. */
1037 r = seccomp_rule_add_exact(
1038 seccomp,
1039 SCMP_ACT_ERRNO(EPERM),
1040 SCMP_SYS(setns),
1041 1,
1042 SCMP_A1(SCMP_CMP_EQ, 0));
1043 if (r < 0) {
1044 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1045 continue;
1046 }
1047
1048 for (i = 0; namespace_flag_map[i].name; i++) {
1049 unsigned long f;
1050
1051 f = namespace_flag_map[i].flag;
1052 if ((retain & f) == f) {
1053 log_debug("Permitting %s.", namespace_flag_map[i].name);
1054 continue;
1055 }
1056
1057 log_debug("Blocking %s.", namespace_flag_map[i].name);
1058
1059 r = seccomp_rule_add_exact(
1060 seccomp,
1061 SCMP_ACT_ERRNO(EPERM),
1062 SCMP_SYS(unshare),
1063 1,
1064 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1065 if (r < 0) {
1066 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1067 break;
1068 }
1069
1070 /* On s390/s390x the first two parameters to clone are switched */
1071 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1072 r = seccomp_rule_add_exact(
1073 seccomp,
1074 SCMP_ACT_ERRNO(EPERM),
1075 SCMP_SYS(clone),
1076 1,
1077 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1078 else
1079 r = seccomp_rule_add_exact(
1080 seccomp,
1081 SCMP_ACT_ERRNO(EPERM),
1082 SCMP_SYS(clone),
1083 1,
1084 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1085 if (r < 0) {
1086 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1087 break;
1088 }
1089
1090 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1091 r = seccomp_rule_add_exact(
1092 seccomp,
1093 SCMP_ACT_ERRNO(EPERM),
1094 SCMP_SYS(setns),
1095 1,
1096 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1097 if (r < 0) {
1098 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1099 break;
1100 }
1101 }
1102 }
1103 if (r < 0)
1104 continue;
1105
1106 r = seccomp_load(seccomp);
1107 if (IN_SET(r, -EPERM, -EACCES))
1108 return r;
1109 if (r < 0)
1110 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1111 }
1112
1113 return 0;
1114 }
1115
1116 int seccomp_protect_sysctl(void) {
1117 uint32_t arch;
1118 int r;
1119
1120 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1121 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1122
1123 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1124
1125 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1126 /* No _sysctl syscall */
1127 continue;
1128
1129 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1130 if (r < 0)
1131 return r;
1132
1133 r = seccomp_rule_add_exact(
1134 seccomp,
1135 SCMP_ACT_ERRNO(EPERM),
1136 SCMP_SYS(_sysctl),
1137 0);
1138 if (r < 0) {
1139 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1140 continue;
1141 }
1142
1143 r = seccomp_load(seccomp);
1144 if (IN_SET(r, -EPERM, -EACCES))
1145 return r;
1146 if (r < 0)
1147 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1148 }
1149
1150 return 0;
1151 }
1152
1153 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1154 uint32_t arch;
1155 int r;
1156
1157 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1158 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1159 bool supported;
1160 Iterator i;
1161
1162 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1163
1164 switch (arch) {
1165
1166 case SCMP_ARCH_X86_64:
1167 case SCMP_ARCH_X32:
1168 case SCMP_ARCH_ARM:
1169 case SCMP_ARCH_AARCH64:
1170 case SCMP_ARCH_PPC:
1171 case SCMP_ARCH_PPC64:
1172 case SCMP_ARCH_PPC64LE:
1173 case SCMP_ARCH_MIPSEL64N32:
1174 case SCMP_ARCH_MIPS64N32:
1175 case SCMP_ARCH_MIPSEL64:
1176 case SCMP_ARCH_MIPS64:
1177 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1178 supported = true;
1179 break;
1180
1181 case SCMP_ARCH_S390:
1182 case SCMP_ARCH_S390X:
1183 case SCMP_ARCH_X86:
1184 case SCMP_ARCH_MIPSEL:
1185 case SCMP_ARCH_MIPS:
1186 default:
1187 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1188 * don't know */
1189 supported = false;
1190 break;
1191 }
1192
1193 if (!supported)
1194 continue;
1195
1196 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1197 if (r < 0)
1198 return r;
1199
1200 if (whitelist) {
1201 int af, first = 0, last = 0;
1202 void *afp;
1203
1204 /* If this is a whitelist, we first block the address families that are out of range and then
1205 * everything that is not in the set. First, we find the lowest and highest address family in
1206 * the set. */
1207
1208 SET_FOREACH(afp, address_families, i) {
1209 af = PTR_TO_INT(afp);
1210
1211 if (af <= 0 || af >= af_max())
1212 continue;
1213
1214 if (first == 0 || af < first)
1215 first = af;
1216
1217 if (last == 0 || af > last)
1218 last = af;
1219 }
1220
1221 assert((first == 0) == (last == 0));
1222
1223 if (first == 0) {
1224
1225 /* No entries in the valid range, block everything */
1226 r = seccomp_rule_add_exact(
1227 seccomp,
1228 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1229 SCMP_SYS(socket),
1230 0);
1231 if (r < 0) {
1232 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1233 continue;
1234 }
1235
1236 } else {
1237
1238 /* Block everything below the first entry */
1239 r = seccomp_rule_add_exact(
1240 seccomp,
1241 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1242 SCMP_SYS(socket),
1243 1,
1244 SCMP_A0(SCMP_CMP_LT, first));
1245 if (r < 0) {
1246 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1247 continue;
1248 }
1249
1250 /* Block everything above the last entry */
1251 r = seccomp_rule_add_exact(
1252 seccomp,
1253 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1254 SCMP_SYS(socket),
1255 1,
1256 SCMP_A0(SCMP_CMP_GT, last));
1257 if (r < 0) {
1258 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1259 continue;
1260 }
1261
1262 /* Block everything between the first and last entry */
1263 for (af = 1; af < af_max(); af++) {
1264
1265 if (set_contains(address_families, INT_TO_PTR(af)))
1266 continue;
1267
1268 r = seccomp_rule_add_exact(
1269 seccomp,
1270 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1271 SCMP_SYS(socket),
1272 1,
1273 SCMP_A0(SCMP_CMP_EQ, af));
1274 if (r < 0)
1275 break;
1276 }
1277 if (r < 0) {
1278 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1279 continue;
1280 }
1281 }
1282
1283 } else {
1284 void *af;
1285
1286 /* If this is a blacklist, then generate one rule for
1287 * each address family that are then combined in OR
1288 * checks. */
1289
1290 SET_FOREACH(af, address_families, i) {
1291
1292 r = seccomp_rule_add_exact(
1293 seccomp,
1294 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1295 SCMP_SYS(socket),
1296 1,
1297 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1298 if (r < 0)
1299 break;
1300 }
1301 if (r < 0) {
1302 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1303 continue;
1304 }
1305 }
1306
1307 r = seccomp_load(seccomp);
1308 if (IN_SET(r, -EPERM, -EACCES))
1309 return r;
1310 if (r < 0)
1311 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1312 }
1313
1314 return 0;
1315 }
1316
1317 int seccomp_restrict_realtime(void) {
1318 static const int permitted_policies[] = {
1319 SCHED_OTHER,
1320 SCHED_BATCH,
1321 SCHED_IDLE,
1322 };
1323
1324 int r, max_policy = 0;
1325 uint32_t arch;
1326 unsigned i;
1327
1328 /* Determine the highest policy constant we want to allow */
1329 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1330 if (permitted_policies[i] > max_policy)
1331 max_policy = permitted_policies[i];
1332
1333 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1334 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1335 int p;
1336
1337 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1338
1339 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1340 if (r < 0)
1341 return r;
1342
1343 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1344 * whitelist. */
1345 for (p = 0; p < max_policy; p++) {
1346 bool good = false;
1347
1348 /* Check if this is in the whitelist. */
1349 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1350 if (permitted_policies[i] == p) {
1351 good = true;
1352 break;
1353 }
1354
1355 if (good)
1356 continue;
1357
1358 /* Deny this policy */
1359 r = seccomp_rule_add_exact(
1360 seccomp,
1361 SCMP_ACT_ERRNO(EPERM),
1362 SCMP_SYS(sched_setscheduler),
1363 1,
1364 SCMP_A1(SCMP_CMP_EQ, p));
1365 if (r < 0) {
1366 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1367 continue;
1368 }
1369 }
1370
1371 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1372 * unsigned here, hence no need no check for < 0 values. */
1373 r = seccomp_rule_add_exact(
1374 seccomp,
1375 SCMP_ACT_ERRNO(EPERM),
1376 SCMP_SYS(sched_setscheduler),
1377 1,
1378 SCMP_A1(SCMP_CMP_GT, max_policy));
1379 if (r < 0) {
1380 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1381 continue;
1382 }
1383
1384 r = seccomp_load(seccomp);
1385 if (IN_SET(r, -EPERM, -EACCES))
1386 return r;
1387 if (r < 0)
1388 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1389 }
1390
1391 return 0;
1392 }
1393
1394 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1395 uint32_t arch,
1396 int nr,
1397 unsigned int arg_cnt,
1398 const struct scmp_arg_cmp arg) {
1399 int r;
1400
1401 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1402 if (r < 0) {
1403 _cleanup_free_ char *n = NULL;
1404
1405 n = seccomp_syscall_resolve_num_arch(arch, nr);
1406 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1407 strna(n),
1408 seccomp_arch_to_string(arch));
1409 }
1410
1411 return r;
1412 }
1413
1414 /* For known architectures, check that syscalls are indeed defined or not. */
1415 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1416 assert_cc(SCMP_SYS(shmget) > 0);
1417 assert_cc(SCMP_SYS(shmat) > 0);
1418 assert_cc(SCMP_SYS(shmdt) > 0);
1419 #elif defined(__i386__) || defined(__powerpc64__)
1420 assert_cc(SCMP_SYS(shmget) < 0);
1421 assert_cc(SCMP_SYS(shmat) < 0);
1422 assert_cc(SCMP_SYS(shmdt) < 0);
1423 #endif
1424
1425 int seccomp_memory_deny_write_execute(void) {
1426
1427 uint32_t arch;
1428 int r;
1429
1430 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1431 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1432 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1433
1434 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1435
1436 switch (arch) {
1437
1438 case SCMP_ARCH_X86:
1439 filter_syscall = SCMP_SYS(mmap2);
1440 block_syscall = SCMP_SYS(mmap);
1441 break;
1442
1443 case SCMP_ARCH_PPC:
1444 case SCMP_ARCH_PPC64:
1445 case SCMP_ARCH_PPC64LE:
1446 filter_syscall = SCMP_SYS(mmap);
1447
1448 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1449 * We ignore that here, which means there's still a way to get writable/executable
1450 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1451
1452 break;
1453
1454 case SCMP_ARCH_ARM:
1455 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1456 shmat_syscall = SCMP_SYS(shmat);
1457 break;
1458
1459 case SCMP_ARCH_X86_64:
1460 case SCMP_ARCH_X32:
1461 case SCMP_ARCH_AARCH64:
1462 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1463 shmat_syscall = SCMP_SYS(shmat);
1464 break;
1465
1466 /* Please add more definitions here, if you port systemd to other architectures! */
1467
1468 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1469 #warning "Consider adding the right mmap() syscall definitions here!"
1470 #endif
1471 }
1472
1473 /* Can't filter mmap() on this arch, then skip it */
1474 if (filter_syscall == 0)
1475 continue;
1476
1477 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1478 if (r < 0)
1479 return r;
1480
1481 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1482 1,
1483 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1484 if (r < 0)
1485 continue;
1486
1487 if (block_syscall != 0) {
1488 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1489 if (r < 0)
1490 continue;
1491 }
1492
1493 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1494 1,
1495 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1496 if (r < 0)
1497 continue;
1498
1499 #ifdef __NR_pkey_mprotect
1500 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(pkey_mprotect),
1501 1,
1502 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1503 if (r < 0)
1504 continue;
1505 #endif
1506
1507 if (shmat_syscall != 0) {
1508 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1509 1,
1510 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1511 if (r < 0)
1512 continue;
1513 }
1514
1515 r = seccomp_load(seccomp);
1516 if (IN_SET(r, -EPERM, -EACCES))
1517 return r;
1518 if (r < 0)
1519 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1520 }
1521
1522 return 0;
1523 }
1524
1525 int seccomp_restrict_archs(Set *archs) {
1526 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1527 Iterator i;
1528 void *id;
1529 int r;
1530
1531 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1532 * list.
1533 *
1534 * There are some qualifications. However the most important use is to stop processes from bypassing
1535 * system call restrictions, in case they used a broader (multiplexing) syscall which is only available
1536 * in a non-native architecture. There are no holes in this use case, at least so far. */
1537
1538 /* Note libseccomp includes our "native" (current) architecture in the filter by default.
1539 * We do not remove it. For example, our callers expect to be able to call execve() afterwards
1540 * to run a program with the restrictions applied. */
1541 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1542 if (!seccomp)
1543 return -ENOMEM;
1544
1545 SET_FOREACH(id, archs, i) {
1546 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1547 if (r < 0 && r != -EEXIST)
1548 return r;
1549 }
1550
1551 /* The vdso for x32 assumes that x86-64 syscalls are available. Let's allow them, since x32
1552 * x32 syscalls should basically match x86-64 for everything except the pointer type.
1553 * The important thing is that you can block the old 32-bit x86 syscalls.
1554 * https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=850047 */
1555
1556 if (seccomp_arch_native() == SCMP_ARCH_X32 ||
1557 set_contains(archs, UINT32_TO_PTR(SCMP_ARCH_X32 + 1))) {
1558
1559 r = seccomp_arch_add(seccomp, SCMP_ARCH_X86_64);
1560 if (r < 0 && r != -EEXIST)
1561 return r;
1562 }
1563
1564 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1565 if (r < 0)
1566 return r;
1567
1568 r = seccomp_load(seccomp);
1569 if (IN_SET(r, -EPERM, -EACCES))
1570 return r;
1571 if (r < 0)
1572 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1573
1574 return 0;
1575 }
1576
1577 int parse_syscall_archs(char **l, Set **archs) {
1578 _cleanup_set_free_ Set *_archs;
1579 char **s;
1580 int r;
1581
1582 assert(l);
1583 assert(archs);
1584
1585 r = set_ensure_allocated(&_archs, NULL);
1586 if (r < 0)
1587 return r;
1588
1589 STRV_FOREACH(s, l) {
1590 uint32_t a;
1591
1592 r = seccomp_arch_from_string(*s, &a);
1593 if (r < 0)
1594 return -EINVAL;
1595
1596 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1597 if (r < 0)
1598 return -ENOMEM;
1599 }
1600
1601 *archs = TAKE_PTR(_archs);
1602
1603 return 0;
1604 }
1605
1606 int seccomp_filter_set_add(Hashmap *filter, bool add, const SyscallFilterSet *set) {
1607 const char *i;
1608 int r;
1609
1610 assert(set);
1611
1612 NULSTR_FOREACH(i, set->value) {
1613
1614 if (i[0] == '@') {
1615 const SyscallFilterSet *more;
1616
1617 more = syscall_filter_set_find(i);
1618 if (!more)
1619 return -ENXIO;
1620
1621 r = seccomp_filter_set_add(filter, add, more);
1622 if (r < 0)
1623 return r;
1624 } else {
1625 int id;
1626
1627 id = seccomp_syscall_resolve_name(i);
1628 if (id == __NR_SCMP_ERROR) {
1629 log_debug("Couldn't resolve system call, ignoring: %s", i);
1630 continue;
1631 }
1632
1633 if (add) {
1634 r = hashmap_put(filter, INT_TO_PTR(id + 1), INT_TO_PTR(-1));
1635 if (r < 0)
1636 return r;
1637 } else
1638 (void) hashmap_remove(filter, INT_TO_PTR(id + 1));
1639 }
1640 }
1641
1642 return 0;
1643 }
1644
1645 int seccomp_lock_personality(unsigned long personality) {
1646 uint32_t arch;
1647 int r;
1648
1649 if (personality >= PERSONALITY_INVALID)
1650 return -EINVAL;
1651
1652 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1653 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1654
1655 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1656 if (r < 0)
1657 return r;
1658
1659 r = seccomp_rule_add_exact(
1660 seccomp,
1661 SCMP_ACT_ERRNO(EPERM),
1662 SCMP_SYS(personality),
1663 1,
1664 SCMP_A0(SCMP_CMP_NE, personality));
1665 if (r < 0) {
1666 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1667 continue;
1668 }
1669
1670 r = seccomp_load(seccomp);
1671 if (IN_SET(r, -EPERM, -EACCES))
1672 return r;
1673 if (r < 0)
1674 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1675 }
1676
1677 return 0;
1678 }