]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
6a4d30bac163b4183bafe577362cb0b1e356b8e7
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "process-util.h"
33 #include "seccomp-util.h"
34 #include "set.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "util.h"
38 #include "errno-list.h"
39
40 const uint32_t seccomp_local_archs[] = {
41
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43
44 #if defined(__x86_64__) && defined(__ILP32__)
45 SCMP_ARCH_X86,
46 SCMP_ARCH_X86_64,
47 SCMP_ARCH_X32, /* native */
48 #elif defined(__x86_64__) && !defined(__ILP32__)
49 SCMP_ARCH_X86,
50 SCMP_ARCH_X32,
51 SCMP_ARCH_X86_64, /* native */
52 #elif defined(__i386__)
53 SCMP_ARCH_X86,
54 #elif defined(__aarch64__)
55 SCMP_ARCH_ARM,
56 SCMP_ARCH_AARCH64, /* native */
57 #elif defined(__arm__)
58 SCMP_ARCH_ARM,
59 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
63 SCMP_ARCH_MIPS,
64 SCMP_ARCH_MIPSEL, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64N32,
69 SCMP_ARCH_MIPS64N32,
70 SCMP_ARCH_MIPSEL64,
71 SCMP_ARCH_MIPS64, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64N32,
76 SCMP_ARCH_MIPSEL64N32,
77 SCMP_ARCH_MIPS64,
78 SCMP_ARCH_MIPSEL64, /* native */
79 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPSEL,
81 SCMP_ARCH_MIPS,
82 SCMP_ARCH_MIPSEL64,
83 SCMP_ARCH_MIPS64,
84 SCMP_ARCH_MIPSEL64N32,
85 SCMP_ARCH_MIPS64N32, /* native */
86 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
87 SCMP_ARCH_MIPS,
88 SCMP_ARCH_MIPSEL,
89 SCMP_ARCH_MIPS64,
90 SCMP_ARCH_MIPSEL64,
91 SCMP_ARCH_MIPS64N32,
92 SCMP_ARCH_MIPSEL64N32, /* native */
93 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
94 SCMP_ARCH_PPC,
95 SCMP_ARCH_PPC64LE,
96 SCMP_ARCH_PPC64, /* native */
97 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC,
99 SCMP_ARCH_PPC64,
100 SCMP_ARCH_PPC64LE, /* native */
101 #elif defined(__powerpc__)
102 SCMP_ARCH_PPC,
103 #elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106 #elif defined(__s390__)
107 SCMP_ARCH_S390,
108 #endif
109 (uint32_t) -1
110 };
111
112 const char* seccomp_arch_to_string(uint32_t c) {
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
117
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
120 return "native";
121 case SCMP_ARCH_X86:
122 return "x86";
123 case SCMP_ARCH_X86_64:
124 return "x86-64";
125 case SCMP_ARCH_X32:
126 return "x32";
127 case SCMP_ARCH_ARM:
128 return "arm";
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
150 return "s390";
151 case SCMP_ARCH_S390X:
152 return "s390x";
153 default:
154 return NULL;
155 }
156 }
157
158 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
198 else
199 return -EINVAL;
200
201 return 0;
202 }
203
204 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
205 scmp_filter_ctx seccomp;
206 int r;
207
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
219 if (r < 0)
220 goto finish;
221
222 r = seccomp_arch_add(seccomp, arch);
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245 finish:
246 seccomp_release(seccomp);
247 return r;
248 }
249
/* Returns true if querying the current seccomp mode via prctl(PR_GET_SECCOMP) does not fail. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
/* Probes for seccomp filter mode by requesting SECCOMP_MODE_FILTER with a NULL filter program. Availability
 * is reported only if that request fails with EFAULT; success or any other error counts as unavailable. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns true if both the basic seccomp query and the filter mode probe succeed. The probes are run on the
 * first call only; the outcome is remembered in a function-local static and returned on later calls. */
bool is_seccomp_available(void) {
        static int cache = -1; /* -1: not probed yet, otherwise the boolean result */

        if (cache >= 0)
                return cache;

        cache = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cache;
}
269
270 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
271 [SYSCALL_FILTER_SET_DEFAULT] = {
272 .name = "@default",
273 .help = "System calls that are always permitted",
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
281 "futex\0"
282 "get_robust_list\0"
283 "get_thread_area\0"
284 "getegid\0"
285 "getegid32\0"
286 "geteuid\0"
287 "geteuid32\0"
288 "getgid\0"
289 "getgid32\0"
290 "getgroups\0"
291 "getgroups32\0"
292 "getpgid\0"
293 "getpgrp\0"
294 "getpid\0"
295 "getppid\0"
296 "getresgid\0"
297 "getresgid32\0"
298 "getresuid\0"
299 "getresuid32\0"
300 "getrlimit\0" /* make sure processes can query stack size and such */
301 "getsid\0"
302 "gettid\0"
303 "gettimeofday\0"
304 "getuid\0"
305 "getuid32\0"
306 "membarrier\0"
307 "nanosleep\0"
308 "pause\0"
309 "prlimit64\0"
310 "restart_syscall\0"
311 "rt_sigreturn\0"
312 "sched_yield\0"
313 "set_robust_list\0"
314 "set_thread_area\0"
315 "set_tid_address\0"
316 "sigreturn\0"
317 "time\0"
318 "ugetrlimit\0"
319 },
320 [SYSCALL_FILTER_SET_BASIC_IO] = {
321 .name = "@basic-io",
322 .help = "Basic IO",
323 .value =
324 "_llseek\0"
325 "close\0"
326 "dup\0"
327 "dup2\0"
328 "dup3\0"
329 "lseek\0"
330 "pread64\0"
331 "preadv\0"
332 "pwrite64\0"
333 "pwritev\0"
334 "read\0"
335 "readv\0"
336 "write\0"
337 "writev\0"
338 },
339 [SYSCALL_FILTER_SET_CLOCK] = {
340 .name = "@clock",
341 .help = "Change the system time",
342 .value =
343 "adjtimex\0"
344 "clock_adjtime\0"
345 "clock_settime\0"
346 "settimeofday\0"
347 "stime\0"
348 },
349 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
350 .name = "@cpu-emulation",
351 .help = "System calls for CPU emulation functionality",
352 .value =
353 "modify_ldt\0"
354 "subpage_prot\0"
355 "switch_endian\0"
356 "vm86\0"
357 "vm86old\0"
358 },
359 [SYSCALL_FILTER_SET_DEBUG] = {
360 .name = "@debug",
361 .help = "Debugging, performance monitoring and tracing functionality",
362 .value =
363 "lookup_dcookie\0"
364 "perf_event_open\0"
365 "process_vm_readv\0"
366 "process_vm_writev\0"
367 "ptrace\0"
368 "rtas\0"
369 #ifdef __NR_s390_runtime_instr
370 "s390_runtime_instr\0"
371 #endif
372 "sys_debug_setcontext\0"
373 },
374 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
375 .name = "@file-system",
376 .help = "File system operations",
377 .value =
378 "access\0"
379 "chdir\0"
380 "chmod\0"
381 "close\0"
382 "creat\0"
383 "faccessat\0"
384 "fallocate\0"
385 "fchdir\0"
386 "fchmod\0"
387 "fchmodat\0"
388 "fcntl\0"
389 "fcntl64\0"
390 "fgetxattr\0"
391 "flistxattr\0"
392 "fremovexattr\0"
393 "fsetxattr\0"
394 "fstat\0"
395 "fstat64\0"
396 "fstatat64\0"
397 "fstatfs\0"
398 "fstatfs64\0"
399 "ftruncate\0"
400 "ftruncate64\0"
401 "futimesat\0"
402 "getcwd\0"
403 "getdents\0"
404 "getdents64\0"
405 "getxattr\0"
406 "inotify_add_watch\0"
407 "inotify_init\0"
408 "inotify_init1\0"
409 "inotify_rm_watch\0"
410 "lgetxattr\0"
411 "link\0"
412 "linkat\0"
413 "listxattr\0"
414 "llistxattr\0"
415 "lremovexattr\0"
416 "lsetxattr\0"
417 "lstat\0"
418 "lstat64\0"
419 "mkdir\0"
420 "mkdirat\0"
421 "mknod\0"
422 "mknodat\0"
423 "mmap\0"
424 "mmap2\0"
425 "munmap\0"
426 "newfstatat\0"
427 "oldfstat\0"
428 "oldlstat\0"
429 "oldstat\0"
430 "open\0"
431 "openat\0"
432 "readlink\0"
433 "readlinkat\0"
434 "removexattr\0"
435 "rename\0"
436 "renameat\0"
437 "renameat2\0"
438 "rmdir\0"
439 "setxattr\0"
440 "stat\0"
441 "stat64\0"
442 "statfs\0"
443 "statfs64\0"
444 #ifdef __PNR_statx
445 "statx\0"
446 #endif
447 "symlink\0"
448 "symlinkat\0"
449 "truncate\0"
450 "truncate64\0"
451 "unlink\0"
452 "unlinkat\0"
453 "utime\0"
454 "utimensat\0"
455 "utimes\0"
456 },
457 [SYSCALL_FILTER_SET_IO_EVENT] = {
458 .name = "@io-event",
459 .help = "Event loop system calls",
460 .value =
461 "_newselect\0"
462 "epoll_create\0"
463 "epoll_create1\0"
464 "epoll_ctl\0"
465 "epoll_ctl_old\0"
466 "epoll_pwait\0"
467 "epoll_wait\0"
468 "epoll_wait_old\0"
469 "eventfd\0"
470 "eventfd2\0"
471 "poll\0"
472 "ppoll\0"
473 "pselect6\0"
474 "select\0"
475 },
476 [SYSCALL_FILTER_SET_IPC] = {
477 .name = "@ipc",
478 .help = "SysV IPC, POSIX Message Queues or other IPC",
479 .value =
480 "ipc\0"
481 "memfd_create\0"
482 "mq_getsetattr\0"
483 "mq_notify\0"
484 "mq_open\0"
485 "mq_timedreceive\0"
486 "mq_timedsend\0"
487 "mq_unlink\0"
488 "msgctl\0"
489 "msgget\0"
490 "msgrcv\0"
491 "msgsnd\0"
492 "pipe\0"
493 "pipe2\0"
494 "process_vm_readv\0"
495 "process_vm_writev\0"
496 "semctl\0"
497 "semget\0"
498 "semop\0"
499 "semtimedop\0"
500 "shmat\0"
501 "shmctl\0"
502 "shmdt\0"
503 "shmget\0"
504 },
505 [SYSCALL_FILTER_SET_KEYRING] = {
506 .name = "@keyring",
507 .help = "Kernel keyring access",
508 .value =
509 "add_key\0"
510 "keyctl\0"
511 "request_key\0"
512 },
513 [SYSCALL_FILTER_SET_MEMLOCK] = {
514 .name = "@memlock",
515 .help = "Memory locking control",
516 .value =
517 "mlock\0"
518 "mlock2\0"
519 "mlockall\0"
520 "munlock\0"
521 "munlockall\0"
522 },
523 [SYSCALL_FILTER_SET_MODULE] = {
524 .name = "@module",
525 .help = "Loading and unloading of kernel modules",
526 .value =
527 "delete_module\0"
528 "finit_module\0"
529 "init_module\0"
530 },
531 [SYSCALL_FILTER_SET_MOUNT] = {
532 .name = "@mount",
533 .help = "Mounting and unmounting of file systems",
534 .value =
535 "chroot\0"
536 "mount\0"
537 "pivot_root\0"
538 "umount\0"
539 "umount2\0"
540 },
541 [SYSCALL_FILTER_SET_NETWORK_IO] = {
542 .name = "@network-io",
543 .help = "Network or Unix socket IO, should not be needed if not network facing",
544 .value =
545 "accept\0"
546 "accept4\0"
547 "bind\0"
548 "connect\0"
549 "getpeername\0"
550 "getsockname\0"
551 "getsockopt\0"
552 "listen\0"
553 "recv\0"
554 "recvfrom\0"
555 "recvmmsg\0"
556 "recvmsg\0"
557 "send\0"
558 "sendmmsg\0"
559 "sendmsg\0"
560 "sendto\0"
561 "setsockopt\0"
562 "shutdown\0"
563 "socket\0"
564 "socketcall\0"
565 "socketpair\0"
566 },
567 [SYSCALL_FILTER_SET_OBSOLETE] = {
568 /* some unknown even to libseccomp */
569 .name = "@obsolete",
570 .help = "Unusual, obsolete or unimplemented system calls",
571 .value =
572 "_sysctl\0"
573 "afs_syscall\0"
574 "bdflush\0"
575 "break\0"
576 "create_module\0"
577 "ftime\0"
578 "get_kernel_syms\0"
579 "getpmsg\0"
580 "gtty\0"
581 "idle\0"
582 "lock\0"
583 "mpx\0"
584 "prof\0"
585 "profil\0"
586 "putpmsg\0"
587 "query_module\0"
588 "security\0"
589 "sgetmask\0"
590 "ssetmask\0"
591 "stty\0"
592 "sysfs\0"
593 "tuxcall\0"
594 "ulimit\0"
595 "uselib\0"
596 "ustat\0"
597 "vserver\0"
598 },
599 [SYSCALL_FILTER_SET_PRIVILEGED] = {
600 .name = "@privileged",
601 .help = "All system calls which need super-user capabilities",
602 .value =
603 "@clock\0"
604 "@module\0"
605 "@raw-io\0"
606 "_sysctl\0"
607 "acct\0"
608 "bpf\0"
609 "capset\0"
610 "chown\0"
611 "chown32\0"
612 "chroot\0"
613 "fchown\0"
614 "fchown32\0"
615 "fchownat\0"
616 "kexec_file_load\0"
617 "kexec_load\0"
618 "lchown\0"
619 "lchown32\0"
620 "nfsservctl\0"
621 "pivot_root\0"
622 "quotactl\0"
623 "reboot\0"
624 "setdomainname\0"
625 "setfsuid\0"
626 "setfsuid32\0"
627 "setgroups\0"
628 "setgroups32\0"
629 "sethostname\0"
630 "setresuid\0"
631 "setresuid32\0"
632 "setreuid\0"
633 "setreuid32\0"
634 "setuid\0"
635 "setuid32\0"
636 "swapoff\0"
637 "swapon\0"
638 "vhangup\0"
639 },
640 [SYSCALL_FILTER_SET_PROCESS] = {
641 .name = "@process",
642 .help = "Process control, execution, namespaceing operations",
643 .value =
644 "arch_prctl\0"
645 "capget\0" /* Able to query arbitrary processes */
646 "clone\0"
647 "execveat\0"
648 "fork\0"
649 "getrusage\0"
650 "gettid\0"
651 "kill\0"
652 "prctl\0"
653 "rt_sigqueueinfo\0"
654 "rt_tgsigqueueinfo\0"
655 "setns\0"
656 "tgkill\0"
657 "times\0"
658 "tkill\0"
659 "unshare\0"
660 "vfork\0"
661 "wait4\0"
662 "waitid\0"
663 "waitpid\0"
664 },
665 [SYSCALL_FILTER_SET_RAW_IO] = {
666 .name = "@raw-io",
667 .help = "Raw I/O port access",
668 .value =
669 "ioperm\0"
670 "iopl\0"
671 "pciconfig_iobase\0"
672 "pciconfig_read\0"
673 "pciconfig_write\0"
674 #ifdef __NR_s390_pci_mmio_read
675 "s390_pci_mmio_read\0"
676 #endif
677 #ifdef __NR_s390_pci_mmio_write
678 "s390_pci_mmio_write\0"
679 #endif
680 },
681 [SYSCALL_FILTER_SET_REBOOT] = {
682 .name = "@reboot",
683 .help = "Reboot and reboot preparation/kexec",
684 .value =
685 "kexec\0"
686 "kexec_file_load\0"
687 "reboot\0"
688 },
689 [SYSCALL_FILTER_SET_RESOURCES] = {
690 .name = "@resources",
691 .help = "Alter resource settings",
692 .value =
693 "ioprio_set\0"
694 "mbind\0"
695 "migrate_pages\0"
696 "move_pages\0"
697 "nice\0"
698 "sched_setaffinity\0"
699 "sched_setattr\0"
700 "sched_setparam\0"
701 "sched_setscheduler\0"
702 "set_mempolicy\0"
703 "setpriority\0"
704 "setrlimit\0"
705 },
706 [SYSCALL_FILTER_SET_SETUID] = {
707 .name = "@setuid",
708 .help = "Operations for changing user/group credentials",
709 .value =
710 "setgid\0"
711 "setgid32\0"
712 "setgroups\0"
713 "setgroups32\0"
714 "setregid\0"
715 "setregid32\0"
716 "setresgid\0"
717 "setresgid32\0"
718 "setresuid\0"
719 "setresuid32\0"
720 "setreuid\0"
721 "setreuid32\0"
722 "setuid\0"
723 "setuid32\0"
724 },
725 [SYSCALL_FILTER_SET_SIGNAL] = {
726 .name = "@signal",
727 .help = "Process signal handling",
728 .value =
729 "rt_sigaction\0"
730 "rt_sigpending\0"
731 "rt_sigprocmask\0"
732 "rt_sigsuspend\0"
733 "rt_sigtimedwait\0"
734 "sigaction\0"
735 "sigaltstack\0"
736 "signal\0"
737 "signalfd\0"
738 "signalfd4\0"
739 "sigpending\0"
740 "sigprocmask\0"
741 "sigsuspend\0"
742 },
743 [SYSCALL_FILTER_SET_SWAP] = {
744 .name = "@swap",
745 .help = "Enable/disable swap devices",
746 .value =
747 "swapoff\0"
748 "swapon\0"
749 },
750 [SYSCALL_FILTER_SET_TIMER] = {
751 .name = "@timer",
752 .help = "Schedule operations by time",
753 .value =
754 "alarm\0"
755 "getitimer\0"
756 "setitimer\0"
757 "timer_create\0"
758 "timer_delete\0"
759 "timer_getoverrun\0"
760 "timer_gettime\0"
761 "timer_settime\0"
762 "timerfd_create\0"
763 "timerfd_gettime\0"
764 "timerfd_settime\0"
765 "times\0"
766 },
767 };
768
769 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
770 unsigned i;
771
772 if (isempty(name) || name[0] != '@')
773 return NULL;
774
775 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
776 if (streq(syscall_filter_sets[i].name, name))
777 return syscall_filter_sets + i;
778
779 return NULL;
780 }
781
782 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
783
784 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
785 int r;
786
787 assert(seccomp);
788 assert(name);
789
790 if (strv_contains(exclude, name))
791 return 0;
792
793 if (name[0] == '@') {
794 const SyscallFilterSet *other;
795
796 other = syscall_filter_set_find(name);
797 if (!other) {
798 log_debug("Filter set %s is not known!", name);
799 return -EINVAL;
800 }
801
802 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
803 if (r < 0)
804 return r;
805 } else {
806 int id;
807
808 id = seccomp_syscall_resolve_name(name);
809 if (id == __NR_SCMP_ERROR) {
810 log_debug("System call %s is not known!", name);
811 return -EINVAL; /* Not known at all? Then that's a real error */
812 }
813
814 r = seccomp_rule_add_exact(seccomp, action, id, 0);
815 if (r < 0)
816 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
817 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
818 }
819
820 return 0;
821 }
822
823 static int seccomp_add_syscall_filter_set(
824 scmp_filter_ctx seccomp,
825 const SyscallFilterSet *set,
826 uint32_t action,
827 char **exclude) {
828
829 const char *sys;
830 int r;
831
832 assert(seccomp);
833 assert(set);
834
835 NULSTR_FOREACH(sys, set->value) {
836 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
837 if (r < 0)
838 return r;
839 }
840
841 return 0;
842 }
843
844 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
845 uint32_t arch;
846 int r;
847
848 assert(set);
849
850 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
851 * earch local arch. */
852
853 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
854 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
855
856 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
857
858 r = seccomp_init_for_arch(&seccomp, arch, default_action);
859 if (r < 0)
860 return r;
861
862 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
863 if (r < 0) {
864 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
865 continue;
866 }
867
868 r = seccomp_load(seccomp);
869 if (IN_SET(r, -EPERM, -EACCES))
870 return r;
871 if (r < 0)
872 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
873 }
874
875 return 0;
876 }
877
878 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
879 uint32_t arch;
880 int r;
881
882 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
883 * SyscallFilterSet* table. */
884
885 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
886 return 0;
887
888 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
889 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
890 Iterator i;
891 void *id;
892
893 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
894
895 r = seccomp_init_for_arch(&seccomp, arch, default_action);
896 if (r < 0)
897 return r;
898
899 SET_FOREACH(id, set, i) {
900 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
901 if (r < 0) {
902 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
903 _cleanup_free_ char *n = NULL;
904
905 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
906 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
907 }
908 }
909
910 r = seccomp_load(seccomp);
911 if (IN_SET(r, -EPERM, -EACCES))
912 return r;
913 if (r < 0)
914 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
915 }
916
917 return 0;
918 }
919
920 int seccomp_restrict_namespaces(unsigned long retain) {
921 uint32_t arch;
922 int r;
923
924 if (log_get_max_level() >= LOG_DEBUG) {
925 _cleanup_free_ char *s = NULL;
926
927 (void) namespace_flag_to_string_many(retain, &s);
928 log_debug("Restricting namespace to: %s.", strna(s));
929 }
930
931 /* NOOP? */
932 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
933 return 0;
934
935 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
936 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
937 unsigned i;
938
939 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
940
941 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
942 if (r < 0)
943 return r;
944
945 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
946 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
947 * altogether. */
948 r = seccomp_rule_add_exact(
949 seccomp,
950 SCMP_ACT_ERRNO(EPERM),
951 SCMP_SYS(setns),
952 0);
953 else
954 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
955 * special invocation with a zero flags argument, right here. */
956 r = seccomp_rule_add_exact(
957 seccomp,
958 SCMP_ACT_ERRNO(EPERM),
959 SCMP_SYS(setns),
960 1,
961 SCMP_A1(SCMP_CMP_EQ, 0));
962 if (r < 0) {
963 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
964 continue;
965 }
966
967 for (i = 0; namespace_flag_map[i].name; i++) {
968 unsigned long f;
969
970 f = namespace_flag_map[i].flag;
971 if ((retain & f) == f) {
972 log_debug("Permitting %s.", namespace_flag_map[i].name);
973 continue;
974 }
975
976 log_debug("Blocking %s.", namespace_flag_map[i].name);
977
978 r = seccomp_rule_add_exact(
979 seccomp,
980 SCMP_ACT_ERRNO(EPERM),
981 SCMP_SYS(unshare),
982 1,
983 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
984 if (r < 0) {
985 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
986 break;
987 }
988
989 /* On s390/s390x the first two parameters to clone are switched */
990 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
991 r = seccomp_rule_add_exact(
992 seccomp,
993 SCMP_ACT_ERRNO(EPERM),
994 SCMP_SYS(clone),
995 1,
996 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
997 else
998 r = seccomp_rule_add_exact(
999 seccomp,
1000 SCMP_ACT_ERRNO(EPERM),
1001 SCMP_SYS(clone),
1002 1,
1003 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1004 if (r < 0) {
1005 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1006 break;
1007 }
1008
1009 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1010 r = seccomp_rule_add_exact(
1011 seccomp,
1012 SCMP_ACT_ERRNO(EPERM),
1013 SCMP_SYS(setns),
1014 1,
1015 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1016 if (r < 0) {
1017 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1018 break;
1019 }
1020 }
1021 }
1022 if (r < 0)
1023 continue;
1024
1025 r = seccomp_load(seccomp);
1026 if (IN_SET(r, -EPERM, -EACCES))
1027 return r;
1028 if (r < 0)
1029 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1030 }
1031
1032 return 0;
1033 }
1034
1035 int seccomp_protect_sysctl(void) {
1036 uint32_t arch;
1037 int r;
1038
1039 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1040 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1041
1042 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1043
1044 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1045 /* No _sysctl syscall */
1046 continue;
1047
1048 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1049 if (r < 0)
1050 return r;
1051
1052 r = seccomp_rule_add_exact(
1053 seccomp,
1054 SCMP_ACT_ERRNO(EPERM),
1055 SCMP_SYS(_sysctl),
1056 0);
1057 if (r < 0) {
1058 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1059 continue;
1060 }
1061
1062 r = seccomp_load(seccomp);
1063 if (IN_SET(r, -EPERM, -EACCES))
1064 return r;
1065 if (r < 0)
1066 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1067 }
1068
1069 return 0;
1070 }
1071
1072 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1073 uint32_t arch;
1074 int r;
1075
1076 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1077 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1078 bool supported;
1079 Iterator i;
1080
1081 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1082
1083 switch (arch) {
1084
1085 case SCMP_ARCH_X86_64:
1086 case SCMP_ARCH_X32:
1087 case SCMP_ARCH_ARM:
1088 case SCMP_ARCH_AARCH64:
1089 case SCMP_ARCH_PPC64:
1090 case SCMP_ARCH_PPC64LE:
1091 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1092 supported = true;
1093 break;
1094
1095 case SCMP_ARCH_S390:
1096 case SCMP_ARCH_S390X:
1097 case SCMP_ARCH_PPC:
1098 case SCMP_ARCH_X86:
1099 default:
1100 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1101 * don't know */
1102 supported = false;
1103 break;
1104 }
1105
1106 if (!supported)
1107 continue;
1108
1109 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1110 if (r < 0)
1111 return r;
1112
1113 if (whitelist) {
1114 int af, first = 0, last = 0;
1115 void *afp;
1116
1117 /* If this is a whitelist, we first block the address families that are out of range and then
1118 * everything that is not in the set. First, we find the lowest and highest address family in
1119 * the set. */
1120
1121 SET_FOREACH(afp, address_families, i) {
1122 af = PTR_TO_INT(afp);
1123
1124 if (af <= 0 || af >= af_max())
1125 continue;
1126
1127 if (first == 0 || af < first)
1128 first = af;
1129
1130 if (last == 0 || af > last)
1131 last = af;
1132 }
1133
1134 assert((first == 0) == (last == 0));
1135
1136 if (first == 0) {
1137
1138 /* No entries in the valid range, block everything */
1139 r = seccomp_rule_add_exact(
1140 seccomp,
1141 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1142 SCMP_SYS(socket),
1143 0);
1144 if (r < 0) {
1145 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1146 continue;
1147 }
1148
1149 } else {
1150
1151 /* Block everything below the first entry */
1152 r = seccomp_rule_add_exact(
1153 seccomp,
1154 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1155 SCMP_SYS(socket),
1156 1,
1157 SCMP_A0(SCMP_CMP_LT, first));
1158 if (r < 0) {
1159 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1160 continue;
1161 }
1162
1163 /* Block everything above the last entry */
1164 r = seccomp_rule_add_exact(
1165 seccomp,
1166 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1167 SCMP_SYS(socket),
1168 1,
1169 SCMP_A0(SCMP_CMP_GT, last));
1170 if (r < 0) {
1171 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1172 continue;
1173 }
1174
1175 /* Block everything between the first and last entry */
1176 for (af = 1; af < af_max(); af++) {
1177
1178 if (set_contains(address_families, INT_TO_PTR(af)))
1179 continue;
1180
1181 r = seccomp_rule_add_exact(
1182 seccomp,
1183 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1184 SCMP_SYS(socket),
1185 1,
1186 SCMP_A0(SCMP_CMP_EQ, af));
1187 if (r < 0)
1188 break;
1189 }
1190
1191 if (r < 0) {
1192 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1193 continue;
1194 }
1195 }
1196
1197 } else {
1198 void *af;
1199
1200 /* If this is a blacklist, then generate one rule for
1201 * each address family that are then combined in OR
1202 * checks. */
1203
1204 SET_FOREACH(af, address_families, i) {
1205
1206 r = seccomp_rule_add_exact(
1207 seccomp,
1208 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1209 SCMP_SYS(socket),
1210 1,
1211 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1212 if (r < 0)
1213 break;
1214 }
1215
1216 if (r < 0) {
1217 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1218 continue;
1219 }
1220 }
1221
1222 r = seccomp_load(seccomp);
1223 if (IN_SET(r, -EPERM, -EACCES))
1224 return r;
1225 if (r < 0)
1226 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1227 }
1228
1229 return 0;
1230 }
1231
1232 int seccomp_restrict_realtime(void) {
1233 static const int permitted_policies[] = {
1234 SCHED_OTHER,
1235 SCHED_BATCH,
1236 SCHED_IDLE,
1237 };
1238
1239 int r, max_policy = 0;
1240 uint32_t arch;
1241 unsigned i;
1242
1243 /* Determine the highest policy constant we want to allow */
1244 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1245 if (permitted_policies[i] > max_policy)
1246 max_policy = permitted_policies[i];
1247
1248 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1249 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1250 int p;
1251
1252 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1253
1254 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1255 if (r < 0)
1256 return r;
1257
1258 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1259 * whitelist. */
1260 for (p = 0; p < max_policy; p++) {
1261 bool good = false;
1262
1263 /* Check if this is in the whitelist. */
1264 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1265 if (permitted_policies[i] == p) {
1266 good = true;
1267 break;
1268 }
1269
1270 if (good)
1271 continue;
1272
1273 /* Deny this policy */
1274 r = seccomp_rule_add_exact(
1275 seccomp,
1276 SCMP_ACT_ERRNO(EPERM),
1277 SCMP_SYS(sched_setscheduler),
1278 1,
1279 SCMP_A1(SCMP_CMP_EQ, p));
1280 if (r < 0) {
1281 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1282 continue;
1283 }
1284 }
1285
1286 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1287 * unsigned here, hence no need no check for < 0 values. */
1288 r = seccomp_rule_add_exact(
1289 seccomp,
1290 SCMP_ACT_ERRNO(EPERM),
1291 SCMP_SYS(sched_setscheduler),
1292 1,
1293 SCMP_A1(SCMP_CMP_GT, max_policy));
1294 if (r < 0) {
1295 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1296 continue;
1297 }
1298
1299 r = seccomp_load(seccomp);
1300 if (IN_SET(r, -EPERM, -EACCES))
1301 return r;
1302 if (r < 0)
1303 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1304 }
1305
1306 return 0;
1307 }
1308
1309 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1310 uint32_t arch,
1311 int nr,
1312 unsigned int arg_cnt,
1313 const struct scmp_arg_cmp arg) {
1314 int r;
1315
1316 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1317 if (r < 0) {
1318 _cleanup_free_ char *n = NULL;
1319
1320 n = seccomp_syscall_resolve_num_arch(arch, nr);
1321 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1322 strna(n),
1323 seccomp_arch_to_string(arch));
1324 }
1325
1326 return r;
1327 }
1328
1329 /* For known architectures, check that syscalls are indeed defined or not. */
1330 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1331 assert_cc(SCMP_SYS(shmget) > 0);
1332 assert_cc(SCMP_SYS(shmat) > 0);
1333 assert_cc(SCMP_SYS(shmdt) > 0);
1334 #elif defined(__i386__) || defined(__powerpc64__)
1335 assert_cc(SCMP_SYS(shmget) < 0);
1336 assert_cc(SCMP_SYS(shmat) < 0);
1337 assert_cc(SCMP_SYS(shmdt) < 0);
1338 #endif
1339
1340 int seccomp_memory_deny_write_execute(void) {
1341
1342 uint32_t arch;
1343 int r;
1344
1345 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1346 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1347 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1348
1349 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1350
1351 switch (arch) {
1352
1353 case SCMP_ARCH_X86:
1354 filter_syscall = SCMP_SYS(mmap2);
1355 block_syscall = SCMP_SYS(mmap);
1356 break;
1357
1358 case SCMP_ARCH_PPC64:
1359 case SCMP_ARCH_PPC64LE:
1360 filter_syscall = SCMP_SYS(mmap);
1361
1362 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1363 * We ignore that here, which means there's still a way to get writable/executable
1364 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1365
1366 break;
1367
1368 case SCMP_ARCH_ARM:
1369 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1370 shmat_syscall = SCMP_SYS(shmat);
1371 break;
1372
1373 case SCMP_ARCH_X86_64:
1374 case SCMP_ARCH_X32:
1375 case SCMP_ARCH_AARCH64:
1376 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1377 shmat_syscall = SCMP_SYS(shmat);
1378 break;
1379
1380 /* Please add more definitions here, if you port systemd to other architectures! */
1381
1382 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1383 #warning "Consider adding the right mmap() syscall definitions here!"
1384 #endif
1385 }
1386
1387 /* Can't filter mmap() on this arch, then skip it */
1388 if (filter_syscall == 0)
1389 continue;
1390
1391 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1392 if (r < 0)
1393 return r;
1394
1395 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1396 1,
1397 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1398 if (r < 0)
1399 continue;
1400
1401 if (block_syscall != 0) {
1402 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1403 if (r < 0)
1404 continue;
1405 }
1406
1407 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1408 1,
1409 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1410 if (r < 0)
1411 continue;
1412
1413 if (shmat_syscall != 0) {
1414 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1415 1,
1416 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1417 if (r < 0)
1418 continue;
1419 }
1420
1421 r = seccomp_load(seccomp);
1422 if (IN_SET(r, -EPERM, -EACCES))
1423 return r;
1424 if (r < 0)
1425 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1426 }
1427
1428 return 0;
1429 }
1430
1431 int seccomp_restrict_archs(Set *archs) {
1432 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1433 Iterator i;
1434 void *id;
1435 int r;
1436
1437 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1438 * list. */
1439
1440 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1441 if (!seccomp)
1442 return -ENOMEM;
1443
1444 SET_FOREACH(id, archs, i) {
1445 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1446 if (r == -EEXIST)
1447 continue;
1448 if (r < 0)
1449 return r;
1450 }
1451
1452 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1453 if (r < 0)
1454 return r;
1455
1456 return seccomp_load(seccomp);
1457 }
1458
1459 int parse_syscall_archs(char **l, Set **archs) {
1460 _cleanup_set_free_ Set *_archs;
1461 char **s;
1462 int r;
1463
1464 assert(l);
1465 assert(archs);
1466
1467 r = set_ensure_allocated(&_archs, NULL);
1468 if (r < 0)
1469 return r;
1470
1471 STRV_FOREACH(s, l) {
1472 uint32_t a;
1473
1474 r = seccomp_arch_from_string(*s, &a);
1475 if (r < 0)
1476 return -EINVAL;
1477
1478 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1479 if (r < 0)
1480 return -ENOMEM;
1481 }
1482
1483 *archs = _archs;
1484 _archs = NULL;
1485
1486 return 0;
1487 }
1488
1489 int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1490 const char *i;
1491 int r;
1492
1493 assert(set);
1494
1495 NULSTR_FOREACH(i, set->value) {
1496
1497 if (i[0] == '@') {
1498 const SyscallFilterSet *more;
1499
1500 more = syscall_filter_set_find(i);
1501 if (!more)
1502 return -ENXIO;
1503
1504
1505 r = seccomp_filter_set_add(filter, add, more);
1506 if (r < 0)
1507 return r;
1508 } else {
1509 int id;
1510
1511 id = seccomp_syscall_resolve_name(i);
1512 if (id == __NR_SCMP_ERROR)
1513 return -ENXIO;
1514
1515 if (add) {
1516 r = set_put(filter, INT_TO_PTR(id + 1));
1517 if (r < 0)
1518 return r;
1519 } else
1520 (void) set_remove(filter, INT_TO_PTR(id + 1));
1521 }
1522 }
1523
1524 return 0;
1525 }
1526
1527 int seccomp_lock_personality(unsigned long personality) {
1528 uint32_t arch;
1529 int r;
1530
1531 if (personality >= PERSONALITY_INVALID)
1532 return -EINVAL;
1533
1534 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1535 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1536
1537 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1538 if (r < 0)
1539 return r;
1540
1541 r = seccomp_rule_add_exact(
1542 seccomp,
1543 SCMP_ACT_ERRNO(EPERM),
1544 SCMP_SYS(personality),
1545 1,
1546 SCMP_A0(SCMP_CMP_NE, personality));
1547 if (r < 0)
1548 return r;
1549
1550 r = seccomp_load(seccomp);
1551 if (IN_SET(r, -EPERM, -EACCES))
1552 return r;
1553 if (r < 0)
1554 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1555 }
1556
1557 return 0;
1558 }