]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/seccomp-util.c
seccomp: add three more seccomp groups
[thirdparty/systemd.git] / src / shared / seccomp-util.c
1 /***
2 This file is part of systemd.
3
4 Copyright 2014 Lennart Poettering
5
6 systemd is free software; you can redistribute it and/or modify it
7 under the terms of the GNU Lesser General Public License as published by
8 the Free Software Foundation; either version 2.1 of the License, or
9 (at your option) any later version.
10
11 systemd is distributed in the hope that it will be useful, but
12 WITHOUT ANY WARRANTY; without even the implied warranty of
13 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 Lesser General Public License for more details.
15
16 You should have received a copy of the GNU Lesser General Public License
17 along with systemd; If not, see <http://www.gnu.org/licenses/>.
18 ***/
19
20 #include <errno.h>
21 #include <linux/seccomp.h>
22 #include <seccomp.h>
23 #include <stddef.h>
24 #include <sys/mman.h>
25 #include <sys/prctl.h>
26 #include <sys/shm.h>
27
28 #include "af-list.h"
29 #include "alloc-util.h"
30 #include "macro.h"
31 #include "nsflags.h"
32 #include "process-util.h"
33 #include "seccomp-util.h"
34 #include "set.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "util.h"
38 #include "errno-list.h"
39
40 const uint32_t seccomp_local_archs[] = {
41
42 /* Note: always list the native arch we are compiled as last, so that users can blacklist seccomp(), but our own calls to it still succeed */
43
44 #if defined(__x86_64__) && defined(__ILP32__)
45 SCMP_ARCH_X86,
46 SCMP_ARCH_X86_64,
47 SCMP_ARCH_X32, /* native */
48 #elif defined(__x86_64__) && !defined(__ILP32__)
49 SCMP_ARCH_X86,
50 SCMP_ARCH_X32,
51 SCMP_ARCH_X86_64, /* native */
52 #elif defined(__i386__)
53 SCMP_ARCH_X86,
54 #elif defined(__aarch64__)
55 SCMP_ARCH_ARM,
56 SCMP_ARCH_AARCH64, /* native */
57 #elif defined(__arm__)
58 SCMP_ARCH_ARM,
59 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
60 SCMP_ARCH_MIPSEL,
61 SCMP_ARCH_MIPS, /* native */
62 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI32
63 SCMP_ARCH_MIPS,
64 SCMP_ARCH_MIPSEL, /* native */
65 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
66 SCMP_ARCH_MIPSEL,
67 SCMP_ARCH_MIPS,
68 SCMP_ARCH_MIPSEL64N32,
69 SCMP_ARCH_MIPS64N32,
70 SCMP_ARCH_MIPSEL64,
71 SCMP_ARCH_MIPS64, /* native */
72 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_ABI64
73 SCMP_ARCH_MIPS,
74 SCMP_ARCH_MIPSEL,
75 SCMP_ARCH_MIPS64N32,
76 SCMP_ARCH_MIPSEL64N32,
77 SCMP_ARCH_MIPS64,
78 SCMP_ARCH_MIPSEL64, /* native */
79 #elif defined(__mips__) && __BYTE_ORDER == __BIG_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
80 SCMP_ARCH_MIPSEL,
81 SCMP_ARCH_MIPS,
82 SCMP_ARCH_MIPSEL64,
83 SCMP_ARCH_MIPS64,
84 SCMP_ARCH_MIPSEL64N32,
85 SCMP_ARCH_MIPS64N32, /* native */
86 #elif defined(__mips__) && __BYTE_ORDER == __LITTLE_ENDIAN && _MIPS_SIM == _MIPS_SIM_NABI32
87 SCMP_ARCH_MIPS,
88 SCMP_ARCH_MIPSEL,
89 SCMP_ARCH_MIPS64,
90 SCMP_ARCH_MIPSEL64,
91 SCMP_ARCH_MIPS64N32,
92 SCMP_ARCH_MIPSEL64N32, /* native */
93 #elif defined(__powerpc64__) && __BYTE_ORDER == __BIG_ENDIAN
94 SCMP_ARCH_PPC,
95 SCMP_ARCH_PPC64LE,
96 SCMP_ARCH_PPC64, /* native */
97 #elif defined(__powerpc64__) && __BYTE_ORDER == __LITTLE_ENDIAN
98 SCMP_ARCH_PPC,
99 SCMP_ARCH_PPC64,
100 SCMP_ARCH_PPC64LE, /* native */
101 #elif defined(__powerpc__)
102 SCMP_ARCH_PPC,
103 #elif defined(__s390x__)
104 SCMP_ARCH_S390,
105 SCMP_ARCH_S390X, /* native */
106 #elif defined(__s390__)
107 SCMP_ARCH_S390,
108 #endif
109 (uint32_t) -1
110 };
111
112 const char* seccomp_arch_to_string(uint32_t c) {
113 /* Maintain order used in <seccomp.h>.
114 *
115 * Names used here should be the same as those used for ConditionArchitecture=,
116 * except for "subarchitectures" like x32. */
117
118 switch(c) {
119 case SCMP_ARCH_NATIVE:
120 return "native";
121 case SCMP_ARCH_X86:
122 return "x86";
123 case SCMP_ARCH_X86_64:
124 return "x86-64";
125 case SCMP_ARCH_X32:
126 return "x32";
127 case SCMP_ARCH_ARM:
128 return "arm";
129 case SCMP_ARCH_AARCH64:
130 return "arm64";
131 case SCMP_ARCH_MIPS:
132 return "mips";
133 case SCMP_ARCH_MIPS64:
134 return "mips64";
135 case SCMP_ARCH_MIPS64N32:
136 return "mips64-n32";
137 case SCMP_ARCH_MIPSEL:
138 return "mips-le";
139 case SCMP_ARCH_MIPSEL64:
140 return "mips64-le";
141 case SCMP_ARCH_MIPSEL64N32:
142 return "mips64-le-n32";
143 case SCMP_ARCH_PPC:
144 return "ppc";
145 case SCMP_ARCH_PPC64:
146 return "ppc64";
147 case SCMP_ARCH_PPC64LE:
148 return "ppc64-le";
149 case SCMP_ARCH_S390:
150 return "s390";
151 case SCMP_ARCH_S390X:
152 return "s390x";
153 default:
154 return NULL;
155 }
156 }
157
158 int seccomp_arch_from_string(const char *n, uint32_t *ret) {
159 if (!n)
160 return -EINVAL;
161
162 assert(ret);
163
164 if (streq(n, "native"))
165 *ret = SCMP_ARCH_NATIVE;
166 else if (streq(n, "x86"))
167 *ret = SCMP_ARCH_X86;
168 else if (streq(n, "x86-64"))
169 *ret = SCMP_ARCH_X86_64;
170 else if (streq(n, "x32"))
171 *ret = SCMP_ARCH_X32;
172 else if (streq(n, "arm"))
173 *ret = SCMP_ARCH_ARM;
174 else if (streq(n, "arm64"))
175 *ret = SCMP_ARCH_AARCH64;
176 else if (streq(n, "mips"))
177 *ret = SCMP_ARCH_MIPS;
178 else if (streq(n, "mips64"))
179 *ret = SCMP_ARCH_MIPS64;
180 else if (streq(n, "mips64-n32"))
181 *ret = SCMP_ARCH_MIPS64N32;
182 else if (streq(n, "mips-le"))
183 *ret = SCMP_ARCH_MIPSEL;
184 else if (streq(n, "mips64-le"))
185 *ret = SCMP_ARCH_MIPSEL64;
186 else if (streq(n, "mips64-le-n32"))
187 *ret = SCMP_ARCH_MIPSEL64N32;
188 else if (streq(n, "ppc"))
189 *ret = SCMP_ARCH_PPC;
190 else if (streq(n, "ppc64"))
191 *ret = SCMP_ARCH_PPC64;
192 else if (streq(n, "ppc64-le"))
193 *ret = SCMP_ARCH_PPC64LE;
194 else if (streq(n, "s390"))
195 *ret = SCMP_ARCH_S390;
196 else if (streq(n, "s390x"))
197 *ret = SCMP_ARCH_S390X;
198 else
199 return -EINVAL;
200
201 return 0;
202 }
203
204 int seccomp_init_for_arch(scmp_filter_ctx *ret, uint32_t arch, uint32_t default_action) {
205 scmp_filter_ctx seccomp;
206 int r;
207
208 /* Much like seccomp_init(), but initializes the filter for one specific architecture only, without affecting
209 * any others. Also, turns off the NNP fiddling. */
210
211 seccomp = seccomp_init(default_action);
212 if (!seccomp)
213 return -ENOMEM;
214
215 if (arch != SCMP_ARCH_NATIVE &&
216 arch != seccomp_arch_native()) {
217
218 r = seccomp_arch_remove(seccomp, seccomp_arch_native());
219 if (r < 0)
220 goto finish;
221
222 r = seccomp_arch_add(seccomp, arch);
223 if (r < 0)
224 goto finish;
225
226 assert(seccomp_arch_exist(seccomp, arch) >= 0);
227 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) == -EEXIST);
228 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) == -EEXIST);
229 } else {
230 assert(seccomp_arch_exist(seccomp, SCMP_ARCH_NATIVE) >= 0);
231 assert(seccomp_arch_exist(seccomp, seccomp_arch_native()) >= 0);
232 }
233
234 r = seccomp_attr_set(seccomp, SCMP_FLTATR_ACT_BADARCH, SCMP_ACT_ALLOW);
235 if (r < 0)
236 goto finish;
237
238 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
239 if (r < 0)
240 goto finish;
241
242 *ret = seccomp;
243 return 0;
244
245 finish:
246 seccomp_release(seccomp);
247 return r;
248 }
249
/* Returns true if the PR_GET_SECCOMP prctl() succeeds, i.e. the seccomp mode of the calling thread can be
 * queried. */
static bool is_basic_seccomp_available(void) {
        int mode;

        mode = prctl(PR_GET_SECCOMP, 0, 0, 0, 0);
        return mode >= 0;
}
253
/* Probes for filter mode by asking the kernel to install a NULL filter program: this call is expected to
 * fail, and only a failure with EFAULT is taken as a sign that SECCOMP_MODE_FILTER is available. */
static bool is_seccomp_filter_available(void) {
        if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, NULL, 0, 0) >= 0)
                return false;

        return errno == EFAULT;
}
258
/* Returns whether both the basic seccomp prctl() and the filter mode probe succeed. The probes run only
 * on the first call; the result is remembered in a function-local static for later calls. */
bool is_seccomp_available(void) {
        static int cached = -1;

        if (cached >= 0)
                return cached;

        cached = is_basic_seccomp_available() && is_seccomp_filter_available();
        return cached;
}
269
270 const SyscallFilterSet syscall_filter_sets[_SYSCALL_FILTER_SET_MAX] = {
271 [SYSCALL_FILTER_SET_DEFAULT] = {
272 .name = "@default",
273 .help = "System calls that are always permitted",
274 .value =
275 "clock_getres\0"
276 "clock_gettime\0"
277 "clock_nanosleep\0"
278 "execve\0"
279 "exit\0"
280 "exit_group\0"
281 "futex\0"
282 "get_robust_list\0"
283 "get_thread_area\0"
284 "getegid\0"
285 "getegid32\0"
286 "geteuid\0"
287 "geteuid32\0"
288 "getgid\0"
289 "getgid32\0"
290 "getgroups\0"
291 "getgroups32\0"
292 "getpgid\0"
293 "getpgrp\0"
294 "getpid\0"
295 "getppid\0"
296 "getresgid\0"
297 "getresgid32\0"
298 "getresuid\0"
299 "getresuid32\0"
300 "getrlimit\0" /* make sure processes can query stack size and such */
301 "getsid\0"
302 "gettid\0"
303 "gettimeofday\0"
304 "getuid\0"
305 "getuid32\0"
306 "membarrier\0"
307 "nanosleep\0"
308 "pause\0"
309 "prlimit64\0"
310 "restart_syscall\0"
311 "rt_sigreturn\0"
312 "sched_yield\0"
313 "set_robust_list\0"
314 "set_thread_area\0"
315 "set_tid_address\0"
316 "sigreturn\0"
317 "time\0"
318 "ugetrlimit\0"
319 },
320 [SYSCALL_FILTER_SET_AIO] = {
321 .name = "@aio",
322 .help = "Asynchronous IO",
323 .value =
324 "io_cancel\0"
325 "io_destroy\0"
326 "io_getevents\0"
327 "io_setup\0"
328 "io_submit\0"
329 },
330 [SYSCALL_FILTER_SET_BASIC_IO] = {
331 .name = "@basic-io",
332 .help = "Basic IO",
333 .value =
334 "_llseek\0"
335 "close\0"
336 "dup\0"
337 "dup2\0"
338 "dup3\0"
339 "lseek\0"
340 "pread64\0"
341 "preadv\0"
342 "preadv2\0"
343 "pwrite64\0"
344 "pwritev\0"
345 "pwritev2\0"
346 "read\0"
347 "readv\0"
348 "write\0"
349 "writev\0"
350 },
351 [SYSCALL_FILTER_SET_CHOWN] = {
352 .name = "@chown",
353 .help = "Change ownership of files and directories",
354 .value =
355 "chown\0"
356 "chown32\0"
357 "fchown\0"
358 "fchown32\0"
359 "fchownat\0"
360 "lchown\0"
361 "lchown32\0"
362 },
363 [SYSCALL_FILTER_SET_CLOCK] = {
364 .name = "@clock",
365 .help = "Change the system time",
366 .value =
367 "adjtimex\0"
368 "clock_adjtime\0"
369 "clock_settime\0"
370 "settimeofday\0"
371 "stime\0"
372 },
373 [SYSCALL_FILTER_SET_CPU_EMULATION] = {
374 .name = "@cpu-emulation",
375 .help = "System calls for CPU emulation functionality",
376 .value =
377 "modify_ldt\0"
378 "subpage_prot\0"
379 "switch_endian\0"
380 "vm86\0"
381 "vm86old\0"
382 },
383 [SYSCALL_FILTER_SET_DEBUG] = {
384 .name = "@debug",
385 .help = "Debugging, performance monitoring and tracing functionality",
386 .value =
387 "lookup_dcookie\0"
388 "perf_event_open\0"
389 "process_vm_readv\0"
390 "process_vm_writev\0"
391 "ptrace\0"
392 "rtas\0"
393 #ifdef __NR_s390_runtime_instr
394 "s390_runtime_instr\0"
395 #endif
396 "sys_debug_setcontext\0"
397 },
398 [SYSCALL_FILTER_SET_FILE_SYSTEM] = {
399 .name = "@file-system",
400 .help = "File system operations",
401 .value =
402 "access\0"
403 "chdir\0"
404 "chmod\0"
405 "close\0"
406 "creat\0"
407 "faccessat\0"
408 "fallocate\0"
409 "fchdir\0"
410 "fchmod\0"
411 "fchmodat\0"
412 "fcntl\0"
413 "fcntl64\0"
414 "fgetxattr\0"
415 "flistxattr\0"
416 "fremovexattr\0"
417 "fsetxattr\0"
418 "fstat\0"
419 "fstat64\0"
420 "fstatat64\0"
421 "fstatfs\0"
422 "fstatfs64\0"
423 "ftruncate\0"
424 "ftruncate64\0"
425 "futimesat\0"
426 "getcwd\0"
427 "getdents\0"
428 "getdents64\0"
429 "getxattr\0"
430 "inotify_add_watch\0"
431 "inotify_init\0"
432 "inotify_init1\0"
433 "inotify_rm_watch\0"
434 "lgetxattr\0"
435 "link\0"
436 "linkat\0"
437 "listxattr\0"
438 "llistxattr\0"
439 "lremovexattr\0"
440 "lsetxattr\0"
441 "lstat\0"
442 "lstat64\0"
443 "mkdir\0"
444 "mkdirat\0"
445 "mknod\0"
446 "mknodat\0"
447 "mmap\0"
448 "mmap2\0"
449 "munmap\0"
450 "newfstatat\0"
451 "oldfstat\0"
452 "oldlstat\0"
453 "oldstat\0"
454 "open\0"
455 "openat\0"
456 "readlink\0"
457 "readlinkat\0"
458 "removexattr\0"
459 "rename\0"
460 "renameat\0"
461 "renameat2\0"
462 "rmdir\0"
463 "setxattr\0"
464 "stat\0"
465 "stat64\0"
466 "statfs\0"
467 "statfs64\0"
468 #ifdef __PNR_statx
469 "statx\0"
470 #endif
471 "symlink\0"
472 "symlinkat\0"
473 "truncate\0"
474 "truncate64\0"
475 "unlink\0"
476 "unlinkat\0"
477 "utime\0"
478 "utimensat\0"
479 "utimes\0"
480 },
481 [SYSCALL_FILTER_SET_IO_EVENT] = {
482 .name = "@io-event",
483 .help = "Event loop system calls",
484 .value =
485 "_newselect\0"
486 "epoll_create\0"
487 "epoll_create1\0"
488 "epoll_ctl\0"
489 "epoll_ctl_old\0"
490 "epoll_pwait\0"
491 "epoll_wait\0"
492 "epoll_wait_old\0"
493 "eventfd\0"
494 "eventfd2\0"
495 "poll\0"
496 "ppoll\0"
497 "pselect6\0"
498 "select\0"
499 },
500 [SYSCALL_FILTER_SET_IPC] = {
501 .name = "@ipc",
502 .help = "SysV IPC, POSIX Message Queues or other IPC",
503 .value =
504 "ipc\0"
505 "memfd_create\0"
506 "mq_getsetattr\0"
507 "mq_notify\0"
508 "mq_open\0"
509 "mq_timedreceive\0"
510 "mq_timedsend\0"
511 "mq_unlink\0"
512 "msgctl\0"
513 "msgget\0"
514 "msgrcv\0"
515 "msgsnd\0"
516 "pipe\0"
517 "pipe2\0"
518 "process_vm_readv\0"
519 "process_vm_writev\0"
520 "semctl\0"
521 "semget\0"
522 "semop\0"
523 "semtimedop\0"
524 "shmat\0"
525 "shmctl\0"
526 "shmdt\0"
527 "shmget\0"
528 },
529 [SYSCALL_FILTER_SET_KEYRING] = {
530 .name = "@keyring",
531 .help = "Kernel keyring access",
532 .value =
533 "add_key\0"
534 "keyctl\0"
535 "request_key\0"
536 },
537 [SYSCALL_FILTER_SET_MEMLOCK] = {
538 .name = "@memlock",
539 .help = "Memory locking control",
540 .value =
541 "mlock\0"
542 "mlock2\0"
543 "mlockall\0"
544 "munlock\0"
545 "munlockall\0"
546 },
547 [SYSCALL_FILTER_SET_MODULE] = {
548 .name = "@module",
549 .help = "Loading and unloading of kernel modules",
550 .value =
551 "delete_module\0"
552 "finit_module\0"
553 "init_module\0"
554 },
555 [SYSCALL_FILTER_SET_MOUNT] = {
556 .name = "@mount",
557 .help = "Mounting and unmounting of file systems",
558 .value =
559 "chroot\0"
560 "mount\0"
561 "pivot_root\0"
562 "umount\0"
563 "umount2\0"
564 },
565 [SYSCALL_FILTER_SET_NETWORK_IO] = {
566 .name = "@network-io",
567 .help = "Network or Unix socket IO, should not be needed if not network facing",
568 .value =
569 "accept\0"
570 "accept4\0"
571 "bind\0"
572 "connect\0"
573 "getpeername\0"
574 "getsockname\0"
575 "getsockopt\0"
576 "listen\0"
577 "recv\0"
578 "recvfrom\0"
579 "recvmmsg\0"
580 "recvmsg\0"
581 "send\0"
582 "sendmmsg\0"
583 "sendmsg\0"
584 "sendto\0"
585 "setsockopt\0"
586 "shutdown\0"
587 "socket\0"
588 "socketcall\0"
589 "socketpair\0"
590 },
591 [SYSCALL_FILTER_SET_OBSOLETE] = {
592 /* some unknown even to libseccomp */
593 .name = "@obsolete",
594 .help = "Unusual, obsolete or unimplemented system calls",
595 .value =
596 "_sysctl\0"
597 "afs_syscall\0"
598 "bdflush\0"
599 "break\0"
600 "create_module\0"
601 "ftime\0"
602 "get_kernel_syms\0"
603 "getpmsg\0"
604 "gtty\0"
605 "idle\0"
606 "lock\0"
607 "mpx\0"
608 "prof\0"
609 "profil\0"
610 "putpmsg\0"
611 "query_module\0"
612 "security\0"
613 "sgetmask\0"
614 "ssetmask\0"
615 "stty\0"
616 "sysfs\0"
617 "tuxcall\0"
618 "ulimit\0"
619 "uselib\0"
620 "ustat\0"
621 "vserver\0"
622 },
623 [SYSCALL_FILTER_SET_PRIVILEGED] = {
624 .name = "@privileged",
625 .help = "All system calls which need super-user capabilities",
626 .value =
627 "@chown\0"
628 "@clock\0"
629 "@module\0"
630 "@raw-io\0"
631 "_sysctl\0"
632 "acct\0"
633 "bpf\0"
634 "capset\0"
635 "chroot\0"
636 "kexec_file_load\0"
637 "kexec_load\0"
638 "nfsservctl\0"
639 "pivot_root\0"
640 "quotactl\0"
641 "reboot\0"
642 "setdomainname\0"
643 "setfsuid\0"
644 "setfsuid32\0"
645 "setgroups\0"
646 "setgroups32\0"
647 "sethostname\0"
648 "setresuid\0"
649 "setresuid32\0"
650 "setreuid\0"
651 "setreuid32\0"
652 "setuid\0"
653 "setuid32\0"
654 "swapoff\0"
655 "swapon\0"
656 "vhangup\0"
657 },
658 [SYSCALL_FILTER_SET_PROCESS] = {
659 .name = "@process",
660 .help = "Process control, execution, namespaceing operations",
661 .value =
662 "arch_prctl\0"
663 "capget\0" /* Able to query arbitrary processes */
664 "clone\0"
665 "execveat\0"
666 "fork\0"
667 "getrusage\0"
668 "kill\0"
669 "prctl\0"
670 "rt_sigqueueinfo\0"
671 "rt_tgsigqueueinfo\0"
672 "setns\0"
673 "tgkill\0"
674 "times\0"
675 "tkill\0"
676 "unshare\0"
677 "vfork\0"
678 "wait4\0"
679 "waitid\0"
680 "waitpid\0"
681 },
682 [SYSCALL_FILTER_SET_RAW_IO] = {
683 .name = "@raw-io",
684 .help = "Raw I/O port access",
685 .value =
686 "ioperm\0"
687 "iopl\0"
688 "pciconfig_iobase\0"
689 "pciconfig_read\0"
690 "pciconfig_write\0"
691 #ifdef __NR_s390_pci_mmio_read
692 "s390_pci_mmio_read\0"
693 #endif
694 #ifdef __NR_s390_pci_mmio_write
695 "s390_pci_mmio_write\0"
696 #endif
697 },
698 [SYSCALL_FILTER_SET_REBOOT] = {
699 .name = "@reboot",
700 .help = "Reboot and reboot preparation/kexec",
701 .value =
702 "kexec\0"
703 "kexec_file_load\0"
704 "reboot\0"
705 },
706 [SYSCALL_FILTER_SET_RESOURCES] = {
707 .name = "@resources",
708 .help = "Alter resource settings",
709 .value =
710 "ioprio_set\0"
711 "mbind\0"
712 "migrate_pages\0"
713 "move_pages\0"
714 "nice\0"
715 "sched_setaffinity\0"
716 "sched_setattr\0"
717 "sched_setparam\0"
718 "sched_setscheduler\0"
719 "set_mempolicy\0"
720 "setpriority\0"
721 "setrlimit\0"
722 },
723 [SYSCALL_FILTER_SET_SETUID] = {
724 .name = "@setuid",
725 .help = "Operations for changing user/group credentials",
726 .value =
727 "setgid\0"
728 "setgid32\0"
729 "setgroups\0"
730 "setgroups32\0"
731 "setregid\0"
732 "setregid32\0"
733 "setresgid\0"
734 "setresgid32\0"
735 "setresuid\0"
736 "setresuid32\0"
737 "setreuid\0"
738 "setreuid32\0"
739 "setuid\0"
740 "setuid32\0"
741 },
742 [SYSCALL_FILTER_SET_SIGNAL] = {
743 .name = "@signal",
744 .help = "Process signal handling",
745 .value =
746 "rt_sigaction\0"
747 "rt_sigpending\0"
748 "rt_sigprocmask\0"
749 "rt_sigsuspend\0"
750 "rt_sigtimedwait\0"
751 "sigaction\0"
752 "sigaltstack\0"
753 "signal\0"
754 "signalfd\0"
755 "signalfd4\0"
756 "sigpending\0"
757 "sigprocmask\0"
758 "sigsuspend\0"
759 },
760 [SYSCALL_FILTER_SET_SWAP] = {
761 .name = "@swap",
762 .help = "Enable/disable swap devices",
763 .value =
764 "swapoff\0"
765 "swapon\0"
766 },
767 [SYSCALL_FILTER_SET_SYNC] = {
768 .name = "@sync",
769 .help = "Synchronize files and memory to storage",
770 .value =
771 "fdatasync\0"
772 "fsync\0"
773 "msync\0"
774 "sync\0"
775 "sync_file_range\0"
776 "syncfs\0"
777 },
778 [SYSCALL_FILTER_SET_TIMER] = {
779 .name = "@timer",
780 .help = "Schedule operations by time",
781 .value =
782 "alarm\0"
783 "getitimer\0"
784 "setitimer\0"
785 "timer_create\0"
786 "timer_delete\0"
787 "timer_getoverrun\0"
788 "timer_gettime\0"
789 "timer_settime\0"
790 "timerfd_create\0"
791 "timerfd_gettime\0"
792 "timerfd_settime\0"
793 "times\0"
794 },
795 };
796
797 const SyscallFilterSet *syscall_filter_set_find(const char *name) {
798 unsigned i;
799
800 if (isempty(name) || name[0] != '@')
801 return NULL;
802
803 for (i = 0; i < _SYSCALL_FILTER_SET_MAX; i++)
804 if (streq(syscall_filter_sets[i].name, name))
805 return syscall_filter_sets + i;
806
807 return NULL;
808 }
809
810 static int seccomp_add_syscall_filter_set(scmp_filter_ctx seccomp, const SyscallFilterSet *set, uint32_t action, char **exclude);
811
812 int seccomp_add_syscall_filter_item(scmp_filter_ctx *seccomp, const char *name, uint32_t action, char **exclude) {
813 int r;
814
815 assert(seccomp);
816 assert(name);
817
818 if (strv_contains(exclude, name))
819 return 0;
820
821 if (name[0] == '@') {
822 const SyscallFilterSet *other;
823
824 other = syscall_filter_set_find(name);
825 if (!other) {
826 log_debug("Filter set %s is not known!", name);
827 return -EINVAL;
828 }
829
830 r = seccomp_add_syscall_filter_set(seccomp, other, action, exclude);
831 if (r < 0)
832 return r;
833 } else {
834 int id;
835
836 id = seccomp_syscall_resolve_name(name);
837 if (id == __NR_SCMP_ERROR) {
838 log_debug("System call %s is not known, ignoring.", name);
839 return 0;
840 }
841
842 r = seccomp_rule_add_exact(seccomp, action, id, 0);
843 if (r < 0)
844 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
845 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", name, id);
846 }
847
848 return 0;
849 }
850
851 static int seccomp_add_syscall_filter_set(
852 scmp_filter_ctx seccomp,
853 const SyscallFilterSet *set,
854 uint32_t action,
855 char **exclude) {
856
857 const char *sys;
858 int r;
859
860 assert(seccomp);
861 assert(set);
862
863 NULSTR_FOREACH(sys, set->value) {
864 r = seccomp_add_syscall_filter_item(seccomp, sys, action, exclude);
865 if (r < 0)
866 return r;
867 }
868
869 return 0;
870 }
871
872 int seccomp_load_syscall_filter_set(uint32_t default_action, const SyscallFilterSet *set, uint32_t action) {
873 uint32_t arch;
874 int r;
875
876 assert(set);
877
878 /* The one-stop solution: allocate a seccomp object, add the specified filter to it, and apply it. Once for
879 * earch local arch. */
880
881 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
882 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
883
884 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
885
886 r = seccomp_init_for_arch(&seccomp, arch, default_action);
887 if (r < 0)
888 return r;
889
890 r = seccomp_add_syscall_filter_set(seccomp, set, action, NULL);
891 if (r < 0) {
892 log_debug_errno(r, "Failed to add filter set, ignoring: %m");
893 continue;
894 }
895
896 r = seccomp_load(seccomp);
897 if (IN_SET(r, -EPERM, -EACCES))
898 return r;
899 if (r < 0)
900 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
901 }
902
903 return 0;
904 }
905
906 int seccomp_load_syscall_filter_set_raw(uint32_t default_action, Set* set, uint32_t action) {
907 uint32_t arch;
908 int r;
909
910 /* Similar to seccomp_load_syscall_filter_set(), but takes a raw Set* of syscalls, instead of a
911 * SyscallFilterSet* table. */
912
913 if (set_isempty(set) && default_action == SCMP_ACT_ALLOW)
914 return 0;
915
916 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
917 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
918 Iterator i;
919 void *id;
920
921 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
922
923 r = seccomp_init_for_arch(&seccomp, arch, default_action);
924 if (r < 0)
925 return r;
926
927 SET_FOREACH(id, set, i) {
928 r = seccomp_rule_add_exact(seccomp, action, PTR_TO_INT(id) - 1, 0);
929 if (r < 0) {
930 /* If the system call is not known on this architecture, then that's fine, let's ignore it */
931 _cleanup_free_ char *n = NULL;
932
933 n = seccomp_syscall_resolve_num_arch(arch, PTR_TO_INT(id) - 1);
934 log_debug_errno(r, "Failed to add rule for system call %s() / %d, ignoring: %m", strna(n), PTR_TO_INT(id) - 1);
935 }
936 }
937
938 r = seccomp_load(seccomp);
939 if (IN_SET(r, -EPERM, -EACCES))
940 return r;
941 if (r < 0)
942 log_debug_errno(r, "Failed to install filter set for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
943 }
944
945 return 0;
946 }
947
948 int seccomp_restrict_namespaces(unsigned long retain) {
949 uint32_t arch;
950 int r;
951
952 if (log_get_max_level() >= LOG_DEBUG) {
953 _cleanup_free_ char *s = NULL;
954
955 (void) namespace_flag_to_string_many(retain, &s);
956 log_debug("Restricting namespace to: %s.", strna(s));
957 }
958
959 /* NOOP? */
960 if ((retain & NAMESPACE_FLAGS_ALL) == NAMESPACE_FLAGS_ALL)
961 return 0;
962
963 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
964 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
965 unsigned i;
966
967 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
968
969 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
970 if (r < 0)
971 return r;
972
973 if ((retain & NAMESPACE_FLAGS_ALL) == 0)
974 /* If every single kind of namespace shall be prohibited, then let's block the whole setns() syscall
975 * altogether. */
976 r = seccomp_rule_add_exact(
977 seccomp,
978 SCMP_ACT_ERRNO(EPERM),
979 SCMP_SYS(setns),
980 0);
981 else
982 /* Otherwise, block only the invocations with the appropriate flags in the loop below, but also the
983 * special invocation with a zero flags argument, right here. */
984 r = seccomp_rule_add_exact(
985 seccomp,
986 SCMP_ACT_ERRNO(EPERM),
987 SCMP_SYS(setns),
988 1,
989 SCMP_A1(SCMP_CMP_EQ, 0));
990 if (r < 0) {
991 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
992 continue;
993 }
994
995 for (i = 0; namespace_flag_map[i].name; i++) {
996 unsigned long f;
997
998 f = namespace_flag_map[i].flag;
999 if ((retain & f) == f) {
1000 log_debug("Permitting %s.", namespace_flag_map[i].name);
1001 continue;
1002 }
1003
1004 log_debug("Blocking %s.", namespace_flag_map[i].name);
1005
1006 r = seccomp_rule_add_exact(
1007 seccomp,
1008 SCMP_ACT_ERRNO(EPERM),
1009 SCMP_SYS(unshare),
1010 1,
1011 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1012 if (r < 0) {
1013 log_debug_errno(r, "Failed to add unshare() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1014 break;
1015 }
1016
1017 /* On s390/s390x the first two parameters to clone are switched */
1018 if (!IN_SET(arch, SCMP_ARCH_S390, SCMP_ARCH_S390X))
1019 r = seccomp_rule_add_exact(
1020 seccomp,
1021 SCMP_ACT_ERRNO(EPERM),
1022 SCMP_SYS(clone),
1023 1,
1024 SCMP_A0(SCMP_CMP_MASKED_EQ, f, f));
1025 else
1026 r = seccomp_rule_add_exact(
1027 seccomp,
1028 SCMP_ACT_ERRNO(EPERM),
1029 SCMP_SYS(clone),
1030 1,
1031 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1032 if (r < 0) {
1033 log_debug_errno(r, "Failed to add clone() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1034 break;
1035 }
1036
1037 if ((retain & NAMESPACE_FLAGS_ALL) != 0) {
1038 r = seccomp_rule_add_exact(
1039 seccomp,
1040 SCMP_ACT_ERRNO(EPERM),
1041 SCMP_SYS(setns),
1042 1,
1043 SCMP_A1(SCMP_CMP_MASKED_EQ, f, f));
1044 if (r < 0) {
1045 log_debug_errno(r, "Failed to add setns() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1046 break;
1047 }
1048 }
1049 }
1050 if (r < 0)
1051 continue;
1052
1053 r = seccomp_load(seccomp);
1054 if (IN_SET(r, -EPERM, -EACCES))
1055 return r;
1056 if (r < 0)
1057 log_debug_errno(r, "Failed to install namespace restriction rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1058 }
1059
1060 return 0;
1061 }
1062
1063 int seccomp_protect_sysctl(void) {
1064 uint32_t arch;
1065 int r;
1066
1067 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1068 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1069
1070 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1071
1072 if (IN_SET(arch, SCMP_ARCH_X32, SCMP_ARCH_AARCH64))
1073 /* No _sysctl syscall */
1074 continue;
1075
1076 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1077 if (r < 0)
1078 return r;
1079
1080 r = seccomp_rule_add_exact(
1081 seccomp,
1082 SCMP_ACT_ERRNO(EPERM),
1083 SCMP_SYS(_sysctl),
1084 0);
1085 if (r < 0) {
1086 log_debug_errno(r, "Failed to add _sysctl() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1087 continue;
1088 }
1089
1090 r = seccomp_load(seccomp);
1091 if (IN_SET(r, -EPERM, -EACCES))
1092 return r;
1093 if (r < 0)
1094 log_debug_errno(r, "Failed to install sysctl protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1095 }
1096
1097 return 0;
1098 }
1099
1100 int seccomp_restrict_address_families(Set *address_families, bool whitelist) {
1101 uint32_t arch;
1102 int r;
1103
1104 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1105 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1106 bool supported;
1107 Iterator i;
1108
1109 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1110
1111 switch (arch) {
1112
1113 case SCMP_ARCH_X86_64:
1114 case SCMP_ARCH_X32:
1115 case SCMP_ARCH_ARM:
1116 case SCMP_ARCH_AARCH64:
1117 case SCMP_ARCH_PPC64:
1118 case SCMP_ARCH_PPC64LE:
1119 /* These we know we support (i.e. are the ones that do not use socketcall()) */
1120 supported = true;
1121 break;
1122
1123 case SCMP_ARCH_S390:
1124 case SCMP_ARCH_S390X:
1125 case SCMP_ARCH_PPC:
1126 case SCMP_ARCH_X86:
1127 default:
1128 /* These we either know we don't support (i.e. are the ones that do use socketcall()), or we
1129 * don't know */
1130 supported = false;
1131 break;
1132 }
1133
1134 if (!supported)
1135 continue;
1136
1137 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1138 if (r < 0)
1139 return r;
1140
1141 if (whitelist) {
1142 int af, first = 0, last = 0;
1143 void *afp;
1144
1145 /* If this is a whitelist, we first block the address families that are out of range and then
1146 * everything that is not in the set. First, we find the lowest and highest address family in
1147 * the set. */
1148
1149 SET_FOREACH(afp, address_families, i) {
1150 af = PTR_TO_INT(afp);
1151
1152 if (af <= 0 || af >= af_max())
1153 continue;
1154
1155 if (first == 0 || af < first)
1156 first = af;
1157
1158 if (last == 0 || af > last)
1159 last = af;
1160 }
1161
1162 assert((first == 0) == (last == 0));
1163
1164 if (first == 0) {
1165
1166 /* No entries in the valid range, block everything */
1167 r = seccomp_rule_add_exact(
1168 seccomp,
1169 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1170 SCMP_SYS(socket),
1171 0);
1172 if (r < 0) {
1173 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1174 continue;
1175 }
1176
1177 } else {
1178
1179 /* Block everything below the first entry */
1180 r = seccomp_rule_add_exact(
1181 seccomp,
1182 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1183 SCMP_SYS(socket),
1184 1,
1185 SCMP_A0(SCMP_CMP_LT, first));
1186 if (r < 0) {
1187 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1188 continue;
1189 }
1190
1191 /* Block everything above the last entry */
1192 r = seccomp_rule_add_exact(
1193 seccomp,
1194 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1195 SCMP_SYS(socket),
1196 1,
1197 SCMP_A0(SCMP_CMP_GT, last));
1198 if (r < 0) {
1199 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1200 continue;
1201 }
1202
1203 /* Block everything between the first and last entry */
1204 for (af = 1; af < af_max(); af++) {
1205
1206 if (set_contains(address_families, INT_TO_PTR(af)))
1207 continue;
1208
1209 r = seccomp_rule_add_exact(
1210 seccomp,
1211 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1212 SCMP_SYS(socket),
1213 1,
1214 SCMP_A0(SCMP_CMP_EQ, af));
1215 if (r < 0)
1216 break;
1217 }
1218 if (r < 0) {
1219 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1220 continue;
1221 }
1222 }
1223
1224 } else {
1225 void *af;
1226
1227 /* If this is a blacklist, then generate one rule for
1228 * each address family that are then combined in OR
1229 * checks. */
1230
1231 SET_FOREACH(af, address_families, i) {
1232
1233 r = seccomp_rule_add_exact(
1234 seccomp,
1235 SCMP_ACT_ERRNO(EAFNOSUPPORT),
1236 SCMP_SYS(socket),
1237 1,
1238 SCMP_A0(SCMP_CMP_EQ, PTR_TO_INT(af)));
1239 if (r < 0)
1240 break;
1241 }
1242 if (r < 0) {
1243 log_debug_errno(r, "Failed to add socket() rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1244 continue;
1245 }
1246 }
1247
1248 r = seccomp_load(seccomp);
1249 if (IN_SET(r, -EPERM, -EACCES))
1250 return r;
1251 if (r < 0)
1252 log_debug_errno(r, "Failed to install socket family rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1253 }
1254
1255 return 0;
1256 }
1257
1258 int seccomp_restrict_realtime(void) {
1259 static const int permitted_policies[] = {
1260 SCHED_OTHER,
1261 SCHED_BATCH,
1262 SCHED_IDLE,
1263 };
1264
1265 int r, max_policy = 0;
1266 uint32_t arch;
1267 unsigned i;
1268
1269 /* Determine the highest policy constant we want to allow */
1270 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1271 if (permitted_policies[i] > max_policy)
1272 max_policy = permitted_policies[i];
1273
1274 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1275 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1276 int p;
1277
1278 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1279
1280 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1281 if (r < 0)
1282 return r;
1283
1284 /* Go through all policies with lower values than that, and block them -- unless they appear in the
1285 * whitelist. */
1286 for (p = 0; p < max_policy; p++) {
1287 bool good = false;
1288
1289 /* Check if this is in the whitelist. */
1290 for (i = 0; i < ELEMENTSOF(permitted_policies); i++)
1291 if (permitted_policies[i] == p) {
1292 good = true;
1293 break;
1294 }
1295
1296 if (good)
1297 continue;
1298
1299 /* Deny this policy */
1300 r = seccomp_rule_add_exact(
1301 seccomp,
1302 SCMP_ACT_ERRNO(EPERM),
1303 SCMP_SYS(sched_setscheduler),
1304 1,
1305 SCMP_A1(SCMP_CMP_EQ, p));
1306 if (r < 0) {
1307 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1308 continue;
1309 }
1310 }
1311
1312 /* Blacklist all other policies, i.e. the ones with higher values. Note that all comparisons are
1313 * unsigned here, hence no need no check for < 0 values. */
1314 r = seccomp_rule_add_exact(
1315 seccomp,
1316 SCMP_ACT_ERRNO(EPERM),
1317 SCMP_SYS(sched_setscheduler),
1318 1,
1319 SCMP_A1(SCMP_CMP_GT, max_policy));
1320 if (r < 0) {
1321 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1322 continue;
1323 }
1324
1325 r = seccomp_load(seccomp);
1326 if (IN_SET(r, -EPERM, -EACCES))
1327 return r;
1328 if (r < 0)
1329 log_debug_errno(r, "Failed to install realtime protection rules for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1330 }
1331
1332 return 0;
1333 }
1334
1335 static int add_seccomp_syscall_filter(scmp_filter_ctx seccomp,
1336 uint32_t arch,
1337 int nr,
1338 unsigned int arg_cnt,
1339 const struct scmp_arg_cmp arg) {
1340 int r;
1341
1342 r = seccomp_rule_add_exact(seccomp, SCMP_ACT_ERRNO(EPERM), nr, arg_cnt, arg);
1343 if (r < 0) {
1344 _cleanup_free_ char *n = NULL;
1345
1346 n = seccomp_syscall_resolve_num_arch(arch, nr);
1347 log_debug_errno(r, "Failed to add %s() rule for architecture %s, skipping: %m",
1348 strna(n),
1349 seccomp_arch_to_string(arch));
1350 }
1351
1352 return r;
1353 }
1354
1355 /* For known architectures, check that syscalls are indeed defined or not. */
1356 #if defined(__x86_64__) || defined(__arm__) || defined(__aarch64__)
1357 assert_cc(SCMP_SYS(shmget) > 0);
1358 assert_cc(SCMP_SYS(shmat) > 0);
1359 assert_cc(SCMP_SYS(shmdt) > 0);
1360 #elif defined(__i386__) || defined(__powerpc64__)
1361 assert_cc(SCMP_SYS(shmget) < 0);
1362 assert_cc(SCMP_SYS(shmat) < 0);
1363 assert_cc(SCMP_SYS(shmdt) < 0);
1364 #endif
1365
1366 int seccomp_memory_deny_write_execute(void) {
1367
1368 uint32_t arch;
1369 int r;
1370
1371 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1372 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1373 int filter_syscall = 0, block_syscall = 0, shmat_syscall = 0;
1374
1375 log_debug("Operating on architecture: %s", seccomp_arch_to_string(arch));
1376
1377 switch (arch) {
1378
1379 case SCMP_ARCH_X86:
1380 filter_syscall = SCMP_SYS(mmap2);
1381 block_syscall = SCMP_SYS(mmap);
1382 break;
1383
1384 case SCMP_ARCH_PPC64:
1385 case SCMP_ARCH_PPC64LE:
1386 filter_syscall = SCMP_SYS(mmap);
1387
1388 /* Note that shmat() isn't available, and the call is multiplexed through ipc().
1389 * We ignore that here, which means there's still a way to get writable/executable
1390 * memory, if an IPC key is mapped like this. That's a pity, but no total loss. */
1391
1392 break;
1393
1394 case SCMP_ARCH_ARM:
1395 filter_syscall = SCMP_SYS(mmap2); /* arm has only mmap2 */
1396 shmat_syscall = SCMP_SYS(shmat);
1397 break;
1398
1399 case SCMP_ARCH_X86_64:
1400 case SCMP_ARCH_X32:
1401 case SCMP_ARCH_AARCH64:
1402 filter_syscall = SCMP_SYS(mmap); /* amd64, x32, and arm64 have only mmap */
1403 shmat_syscall = SCMP_SYS(shmat);
1404 break;
1405
1406 /* Please add more definitions here, if you port systemd to other architectures! */
1407
1408 #if !defined(__i386__) && !defined(__x86_64__) && !defined(__powerpc64__) && !defined(__arm__) && !defined(__aarch64__)
1409 #warning "Consider adding the right mmap() syscall definitions here!"
1410 #endif
1411 }
1412
1413 /* Can't filter mmap() on this arch, then skip it */
1414 if (filter_syscall == 0)
1415 continue;
1416
1417 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1418 if (r < 0)
1419 return r;
1420
1421 r = add_seccomp_syscall_filter(seccomp, arch, filter_syscall,
1422 1,
1423 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC|PROT_WRITE, PROT_EXEC|PROT_WRITE));
1424 if (r < 0)
1425 continue;
1426
1427 if (block_syscall != 0) {
1428 r = add_seccomp_syscall_filter(seccomp, arch, block_syscall, 0, (const struct scmp_arg_cmp){} );
1429 if (r < 0)
1430 continue;
1431 }
1432
1433 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(mprotect),
1434 1,
1435 SCMP_A2(SCMP_CMP_MASKED_EQ, PROT_EXEC, PROT_EXEC));
1436 if (r < 0)
1437 continue;
1438
1439 if (shmat_syscall != 0) {
1440 r = add_seccomp_syscall_filter(seccomp, arch, SCMP_SYS(shmat),
1441 1,
1442 SCMP_A2(SCMP_CMP_MASKED_EQ, SHM_EXEC, SHM_EXEC));
1443 if (r < 0)
1444 continue;
1445 }
1446
1447 r = seccomp_load(seccomp);
1448 if (IN_SET(r, -EPERM, -EACCES))
1449 return r;
1450 if (r < 0)
1451 log_debug_errno(r, "Failed to install MemoryDenyWriteExecute= rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1452 }
1453
1454 return 0;
1455 }
1456
1457 int seccomp_restrict_archs(Set *archs) {
1458 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1459 Iterator i;
1460 void *id;
1461 int r;
1462
1463 /* This installs a filter with no rules, but that restricts the system call architectures to the specified
1464 * list. */
1465
1466 seccomp = seccomp_init(SCMP_ACT_ALLOW);
1467 if (!seccomp)
1468 return -ENOMEM;
1469
1470 SET_FOREACH(id, archs, i) {
1471 r = seccomp_arch_add(seccomp, PTR_TO_UINT32(id) - 1);
1472 if (r == -EEXIST)
1473 continue;
1474 if (r < 0)
1475 return r;
1476 }
1477
1478 r = seccomp_attr_set(seccomp, SCMP_FLTATR_CTL_NNP, 0);
1479 if (r < 0)
1480 return r;
1481
1482 r = seccomp_load(seccomp);
1483 if (IN_SET(r, -EPERM, -EACCES))
1484 return r;
1485 if (r < 0)
1486 log_debug_errno(r, "Failed to restrict system call architectures, skipping: %m");
1487
1488 return 0;
1489 }
1490
1491 int parse_syscall_archs(char **l, Set **archs) {
1492 _cleanup_set_free_ Set *_archs;
1493 char **s;
1494 int r;
1495
1496 assert(l);
1497 assert(archs);
1498
1499 r = set_ensure_allocated(&_archs, NULL);
1500 if (r < 0)
1501 return r;
1502
1503 STRV_FOREACH(s, l) {
1504 uint32_t a;
1505
1506 r = seccomp_arch_from_string(*s, &a);
1507 if (r < 0)
1508 return -EINVAL;
1509
1510 r = set_put(_archs, UINT32_TO_PTR(a + 1));
1511 if (r < 0)
1512 return -ENOMEM;
1513 }
1514
1515 *archs = _archs;
1516 _archs = NULL;
1517
1518 return 0;
1519 }
1520
1521 int seccomp_filter_set_add(Set *filter, bool add, const SyscallFilterSet *set) {
1522 const char *i;
1523 int r;
1524
1525 assert(set);
1526
1527 NULSTR_FOREACH(i, set->value) {
1528
1529 if (i[0] == '@') {
1530 const SyscallFilterSet *more;
1531
1532 more = syscall_filter_set_find(i);
1533 if (!more)
1534 return -ENXIO;
1535
1536 r = seccomp_filter_set_add(filter, add, more);
1537 if (r < 0)
1538 return r;
1539 } else {
1540 int id;
1541
1542 id = seccomp_syscall_resolve_name(i);
1543 if (id == __NR_SCMP_ERROR) {
1544 log_debug("Couldn't resolve system call, ignoring: %s", i);
1545 continue;
1546 }
1547
1548 if (add) {
1549 r = set_put(filter, INT_TO_PTR(id + 1));
1550 if (r < 0)
1551 return r;
1552 } else
1553 (void) set_remove(filter, INT_TO_PTR(id + 1));
1554 }
1555 }
1556
1557 return 0;
1558 }
1559
1560 int seccomp_lock_personality(unsigned long personality) {
1561 uint32_t arch;
1562 int r;
1563
1564 if (personality >= PERSONALITY_INVALID)
1565 return -EINVAL;
1566
1567 SECCOMP_FOREACH_LOCAL_ARCH(arch) {
1568 _cleanup_(seccomp_releasep) scmp_filter_ctx seccomp = NULL;
1569
1570 r = seccomp_init_for_arch(&seccomp, arch, SCMP_ACT_ALLOW);
1571 if (r < 0)
1572 return r;
1573
1574 r = seccomp_rule_add_exact(
1575 seccomp,
1576 SCMP_ACT_ERRNO(EPERM),
1577 SCMP_SYS(personality),
1578 1,
1579 SCMP_A0(SCMP_CMP_NE, personality));
1580 if (r < 0) {
1581 log_debug_errno(r, "Failed to add scheduler rule for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1582 continue;
1583 }
1584
1585 r = seccomp_load(seccomp);
1586 if (IN_SET(r, -EPERM, -EACCES))
1587 return r;
1588 if (r < 0)
1589 log_debug_errno(r, "Failed to enable personality lock for architecture %s, skipping: %m", seccomp_arch_to_string(arch));
1590 }
1591
1592 return 0;
1593 }